From ac44e7b7793c9839acb1a38f1dad4d078d1d6725 Mon Sep 17 00:00:00 2001
From: Chunwei
Date: Thu, 12 Sep 2019 19:55:50 +0800
Subject: [PATCH] init from wiki

---
 .gitmodules | 12 -
 CMakeLists.txt | 183 -
 Home.md | 54 +
 README.md | 74 -
 README_cn.md | 62 -
 add_new_operation.md | 189 +
 architecture-intro.md | 247 +
 architecture.md | 94 +
 benchmark.md | 162 +
 benchmark_tools.md | 196 +
 benchmark_tools.md.toc.2019-08-25_233116 | 11 +
 benchmark_tools.md.toc.2019-08-25_233528 | 11 +
 cmake/FindGflags.cmake | 582 -
 cmake/FindGlog.cmake | 24 -
 cmake/FindGperftools.cmake | 63 -
 cmake/FindJeMalloc.cmake | 28 -
 cmake/FindNumPy.cmake | 38 -
 cmake/cblas.cmake | 94 -
 cmake/ccache.cmake | 9 -
 cmake/configure.cmake | 160 -
 cmake/coveralls.cmake | 103 -
 cmake/coverallsGcovJsons.cmake | 401 -
 cmake/cross_compiling/android.cmake | 85 -
 cmake/cross_compiling/armlinux.cmake | 41 -
 cmake/cross_compiling/findar.cmake | 33 -
 cmake/cross_compiling/host.cmake | 48 -
 cmake/cross_compiling/ios.cmake | 692 --
 cmake/cross_compiling/npu.cmake | 90 -
 cmake/cross_compiling/postproject.cmake | 99 -
 cmake/cross_compiling/preproject.cmake | 59 -
 cmake/cuda.cmake | 228 -
 cmake/cudnn.cmake | 99 -
 cmake/cupti.cmake | 41 -
 cmake/external/eigen.cmake | 54 -
 cmake/external/gflags.cmake | 75 -
 cmake/external/glog.cmake | 77 -
 cmake/external/gtest.cmake | 86 -
 cmake/external/libxsmm.cmake | 55 -
 cmake/external/mkldnn.cmake | 120 -
 cmake/external/mklml.cmake | 77 -
 cmake/external/openblas.cmake | 93 -
 cmake/external/opencl-clhpp.cmake | 36 -
 cmake/external/opencl-headers.cmake | 33 -
 cmake/external/protobuf.cmake | 308 -
 cmake/external/xbyak.cmake | 57 -
 cmake/external/xxhash.cmake | 73 -
 cmake/flags.cmake | 194 -
 cmake/generic.cmake | 567 -
 cmake/hip.cmake | 53 -
 cmake/lite.cmake | 435 -
 cmake/lite_utils.cmake | 56 -
 cmake/make_resource.py | 25 -
 cmake/operators.cmake | 227 -
 cmake/package.cmake | 21 -
 cmake/simd.cmake | 99 -
 cmake/system.cmake | 89 -
 cmake/tensorrt.cmake | 38 -
 cmake/util.cmake | 55 -
 cmake/version.cmake | 66 -
 cpp_demo.md | 271 +
 cxx_api.md | 63 +
 debug_tools.md | 77 +
 demos.md.toc.2019-08-26_222115 | 19 +
 demos.md.toc.2019-08-26_222307 | 19 +
 for-developer.md | 15 +
 fpga.md | 107 +
 images/architecture.jpg | Bin 0 -> 80268 bytes
 images/benchmark_result.png | Bin 0 -> 160704 bytes
 images/img_mobilenetv1_inference.png | Bin 0 -> 72038 bytes
 images/lite1.png | Bin 0 -> 258476 bytes
 images/model_quan_fig.png | Bin 0 -> 315888 bytes
 images/model_quan_table1.png | Bin 0 -> 128197 bytes
 images/phone_list.png | Bin 0 -> 16772 bytes
 images/run_benchmark.png | Bin 0 -> 136012 bytes
 java_demo.md | 112 +
 lite/CMakeLists.txt | 159 -
 lite/api/CMakeLists.txt | 239 -
 lite/api/_paddle_use_kernels.h | 209 -
 lite/api/_paddle_use_ops.h | 127 -
 lite/api/android/.gitignore | 2 -
 lite/api/android/CMakeLists.txt | 5 -
 lite/api/android/jni/.gitignore | 3 -
 lite/api/android/jni/CMakeLists.txt | 52 -
 lite/api/android/jni/native/CMakeLists.txt | 32 -
 .../api/android/jni/native/convert_util_jni.h | 197 -
 .../api/android/jni/native/paddle_lite_jni.cc | 164 -
 lite/api/android/jni/native/paddle_lite_jni.h | 113 -
 lite/api/android/jni/native/tensor_jni.cc | 168 -
 lite/api/android/jni/native/tensor_jni.h | 90 -
 .../jni/src/com/baidu/paddle/lite/.gitignore | 2 -
 .../src/com/baidu/paddle/lite/ConfigBase.java | 31 -
 .../src/com/baidu/paddle/lite/CxxConfig.java | 39 -
 .../com/baidu/paddle/lite/MobileConfig.java | 69 -
 .../paddle/lite/PaddleLiteInitializer.java | 23 -
 .../baidu/paddle/lite/PaddlePredictor.java | 192 -
 .../jni/src/com/baidu/paddle/lite/Place.java | 148 -
 .../src/com/baidu/paddle/lite/PowerMode.java | 36 -
 .../jni/src/com/baidu/paddle/lite/Tensor.java | 141 -
 .../paddle/lite/PaddlePredictorTest.java | 54 -
 lite/api/apis_test.cc | 118 -
 lite/api/benchmark.cc | 190 -
 lite/api/cxx_api.cc | 177 -
 lite/api/cxx_api.h | 173 -
 lite/api/cxx_api_bin.cc | 129 -
 lite/api/cxx_api_impl.cc | 90 -
 lite/api/cxx_api_test.cc | 157 -
 lite/api/detection_model_test.cc | 137 -
 lite/api/efficientnet_b0_test.cc | 102 -
 lite/api/inceptionv4_test.cc | 94 -
 lite/api/light_api.cc | 95 -
 lite/api/light_api.h | 80 -
 lite/api/light_api_impl.cc | 79 -
 lite/api/light_api_test.cc | 90 -
 lite/api/lite_api_test_helper.cc | 62 -
 lite/api/lite_api_test_helper.h | 31 -
 lite/api/mobilenetv1_int8_test.cc | 89 -
 lite/api/mobilenetv1_ssd_test.cc | 112 -
 lite/api/mobilenetv1_test.cc | 145 -
 lite/api/mobilenetv1_yolov3_test.cc | 119 -
 lite/api/mobilenetv2_test.cc | 147 -
 lite/api/model_optimize_tool.cc | 119 -
 lite/api/model_run_test_image.cc | 79 -
 lite/api/model_test.cc | 181 -
 lite/api/ocr_attention_test.cc | 115 -
 lite/api/paddle_api.cc | 73 -
 lite/api/paddle_api.h | 167 -
 lite/api/paddle_api_test.cc | 122 -
 lite/api/paddle_lite_factory_helper.h | 37 -
 lite/api/paddle_place.cc | 117 -
 lite/api/paddle_place.h | 164 -
 lite/api/paddle_use_passes.h | 41 -
 lite/api/resnet18_test.cc | 88 -
 lite/api/resnet50_test.cc | 107 -
 lite/api/resnet50_test_fpga.cc | 63 -
 lite/api/shufflenetv2_test.cc | 92 -
 lite/api/test_googlenet_lite.cc | 80 -
 lite/api/test_helper.h | 40 -
 lite/api/test_inceptionv4_lite_x86.cc | 112 -
 lite/api/test_mobilenetv1_lite_x86.cc | 109 -
 lite/api/test_mobilenetv2_lite_x86.cc | 112 -
 lite/api/unet_test.cc | 106 -
 lite/backends/CMakeLists.txt | 7 -
 lite/backends/arm/CMakeLists.txt | 1 -
 lite/backends/arm/math/CMakeLists.txt | 111 -
 lite/backends/arm/math/activation.cc | 698 --
 lite/backends/arm/math/activation.h | 71 -
 lite/backends/arm/math/affine_channel.cc | 69 -
 lite/backends/arm/math/affine_channel.h | 41 -
 lite/backends/arm/math/anchor_generator.cc | 82 -
 lite/backends/arm/math/anchor_generator.h | 41 -
 lite/backends/arm/math/argmax.cc | 65 -
 lite/backends/arm/math/argmax.h | 35 -
 lite/backends/arm/math/axpy.cc | 203 -
 lite/backends/arm/math/axpy.h | 49 -
 lite/backends/arm/math/beam_search.cc | 271 -
 lite/backends/arm/math/beam_search.h | 41 -
 lite/backends/arm/math/box_coder.cc | 92 -
 lite/backends/arm/math/box_coder.h | 36 -
 lite/backends/arm/math/col_im_transform.cc | 75 -
 lite/backends/arm/math/col_im_transform.h | 40 -
 lite/backends/arm/math/concat.cc | 60 -
 lite/backends/arm/math/concat.h | 35 -
 .../arm/math/conv3x3s1_direct_int8.cc | 806 --
 .../arm/math/conv3x3s2_direct_int8.cc | 1081 --
 lite/backends/arm/math/conv_block_utils.h | 4292 --------
 lite/backends/arm/math/conv_depthwise.cc | 239 -
 lite/backends/arm/math/conv_depthwise.h | 100 -
 .../arm/math/conv_depthwise_3x3_int8.cc | 5832 ----------
 .../backends/arm/math/conv_depthwise_3x3p0.cc | 4178 -------
 .../backends/arm/math/conv_depthwise_3x3p1.cc | 4850 ---------
 .../backends/arm/math/conv_depthwise_5x5s1.cc | 9615 -----------------
 .../arm/math/conv_depthwise_5x5s1_int8.cc | 618 --
 .../backends/arm/math/conv_depthwise_5x5s2.cc | 3746 -------
 lite/backends/arm/math/conv_direct.cc | 242 -
 lite/backends/arm/math/conv_direct.h | 107 -
 lite/backends/arm/math/conv_direct_3x3s1.cc | 1067 --
 lite/backends/arm/math/conv_direct_3x3s2.cc | 1209 ---
 lite/backends/arm/math/conv_gemmlike.cc | 285 -
 lite/backends/arm/math/conv_gemmlike.h | 108 -
 lite/backends/arm/math/conv_impl.cc | 900 --
 lite/backends/arm/math/conv_impl.h | 423 -
 lite/backends/arm/math/conv_winograd.cc | 141 -
 lite/backends/arm/math/conv_winograd.h | 65 -
 lite/backends/arm/math/conv_winograd_3x3.cc | 479 -
 lite/backends/arm/math/decode_bboxes.cc | 651 --
 lite/backends/arm/math/decode_bboxes.h | 39 -
 .../backends/arm/math/dot_toolchain_support.h | 196 -
 lite/backends/arm/math/dropout.cc | 93 -
 lite/backends/arm/math/dropout.h | 32 -
 lite/backends/arm/math/elementwise.cc | 1290 ---
 lite/backends/arm/math/elementwise.h | 95 -
 lite/backends/arm/math/fill_bias_relu.cc | 122 -
 lite/backends/arm/math/fill_bias_relu.h | 44 -
 lite/backends/arm/math/funcs.cc | 153 -
 lite/backends/arm/math/funcs.h | 427 -
 lite/backends/arm/math/gemm_prepacked_int8.cc | 3942 -------
 lite/backends/arm/math/gemm_prepacked_int8.h | 94 -
 lite/backends/arm/math/gemv_arm_int8.cc | 480 -
 lite/backends/arm/math/gemv_arm_int8.h | 40 -
 lite/backends/arm/math/gru_utils.h | 434 -
 lite/backends/arm/math/im2sequence.cc | 72 -
 lite/backends/arm/math/im2sequence.h | 44 -
 lite/backends/arm/math/increment.cc | 37 -
 lite/backends/arm/math/increment.h | 33 -
 lite/backends/arm/math/interpolate.cc | 534 -
 lite/backends/arm/math/interpolate.h | 58 -
 lite/backends/arm/math/lrn.cc | 101 -
 lite/backends/arm/math/lrn.h | 49 -
 lite/backends/arm/math/negative.cc | 37 -
 lite/backends/arm/math/negative.h | 33 -
 lite/backends/arm/math/norm.cc | 52 -
 lite/backends/arm/math/norm.h | 35 -
 lite/backends/arm/math/packed_sgemm.cc | 3481 ------
 lite/backends/arm/math/packed_sgemm.h | 84 -
 lite/backends/arm/math/pad2d.cc | 413 -
 lite/backends/arm/math/pad2d.h | 71 -
 lite/backends/arm/math/pooling.cc | 3173 ------
 lite/backends/arm/math/pooling.h | 154 -
 lite/backends/arm/math/power.cc | 96 -
 lite/backends/arm/math/power.h | 33 -
 lite/backends/arm/math/prior_box.cc | 362 -
 lite/backends/arm/math/prior_box.h | 68 -
 lite/backends/arm/math/reduce_max.cc | 207 -
 lite/backends/arm/math/reduce_max.h | 89 -
 lite/backends/arm/math/reduce_mean.cc | 204 -
 lite/backends/arm/math/reduce_mean.h | 89 -
 lite/backends/arm/math/saturate.h | 320 -
 lite/backends/arm/math/scale.cc | 177 -
 lite/backends/arm/math/scale.h | 45 -
 lite/backends/arm/math/sequence2batch.h | 210 -
 lite/backends/arm/math/sequence_expand.cc | 63 -
 lite/backends/arm/math/sequence_expand.h | 35 -
 lite/backends/arm/math/sequence_pool.cc | 224 -
 lite/backends/arm/math/sequence_pool.h | 69 -
 lite/backends/arm/math/sequence_softmax.cc | 49 -
 lite/backends/arm/math/sequence_softmax.h | 34 -
 lite/backends/arm/math/sgemm.cc | 68 -
 lite/backends/arm/math/sgemm.h | 48 -
 lite/backends/arm/math/sgemv.cc | 1054 --
 lite/backends/arm/math/sgemv.h | 38 -
 lite/backends/arm/math/shuffle_channel.cc | 81 -
 lite/backends/arm/math/shuffle_channel.h | 34 -
 lite/backends/arm/math/slice.cc | 93 -
 lite/backends/arm/math/slice.h | 38 -
 lite/backends/arm/math/softmax.cc | 616 --
 lite/backends/arm/math/softmax.h | 71 -
 lite/backends/arm/math/split.cc | 85 -
 lite/backends/arm/math/split.h | 37 -
 lite/backends/arm/math/stack.cc | 55 -
 lite/backends/arm/math/stack.h | 30 -
 lite/backends/arm/math/topk.cc | 53 -
 lite/backends/arm/math/topk.h | 34 -
 lite/backends/arm/math/type_trans.cc | 919 --
 lite/backends/arm/math/type_trans.h | 117 -
 lite/backends/arm/math/yolo_box.cc | 168 -
 lite/backends/arm/math/yolo_box.h | 37 -
 lite/backends/cuda/CMakeLists.txt | 8 -
 lite/backends/cuda/blas.cc | 57 -
 lite/backends/cuda/blas.h | 99 -
 lite/backends/cuda/cuda_utils.h | 124 -
 lite/backends/cuda/math/CMakeLists.txt | 21 -
 lite/backends/cuda/math/activation.cu | 285 -
 lite/backends/cuda/math/activation.h | 58 -
 lite/backends/cuda/math/cudnn_conv.cc | 481 -
 lite/backends/cuda/math/cudnn_conv.h | 132 -
 lite/backends/cuda/math/cudnn_helper.h | 24 -
 lite/backends/cuda/math/scale.cu | 74 -
 lite/backends/cuda/math/scale.h | 37 -
 lite/backends/cuda/math/transpose.cu | 191 -
 lite/backends/cuda/math/transpose.h | 44 -
 lite/backends/cuda/math/type_trans.cu | 71 -
 lite/backends/cuda/math/type_trans.h | 37 -
 lite/backends/cuda/math/utils.h | 51 -
 lite/backends/cuda/target_wrapper.cc | 80 -
 lite/backends/cuda/target_wrapper.h | 64 -
 lite/backends/fpga/CMakeLists.txt | 15 -
 lite/backends/fpga/KD/alignment.h | 26 -
 lite/backends/fpga/KD/context.hpp | 50 -
 lite/backends/fpga/KD/dl_engine.cpp | 27 -
 lite/backends/fpga/KD/dl_engine.hpp | 36 -
 lite/backends/fpga/KD/float16.hpp | 508 -
 lite/backends/fpga/KD/fpga_cv.cpp | 78 -
 lite/backends/fpga/KD/fpga_cv.hpp | 28 -
 lite/backends/fpga/KD/layout.hpp | 99 -
 lite/backends/fpga/KD/llapi/bias_scale.cpp | 102 -
 lite/backends/fpga/KD/llapi/bias_scale.h | 30 -
 lite/backends/fpga/KD/llapi/config.h | 19 -
 lite/backends/fpga/KD/llapi/filter.cpp | 317 -
 lite/backends/fpga/KD/llapi/filter.h | 58 -
 lite/backends/fpga/KD/llapi/zynqmp_api.cpp | 327 -
 lite/backends/fpga/KD/llapi/zynqmp_api.h | 347 -
 lite/backends/fpga/KD/pe.hpp | 37 -
 lite/backends/fpga/KD/pe_params.hpp | 233 -
 lite/backends/fpga/KD/pes/batchnorm_pe.hpp | 105 -
 lite/backends/fpga/KD/pes/concat_pe.hpp | 135 -
 lite/backends/fpga/KD/pes/conv_pe.hpp | 138 -
 lite/backends/fpga/KD/pes/conv_process.hpp | 418 -
 lite/backends/fpga/KD/pes/crop_pe.cpp | 88 -
 lite/backends/fpga/KD/pes/crop_pe.hpp | 45 -
 .../fpga/KD/pes/depthwise_conv_pe.hpp | 102 -
 .../fpga/KD/pes/elementwise_add_pe.hpp | 81 -
 .../fpga/KD/pes/fully_connected_pe.hpp | 94 -
 lite/backends/fpga/KD/pes/input_pe.hpp | 54 -
 lite/backends/fpga/KD/pes/norm_pe.hpp | 121 -
 lite/backends/fpga/KD/pes/output_pe.hpp | 53 -
 lite/backends/fpga/KD/pes/pooling_pe.hpp | 176 -
 lite/backends/fpga/KD/pes/prior_box_pe.cpp | 273 -
 lite/backends/fpga/KD/pes/prior_box_pe.hpp | 46 -
 lite/backends/fpga/KD/pes/relu_pe.hpp | 75 -
 lite/backends/fpga/KD/pes/resize.hpp | 89 -
 lite/backends/fpga/KD/pes/scale_pe.hpp | 120 -
 lite/backends/fpga/KD/pes/softmax_pe.cpp | 162 -
 lite/backends/fpga/KD/pes/softmax_pe.hpp | 44 -
 lite/backends/fpga/KD/pes/split_pe.hpp | 124 -
 lite/backends/fpga/KD/shape.hpp | 116 -
 lite/backends/fpga/KD/tensor.hpp | 456 -
 lite/backends/fpga/KD/tensor_util.cpp | 32 -
 lite/backends/fpga/KD/tensor_util.hpp | 25 -
 lite/backends/fpga/lite_tensor.cc | 110 -
 lite/backends/fpga/lite_tensor.h | 251 -
 lite/backends/fpga/target_wrapper.cc | 37 -
 lite/backends/host/CMakeLists.txt | 3 -
 lite/backends/host/target_wrapper.cc | 49 -
 lite/backends/npu/CMakeLists.txt | 6 -
 lite/backends/npu/bridge/CMakeLists.txt | 67 -
 lite/backends/npu/bridge/act_op.cc | 88 -
 lite/backends/npu/bridge/act_op_test.cc | 100 -
 lite/backends/npu/bridge/batch_norm_op.cc | 96 -
 .../backends/npu/bridge/batch_norm_op_test.cc | 166 -
 lite/backends/npu/bridge/concat_op.cc | 74 -
 lite/backends/npu/bridge/concat_op_test.cc | 128 -
 lite/backends/npu/bridge/conv_op.cc | 216 -
 lite/backends/npu/bridge/conv_op_test.cc | 280 -
 lite/backends/npu/bridge/conv_transpose_op.cc | 146 -
 .../npu/bridge/conv_transpose_op_test.cc | 369 -
 lite/backends/npu/bridge/elementwise_ops.cc | 79 -
 .../npu/bridge/elementwise_ops_test.cc | 182 -
 lite/backends/npu/bridge/fc_op.cc | 119 -
 lite/backends/npu/bridge/fc_op_test.cc | 146 -
 lite/backends/npu/bridge/interpolate_op.cc | 143 -
 .../npu/bridge/interpolate_op_test.cc | 405 -
 lite/backends/npu/bridge/mul_op.cc | 122 -
 lite/backends/npu/bridge/mul_op_test.cc | 125 -
 lite/backends/npu/bridge/pad2d_op.cc | 86 -
 lite/backends/npu/bridge/pad2d_op_test.cc | 189 -
 .../npu/bridge/paddle_use_npu_bridges.h | 37 -
 lite/backends/npu/bridge/pool_op.cc | 89 -
 lite/backends/npu/bridge/pool_op_test.cc | 249 -
 lite/backends/npu/bridge/registry.cc | 39 -
 lite/backends/npu/bridge/registry.h | 84 -
 lite/backends/npu/bridge/reshape_op.cc | 121 -
 lite/backends/npu/bridge/reshape_op_test.cc | 202 -
 lite/backends/npu/bridge/scale_op.cc | 89 -
 lite/backends/npu/bridge/scale_op_test.cc | 123 -
 .../backends/npu/bridge/shuffle_channel_op.cc | 60 -
 .../npu/bridge/shuffle_channel_op_test.cc | 115 -
 lite/backends/npu/bridge/softmax_op.cc | 67 -
 lite/backends/npu/bridge/softmax_op_test.cc | 134 -
 lite/backends/npu/bridge/split_op.cc | 86 -
 lite/backends/npu/bridge/split_op_test.cc | 170 -
 lite/backends/npu/bridge/test_helper.cc | 101 -
 lite/backends/npu/bridge/test_helper.h | 64 -
 lite/backends/npu/bridge/transpose_op.cc | 78 -
 lite/backends/npu/bridge/transpose_op_test.cc | 151 -
 lite/backends/npu/bridge/utils.cc | 137 -
 lite/backends/npu/bridge/utils.h | 94 -
 lite/backends/npu/npu_helper.cc | 139 -
 lite/backends/npu/npu_helper.h | 110 -
 lite/backends/opencl/CMakeLists.txt | 18 -
 lite/backends/opencl/cl_caller.cc | 169 -
 lite/backends/opencl/cl_caller.h | 52 -
 lite/backends/opencl/cl_context.cc | 126 -
 lite/backends/opencl/cl_context.h | 54 -
 lite/backends/opencl/cl_functions_test.cc | 451 -
 lite/backends/opencl/cl_im2col_test.cc | 330 -
 lite/backends/opencl/cl_image.cc | 160 -
 lite/backends/opencl/cl_image.h | 114 -
 lite/backends/opencl/cl_image_converter.cc | 461 -
 lite/backends/opencl/cl_image_converter.h | 139 -
 lite/backends/opencl/cl_include.h | 21 -
 .../buffer/depthwise_conv2d_kernel.cl | 70 -
 .../buffer/elementwise_add_kernel.cl | 45 -
 .../opencl/cl_kernel/buffer/fc_kernel.cl | 424 -
 .../opencl/cl_kernel/buffer/im2col_kernel.cl | 64 -
 .../opencl/cl_kernel/buffer/mat_mul_kernel.cl | 93 -
 .../opencl/cl_kernel/buffer/pool_kernel.cl | 112 -
 .../opencl/cl_kernel/buffer/relu_kernel.cl | 22 -
 lite/backends/opencl/cl_kernel/cl_common.h | 38 -
 .../cl_kernel/image/channel_add_kernel.cl | 29 -
 .../cl_kernel/image/elementwise_add_kernel.cl | 26 -
 .../opencl/cl_kernel/image/pool_kernel.cl | 90 -
 lite/backends/opencl/cl_runtime.cc | 170 -
 lite/backends/opencl/cl_runtime.h | 101 -
 lite/backends/opencl/cl_utility.cc | 84 -
 lite/backends/opencl/cl_utility.h | 46 -
 lite/backends/opencl/cl_wrapper.cc | 732 --
 lite/backends/opencl/cl_wrapper.h | 572 -
 lite/backends/opencl/target_wrapper.cc | 341 -
 lite/backends/opencl/target_wrapper.h | 83 -
 lite/backends/x86/CMakeLists.txt | 14 -
 lite/backends/x86/cpu_info.cc | 160 -
 lite/backends/x86/cpu_info.h | 80 -
 lite/backends/x86/cupti_lib_path.h.in | 17 -
 lite/backends/x86/dynamic_loader.cc | 263 -
 lite/backends/x86/dynamic_loader.h | 38 -
 lite/backends/x86/jit/CMakeLists.txt | 26 -
 lite/backends/x86/jit/README.en.md | 103 -
 lite/backends/x86/jit/README.md | 94 -
 lite/backends/x86/jit/benchmark.cc | 576 -
 lite/backends/x86/jit/gen/CMakeLists.txt | 36 -
 lite/backends/x86/jit/gen/act.cc | 164 -
 lite/backends/x86/jit/gen/act.h | 347 -
 lite/backends/x86/jit/gen/blas.cc | 190 -
 lite/backends/x86/jit/gen/blas.h | 125 -
 lite/backends/x86/jit/gen/embseqpool.cc | 148 -
 lite/backends/x86/jit/gen/embseqpool.h | 81 -
 lite/backends/x86/jit/gen/gru.cc | 116 -
 lite/backends/x86/jit/gen/gru.h | 116 -
 lite/backends/x86/jit/gen/hopv.cc | 103 -
 lite/backends/x86/jit/gen/hopv.h | 92 -
 lite/backends/x86/jit/gen/jitcode.h | 133 -
 lite/backends/x86/jit/gen/lstm.cc | 142 -
 lite/backends/x86/jit/gen/lstm.h | 121 -
 lite/backends/x86/jit/gen/matmul.cc | 127 -
 lite/backends/x86/jit/gen/matmul.h | 62 -
 lite/backends/x86/jit/gen/seqpool.cc | 85 -
 lite/backends/x86/jit/gen/seqpool.h | 216 -
 lite/backends/x86/jit/gen/sgd.cc | 130 -
 lite/backends/x86/jit/gen/sgd.h | 60 -
 lite/backends/x86/jit/gen/vbroadcast.cc | 91 -
 lite/backends/x86/jit/gen/vbroadcast.h | 54 -
 lite/backends/x86/jit/gen_base.cc | 95 -
 lite/backends/x86/jit/gen_base.h | 87 -
 lite/backends/x86/jit/helper.cc | 139 -
 lite/backends/x86/jit/helper.h | 267 -
 lite/backends/x86/jit/kernel_base.h | 365 -
 lite/backends/x86/jit/kernel_key.cc | 71 -
 lite/backends/x86/jit/kernel_key.h | 55 -
 lite/backends/x86/jit/kernel_pool.cc | 41 -
 lite/backends/x86/jit/kernel_pool.h | 116 -
 lite/backends/x86/jit/macro.h | 32 -
 lite/backends/x86/jit/more/CMakeLists.txt | 18 -
 .../x86/jit/more/intrinsic/CMakeLists.txt | 9 -
 .../x86/jit/more/intrinsic/crf_decoding.cc | 185 -
 .../x86/jit/more/intrinsic/crf_decoding.h | 45 -
 .../x86/jit/more/intrinsic/layer_norm.cc | 181 -
 .../x86/jit/more/intrinsic/layer_norm.h | 48 -
 lite/backends/x86/jit/more/mix/CMakeLists.txt | 15 -
 lite/backends/x86/jit/more/mix/mix.cc | 255 -
 lite/backends/x86/jit/more/mix/mix.h | 65 -
 lite/backends/x86/jit/more/mkl/CMakeLists.txt | 20 -
 lite/backends/x86/jit/more/mkl/mkl.cc | 336 -
 lite/backends/x86/jit/more/mkl/mkl.h | 244 -
 lite/backends/x86/jit/refer/CMakeLists.txt | 40 -
 lite/backends/x86/jit/refer/refer.cc | 61 -
 lite/backends/x86/jit/refer/refer.h | 603 --
 lite/backends/x86/jit/registry.h | 178 -
 lite/backends/x86/jit/test.cc | 1447 ---
 lite/backends/x86/legacy_place.h | 30 -
 lite/backends/x86/math/CMakeLists.txt | 62 -
 lite/backends/x86/math/beam_search.cc | 322 -
 lite/backends/x86/math/beam_search.h | 125 -
 lite/backends/x86/math/beam_search_test.cc | 152 -
 lite/backends/x86/math/blas.cc | 57 -
 lite/backends/x86/math/blas.h | 408 -
 lite/backends/x86/math/blas_impl.h | 812 --
 lite/backends/x86/math/concat_and_split.cc | 131 -
 lite/backends/x86/math/concat_and_split.h | 83 -
 lite/backends/x86/math/context_project.cc | 28 -
 lite/backends/x86/math/context_project.h | 361 -
 lite/backends/x86/math/cos_sim_functor.cc | 57 -
 lite/backends/x86/math/cos_sim_functor.h | 187 -
 lite/backends/x86/math/cpu_vec.h | 662 --
 lite/backends/x86/math/cross_entropy.cc | 78 -
 lite/backends/x86/math/cross_entropy.h | 74 -
 lite/backends/x86/math/detail/CMakeLists.txt | 1 -
 .../x86/math/detail/activation_functions.h | 193 -
 .../backends/x86/math/detail/avx_functions.cc | 91 -
 lite/backends/x86/math/detail/avx_mathfun.h | 731 --
 .../backends/x86/math/detail/gru_cpu_kernel.h | 608 --
 lite/backends/x86/math/detail/gru_kernel.h | 222 -
 lite/backends/x86/math/gru_compute.cc | 181 -
 lite/backends/x86/math/gru_compute.h | 69 -
 lite/backends/x86/math/im2col.cc | 292 -
 lite/backends/x86/math/im2col.h | 108 -
 lite/backends/x86/math/im2col_cfo_cpu.h | 256 -
 lite/backends/x86/math/im2col_test.cc | 331 -
 lite/backends/x86/math/math_function.cc | 158 -
 lite/backends/x86/math/math_function.h | 93 -
 lite/backends/x86/math/math_function_impl.h | 192 -
 lite/backends/x86/math/math_function_test.cc | 344 -
 lite/backends/x86/math/maxouting.cc | 106 -
 lite/backends/x86/math/maxouting.h | 47 -
 lite/backends/x86/math/pooling.cc | 906 --
 lite/backends/x86/math/pooling.h | 258 -
 lite/backends/x86/math/prelu.h | 51 -
 lite/backends/x86/math/sample_prob.cc | 28 -
 lite/backends/x86/math/sample_prob.h | 128 -
 lite/backends/x86/math/sampler.cc | 102 -
 lite/backends/x86/math/sampler.h | 131 -
 lite/backends/x86/math/sequence2batch.cc | 67 -
 lite/backends/x86/math/sequence2batch.h | 190 -
 lite/backends/x86/math/sequence_padding.cc | 187 -
 lite/backends/x86/math/sequence_padding.h | 114 -
 lite/backends/x86/math/sequence_pooling.cc | 406 -
 lite/backends/x86/math/sequence_pooling.h | 52 -
 .../x86/math/sequence_pooling_test.cc | 130 -
 lite/backends/x86/math/sequence_scale.cc | 51 -
 lite/backends/x86/math/sequence_scale.h | 59 -
 lite/backends/x86/math/softmax.cc | 33 -
 lite/backends/x86/math/softmax.h | 67 -
 lite/backends/x86/math/softmax_impl.h | 245 -
 lite/backends/x86/math/tree2col.cc | 204 -
 lite/backends/x86/math/tree2col.h | 95 -
 lite/backends/x86/math/unpooling.cc | 96 -
 lite/backends/x86/math/unpooling.h | 44 -
 lite/backends/x86/math/vol2col.cc | 204 -
 lite/backends/x86/math/vol2col.h | 92 -
 lite/backends/x86/mklml.cc | 30 -
 lite/backends/x86/mklml.h | 99 -
 lite/backends/x86/port.h | 175 -
 lite/backends/x86/target_wrapper.cc | 36 -
 lite/backends/x86/target_wrapper.h | 22 -
 lite/backends/x86/warpctc_lib_path.h.in | 17 -
 lite/core/CMakeLists.txt | 124 -
 lite/core/arena/CMakeLists.txt | 10 -
 lite/core/arena/framework.cc | 70 -
 lite/core/arena/framework.h | 258 -
 lite/core/arena/framework_test.cc | 83 -
 lite/core/context.cc | 23 -
 lite/core/context.h | 400 -
 lite/core/context_test.cc | 51 -
 lite/core/device_info.cc | 1151 --
 lite/core/device_info.h | 209 -
 lite/core/framework.proto | 188 -
 lite/core/kernel.cc | 104 -
 lite/core/kernel.h | 189 -
 lite/core/kernel_test.cc | 63 -
 lite/core/lite.map | 6 -
 lite/core/lite_gtest_main.cc | 23 -
 lite/core/lite_tensor_test.cc | 32 -
 lite/core/memory.cc | 109 -
 lite/core/memory.h | 115 -
 lite/core/memory_test.cc | 34 -
 lite/core/mir/CMakeLists.txt | 109 -
 lite/core/mir/argument_type_display_pass.cc | 46 -
 lite/core/mir/demo_pass.cc | 37 -
 lite/core/mir/dot.h | 167 -
 lite/core/mir/elimination/CMakeLists.txt | 10 -
 .../identity_scale_eliminate_pass.cc | 73 -
 .../identity_scale_eliminate_pass_test.cc | 93 -
 lite/core/mir/fusion/CMakeLists.txt | 48 -
 .../mir/fusion/conv_activation_fuse_pass.cc | 42 -
 .../mir/fusion/conv_activation_fuse_pass.h | 32 -
 lite/core/mir/fusion/conv_activation_fuser.cc | 83 -
 lite/core/mir/fusion/conv_activation_fuser.h | 50 -
 lite/core/mir/fusion/conv_bn_fuse_pass.cc | 38 -
 lite/core/mir/fusion/conv_bn_fuse_pass.h | 32 -
 .../core/mir/fusion/conv_bn_fuse_pass_test.cc | 140 -
 lite/core/mir/fusion/conv_bn_fuser.cc | 163 -
 lite/core/mir/fusion/conv_bn_fuser.h | 58 -
 ...ementwise_add_activation_fuse_pass_test.cc | 157 -
 .../mir/fusion/conv_elementwise_fuse_pass.cc | 42 -
 .../mir/fusion/conv_elementwise_fuse_pass.h | 32 -
 .../core/mir/fusion/conv_elementwise_fuser.cc | 102 -
 lite/core/mir/fusion/conv_elementwise_fuser.h | 43 -
 .../elementwise_add_activation_fuse_pass.cc | 37 -
 .../elementwise_add_activation_fuse_pass.h | 32 -
 ...ementwise_add_activation_fuse_pass_test.cc | 117 -
 .../elementwise_add_activation_fuser.cc | 87 -
 .../fusion/elementwise_add_activation_fuser.h | 41 -
 lite/core/mir/fusion/fc_fuse_pass.cc | 35 -
 lite/core/mir/fusion/fc_fuse_pass.h | 32 -
 lite/core/mir/fusion/fc_fuse_pass_test.cc | 117 -
 lite/core/mir/fusion/fc_fuser.cc | 78 -
 lite/core/mir/fusion/fc_fuser.h | 38 -
 lite/core/mir/fusion/interpolate_fuse_pass.cc | 39 -
 lite/core/mir/fusion/interpolate_fuse_pass.h | 32 -
 lite/core/mir/fusion/interpolate_fuser.cc | 95 -
 lite/core/mir/fusion/interpolate_fuser.h | 42 -
 .../mir/fusion/quant_dequant_fuse_pass.cc | 47 -
 .../core/mir/fusion/quant_dequant_fuse_pass.h | 33 -
 .../core/mir/fusion/quant_dequant_op_fuser.cc | 200 -
 lite/core/mir/fusion/quant_dequant_op_fuser.h | 59 -
 .../mir/fusion/shuffle_channel_fuse_pass.cc | 39 -
 .../mir/fusion/shuffle_channel_fuse_pass.h | 32 -
 lite/core/mir/fusion/shuffle_channel_fuser.cc | 109 -
 lite/core/mir/fusion/shuffle_channel_fuser.h | 44 -
 .../transpose_softmax_transpose_fuse_pass.cc | 40 -
 .../transpose_softmax_transpose_fuse_pass.h | 32 -
 .../transpose_softmax_transpose_fuser.cc | 99 -
 .../transpose_softmax_transpose_fuser.h | 44 -
 lite/core/mir/generate_program_pass.cc | 42 -
 lite/core/mir/generate_program_pass.h | 50 -
 lite/core/mir/graph_visualize_pass.cc | 102 -
 lite/core/mir/graph_visualize_pass.h | 39 -
 lite/core/mir/io_copy_kernel_pick_pass.cc | 75 -
 lite/core/mir/node.cc | 74 -
 lite/core/mir/node.h | 173 -
 lite/core/mir/pass.cc | 15 -
 lite/core/mir/pass.h | 88 -
 lite/core/mir/pass_manager.cc | 21 -
 lite/core/mir/pass_manager.h | 87 -
 lite/core/mir/pass_manager_test.cc | 33 -
 lite/core/mir/pass_registry.cc | 21 -
 lite/core/mir/pass_registry.h | 55 -
 lite/core/mir/pattern_matcher.cc | 528 -
 lite/core/mir/pattern_matcher.h | 432 -
 lite/core/mir/pattern_matcher_high_api.cc | 80 -
 lite/core/mir/pattern_matcher_high_api.h | 83 -
 .../core/mir/pattern_matcher_high_api_test.cc | 150 -
 lite/core/mir/pattern_matcher_test.cc | 233 -
 lite/core/mir/pattern_matcher_tester.cc | 233 -
 lite/core/mir/runtime_context_assign_pass.cc | 42 -
 lite/core/mir/ssa_graph.cc | 240 -
 lite/core/mir/ssa_graph.h | 144 -
 lite/core/mir/ssa_graph_test.cc | 59 -
 lite/core/mir/static_kernel_pick_pass.cc | 136 -
 lite/core/mir/static_kernel_pick_pass.h | 97 -
 lite/core/mir/subgraph/CMakeLists.txt | 34 -
 .../mir/subgraph/generate_npu_program_pass.cc | 218 -
 .../mir/subgraph/generate_npu_program_pass.h | 65 -
 .../generate_npu_program_pass_test.cc | 114 -
 .../mir/subgraph/subgraph_program_pass.cc | 314 -
 .../core/mir/subgraph/subgraph_program_pass.h | 105 -
 .../subgraph/subgraph_program_pass_test.cc | 223 -
 lite/core/mir/type_layout_cast_pass.cc | 177 -
 lite/core/mir/type_layout_cast_pass.h | 62 -
 lite/core/mir/type_precision_cast_pass.cc | 183 -
 lite/core/mir/type_precision_cast_pass.h | 66 -
 lite/core/mir/type_target_cast_pass.cc | 183 -
 lite/core/mir/type_target_cast_pass.h | 66 -
 .../core/mir/variable_place_inference_pass.cc | 35 -
 lite/core/mir/variable_place_inference_pass.h | 157 -
 .../mir/variable_place_inference_pass_test.cc | 101 -
 lite/core/naive_test_model.py | 56 -
 lite/core/op_lite.cc | 105 -
 lite/core/op_lite.h | 231 -
 lite/core/op_lite_test.cc | 24 -
 lite/core/op_registry.cc | 154 -
 lite/core/op_registry.h | 306 -
 lite/core/optimizer.cc | 34 -
 lite/core/optimizer.h | 213 -
 lite/core/optimizer_test.cc | 51 -
 lite/core/profile/CMakeLists.txt | 8 -
 lite/core/profile/basic_profiler.cc | 26 -
 lite/core/profile/basic_profiler.h | 210 -
 lite/core/profile/basic_profiler_test.cc | 46 -
 lite/core/profile/precision_profiler.h | 137 -
 lite/core/program.cc | 208 -
 lite/core/program.h | 156 -
 lite/core/program_fake_utils.cc | 22 -
 lite/core/program_fake_utils.h | 142 -
 lite/core/scope.cc | 72 -
 lite/core/scope.h | 79 -
 lite/core/scope_test.cc | 37 -
 lite/core/target_wrapper.cc | 21 -
 lite/core/target_wrapper.h | 170 -
 lite/core/tensor.cc | 115 -
 lite/core/tensor.h | 249 -
 lite/core/type_system.cc | 157 -
 lite/core/type_system.h | 390 -
 lite/core/type_system_test.cc | 35 -
 lite/core/types.cc | 95 -
 lite/core/types.h | 147 -
 lite/core/types_test.cc | 43 -
 lite/core/variable.cc | 19 -
 lite/core/variable.h | 52 -
 lite/core/workspace.cc | 15 -
 lite/core/workspace.h | 83 -
 lite/demo/cxx/Makefile.def | 35 -
 lite/demo/cxx/README.md | 42 -
 .../mobile_full/Makefile.android.armv7 | 22 -
 .../mobile_full/Makefile.android.armv8 | 22 -
 .../mobile_light/Makefile.android.armv7 | 22 -
 .../mobile_light/Makefile.android.armv8 | 22 -
 .../cxx/mobile_full/mobilenetv1_full_api.cc | 83 -
 .../cxx/mobile_light/mobilenetv1_light_api.cc | 65 -
 lite/demo/java/README.md | 118 -
 .../java/android/PaddlePredictor/.gitignore | 13 -
 .../android/PaddlePredictor/app/.gitignore | 1 -
 .../android/PaddlePredictor/app/build.gradle | 28 -
 .../PaddlePredictor/app/proguard-rules.pro | 21 -
 .../paddle/lite/ExampleInstrumentedTest.java | 114 -
 .../app/src/main/AndroidManifest.xml | 21 -
 .../app/src/main/assets/README.txt | 8 -
 .../com/baidu/paddle/lite/MainActivity.java | 206 -
 .../drawable-v24/ic_launcher_foreground.xml | 34 -
 .../res/drawable/ic_launcher_background.xml | 170 -
 .../app/src/main/res/layout/activity_main.xml | 19 -
 .../res/mipmap-anydpi-v26/ic_launcher.xml | 5 -
 .../mipmap-anydpi-v26/ic_launcher_round.xml | 5 -
 .../src/main/res/mipmap-hdpi/ic_launcher.png | Bin 2963 -> 0 bytes
 .../res/mipmap-hdpi/ic_launcher_round.png | Bin 4905 -> 0 bytes
 .../src/main/res/mipmap-mdpi/ic_launcher.png | Bin 2060 -> 0 bytes
 .../res/mipmap-mdpi/ic_launcher_round.png | Bin 2783 -> 0 bytes
 .../src/main/res/mipmap-xhdpi/ic_launcher.png | Bin 4490 -> 0 bytes
 .../res/mipmap-xhdpi/ic_launcher_round.png | Bin 6895 -> 0 bytes
 .../main/res/mipmap-xxhdpi/ic_launcher.png | Bin 6387 -> 0 bytes
 .../res/mipmap-xxhdpi/ic_launcher_round.png | Bin 10413 -> 0 bytes
 .../main/res/mipmap-xxxhdpi/ic_launcher.png | Bin 9128 -> 0 bytes
 .../res/mipmap-xxxhdpi/ic_launcher_round.png | Bin 15132 -> 0 bytes
 .../app/src/main/res/values/colors.xml | 6 -
 .../app/src/main/res/values/strings.xml | 3 -
 .../app/src/main/res/values/styles.xml | 11 -
 .../baidu/paddle/lite/ExampleUnitTest.java | 17 -
 .../java/android/PaddlePredictor/build.gradle | 27 -
 .../android/PaddlePredictor/gradle.properties | 13 -
 .../gradle/wrapper/gradle-wrapper.jar | Bin 54329 -> 0 bytes
 .../gradle/wrapper/gradle-wrapper.properties | 6 -
 .../demo/java/android/PaddlePredictor/gradlew | 172 -
 .../java/android/PaddlePredictor/gradlew.bat | 84 -
 .../android/PaddlePredictor/settings.gradle | 1 -
 lite/demo/java/android/prepare_demo.bash | 23 -
 lite/fluid/CMakeLists.txt | 4 -
 lite/fluid/data_type.cc | 101 -
 lite/fluid/data_type.h | 88 -
 lite/fluid/data_type_test.cc | 40 -
 lite/fluid/eigen.h | 141 -
 lite/fluid/float16.h | 1100 --
 lite/fluid/lod.h | 38 -
 lite/fluid/math.h | 42 -
 lite/gen_code/CMakeLists.txt | 49 -
 lite/gen_code/gen_code.cc | 223 -
 lite/gen_code/gen_code.h | 258 -
 lite/gen_code/gen_code_test.cc | 168 -
 lite/gen_code/generated_code_test.cc | 87 -
 lite/gen_code/paddle_code_generator.cc | 56 -
 lite/gen_code/paddle_infer.cc | 145 -
 lite/gen_code/paddle_infer.h | 72 -
 lite/kernels/CMakeLists.txt | 11 -
 lite/kernels/arm/CMakeLists.txt | 95 -
 lite/kernels/arm/activation_compute.cc | 247 -
 lite/kernels/arm/activation_compute.h | 136 -
 lite/kernels/arm/affine_channel_compute.cc | 77 -
 lite/kernels/arm/affine_channel_compute.h | 38 -
 lite/kernels/arm/anchor_generator_compute.cc | 66 -
 lite/kernels/arm/anchor_generator_compute.h | 38 -
 lite/kernels/arm/argmax_compute.cc | 51 -
 lite/kernels/arm/argmax_compute.h | 37 -
 lite/kernels/arm/argmax_compute_test.cc | 139 -
 lite/kernels/arm/assign_compute.cc | 47 -
 lite/kernels/arm/assign_compute.h | 37 -
 lite/kernels/arm/assign_value_compute.cc | 66 -
 lite/kernels/arm/assign_value_compute.h | 37 -
 lite/kernels/arm/axpy_compute.cc | 62 -
 lite/kernels/arm/axpy_compute.h | 37 -
 lite/kernels/arm/axpy_compute_test.cc | 142 -
 lite/kernels/arm/batch_norm_compute.cc | 123 -
 lite/kernels/arm/batch_norm_compute.h | 42 -
 lite/kernels/arm/batch_norm_compute_test.cc | 221 -
 lite/kernels/arm/beam_search_compute.cc | 60 -
 lite/kernels/arm/beam_search_compute.h | 42 -
 .../kernels/arm/beam_search_decode_compute.cc | 296 -
 lite/kernels/arm/beam_search_decode_compute.h | 39 -
 lite/kernels/arm/box_clip_compute.cc | 87 -
 lite/kernels/arm/box_clip_compute.h | 37 -
 lite/kernels/arm/box_coder_compute.cc | 241 -
 lite/kernels/arm/box_coder_compute.h | 36 -
 lite/kernels/arm/calib_compute.cc | 90 -
 lite/kernels/arm/calib_compute.h | 51 -
 lite/kernels/arm/calib_compute_test.cc | 156 -
 lite/kernels/arm/cast_compute.cc | 62 -
 lite/kernels/arm/cast_compute.h | 42 -
 lite/kernels/arm/compare_compute.cc | 186 -
 lite/kernels/arm/compare_compute.h | 43 -
 lite/kernels/arm/concat_compute.cc | 87 -
 lite/kernels/arm/concat_compute.h | 37 -
 lite/kernels/arm/concat_compute_test.cc | 236 -
 lite/kernels/arm/conv_compute.cc | 241 -
 lite/kernels/arm/conv_compute.h | 67 -
 lite/kernels/arm/conv_compute_test.cc | 1045 --
 lite/kernels/arm/conv_transpose_compute.cc | 164 -
 lite/kernels/arm/conv_transpose_compute.h | 40 -
 .../arm/conv_transpose_compute_test.cc | 371 -
 lite/kernels/arm/crop_compute.cc | 77 -
 lite/kernels/arm/crop_compute.h | 49 -
 lite/kernels/arm/decode_bboxes_compute.cc | 68 -
 lite/kernels/arm/decode_bboxes_compute.h | 36 -
 .../kernels/arm/decode_bboxes_compute_test.cc | 185 -
 lite/kernels/arm/density_prior_box_compute.cc | 121 -
 lite/kernels/arm/density_prior_box_compute.h | 37 -
 lite/kernels/arm/dropout_compute.cc | 51 -
 lite/kernels/arm/dropout_compute.h | 35 -
 lite/kernels/arm/dropout_compute_test.cc | 106 -
 lite/kernels/arm/elementwise_compute.cc | 417 -
 lite/kernels/arm/elementwise_compute.h | 108 -
 lite/kernels/arm/elementwise_compute_test.cc | 721 --
 lite/kernels/arm/expand_compute.cc | 72 -
 lite/kernels/arm/expand_compute.h | 34 -
 lite/kernels/arm/fc_compute.cc | 263 -
 lite/kernels/arm/fc_compute.h | 68 -
 lite/kernels/arm/fc_compute_test.cc | 211 -
 lite/kernels/arm/fill_constant_compute.cc | 54 -
 .../kernels/arm/generate_proposals_compute.cc | 494 -
 lite/kernels/arm/generate_proposals_compute.h | 38 -
 lite/kernels/arm/gru_compute.cc | 146 -
 lite/kernels/arm/gru_compute.h | 38 -
 lite/kernels/arm/gru_unit_compute.cc | 116 -
 lite/kernels/arm/gru_unit_compute.h | 38 -
 lite/kernels/arm/im2sequence_compute.cc | 141 -
 lite/kernels/arm/im2sequence_compute.h | 42 -
 lite/kernels/arm/increment_compute.cc | 49 -
 lite/kernels/arm/increment_compute.h | 42 -
 lite/kernels/arm/interpolate_compute.cc | 94 -
 lite/kernels/arm/interpolate_compute.h | 44 -
 lite/kernels/arm/is_empty_compute.cc | 47 -
 lite/kernels/arm/is_empty_compute.h | 40 -
 lite/kernels/arm/lod_reset_compute.cc | 64 -
 lite/kernels/arm/lod_reset_compute.h | 41 -
 lite/kernels/arm/logical_compute.cc | 128 -
 lite/kernels/arm/logical_compute.h | 53 -
 lite/kernels/arm/lookup_table_compute.cc | 77 -
 lite/kernels/arm/lookup_table_compute.h | 38 -
 lite/kernels/arm/lrn_compute.cc | 56 -
 lite/kernels/arm/lrn_compute.h | 36 -
 lite/kernels/arm/lrn_compute_test.cc | 196 -
 lite/kernels/arm/matmul_compute.cc | 277 -
 lite/kernels/arm/matmul_compute.h | 42 -
 lite/kernels/arm/mul_compute.cc | 98 -
 lite/kernels/arm/mul_compute.h | 42 -
 lite/kernels/arm/mul_compute_test.cc | 182 -
 lite/kernels/arm/negative_compute.cc | 53 -
 lite/kernels/arm/negative_compute.h | 37 -
 lite/kernels/arm/norm_compute.cc | 50 -
 lite/kernels/arm/norm_compute.h | 42 -
 lite/kernels/arm/pad2d_compute.cc | 72 -
 lite/kernels/arm/pad2d_compute.h | 46 -
 lite/kernels/arm/pool_compute.cc | 228 -
 lite/kernels/arm/pool_compute.h | 38 -
 lite/kernels/arm/pool_compute_test.cc | 286 -
 lite/kernels/arm/power_compute.cc | 45 -
 lite/kernels/arm/power_compute.h | 34 -
 lite/kernels/arm/prior_box_compute.cc | 103 -
 lite/kernels/arm/prior_box_compute.h | 36 -
 lite/kernels/arm/read_from_array_compute.cc | 57 -
 lite/kernels/arm/read_from_array_compute.h | 43 -
 lite/kernels/arm/reduce_max_compute.cc | 91 -
 lite/kernels/arm/reduce_max_compute.h | 38 -
 lite/kernels/arm/reduce_mean_compute.cc | 91 -
 lite/kernels/arm/reduce_mean_compute.h | 38 -
 lite/kernels/arm/roi_align_compute.cc | 236 -
 lite/kernels/arm/roi_align_compute.h | 37 -
 lite/kernels/arm/scale_compute.cc | 49 -
 lite/kernels/arm/scale_compute.h | 34 -
 lite/kernels/arm/scale_compute_test.cc | 117 -
 lite/kernels/arm/sequence_expand_compute.cc | 132 -
 lite/kernels/arm/sequence_expand_compute.h | 39 -
 lite/kernels/arm/sequence_pool_compute.cc | 79 -
 lite/kernels/arm/sequence_pool_compute.h | 40 -
 lite/kernels/arm/sequence_softmax_compute.cc | 58 -
 lite/kernels/arm/sequence_softmax_compute.h | 43 -
 lite/kernels/arm/shape_compute.cc | 41 -
 lite/kernels/arm/shape_compute.h | 34 -
 lite/kernels/arm/shuffle_channel_compute.cc | 50 -
 lite/kernels/arm/shuffle_channel_compute.h | 35 -
 lite/kernels/arm/slice_compute.cc | 57 -
 lite/kernels/arm/slice_compute.h | 41 -
 lite/kernels/arm/softmax_compute.cc | 80 -
 lite/kernels/arm/softmax_compute.h | 35 -
 lite/kernels/arm/softmax_compute_test.cc | 135 -
 lite/kernels/arm/split_compute.cc | 46 -
 lite/kernels/arm/split_compute.h | 35 -
 lite/kernels/arm/split_compute_test.cc | 179 -
 lite/kernels/arm/squeeze_compute.cc | 70 -
 lite/kernels/arm/squeeze_compute.h | 42 -
 lite/kernels/arm/stack_compute.cc | 42 -
 lite/kernels/arm/stack_compute.h | 34 -
 lite/kernels/arm/topk_compute.cc | 47 -
 lite/kernels/arm/topk_compute.h | 34 -
 lite/kernels/arm/transpose_compute.cc | 185 -
 lite/kernels/arm/transpose_compute.h | 48 -
 lite/kernels/arm/transpose_compute_test.cc | 205 -
 lite/kernels/arm/while_compute.cc | 54 -
 lite/kernels/arm/while_compute.h | 83 -
 lite/kernels/arm/write_to_array_compute.cc | 61 -
 lite/kernels/arm/write_to_array_compute.h | 42 -
 lite/kernels/arm/yolo_box_compute.cc | 60 -
 lite/kernels/arm/yolo_box_compute.h | 34 -
 lite/kernels/cuda/CMakeLists.txt | 42 -
 lite/kernels/cuda/calib_compute.cu | 131 -
 lite/kernels/cuda/calib_compute.h | 52 -
 lite/kernels/cuda/calib_compute_cuda_test.cc | 178 -
 lite/kernels/cuda/concat_compute.cu | 276 -
 lite/kernels/cuda/concat_compute.h | 34 -
 lite/kernels/cuda/concat_compute_test.cc | 227 -
 lite/kernels/cuda/conv_compute.cc | 103 -
 lite/kernels/cuda/conv_compute.h | 53 -
 lite/kernels/cuda/conv_compute_test.cc | 248 -
 lite/kernels/cuda/elementwise_add_compute.cu | 79 -
 lite/kernels/cuda/elementwise_add_compute.h | 35 -
 .../cuda/elementwise_add_compute_test.cc | 107 -
 lite/kernels/cuda/io_copy_compute.cc | 143 -
 lite/kernels/cuda/leaky_relu_compute.cu | 69 -
 lite/kernels/cuda/leaky_relu_compute.h | 34 -
 lite/kernels/cuda/leaky_relu_compute_test.cc | 72 -
 lite/kernels/cuda/mul_compute.cc | 31 -
 lite/kernels/cuda/mul_compute.h | 84 -
 lite/kernels/cuda/nearest_interp_compute.cu | 160 -
 lite/kernels/cuda/nearest_interp_compute.h | 35 -
 .../cuda/nearest_interp_compute_test.cc | 152 -
 lite/kernels/cuda/transpose_compute.cu | 86 -
 lite/kernels/cuda/transpose_compute.h | 38 -
 lite/kernels/cuda/transpose_compute_test.cc | 290 -
 lite/kernels/cuda/use_kernels.h | 24 -
 lite/kernels/cuda/yolo_box_compute.cu | 224 -
 lite/kernels/cuda/yolo_box_compute.h | 37 -
 lite/kernels/cuda/yolo_box_compute_test.cc | 258 -
 lite/kernels/fpga/CMakeLists.txt | 32 -
 lite/kernels/fpga/activation_compute.cc | 53 -
 lite/kernels/fpga/activation_compute.h | 46 -
 lite/kernels/fpga/activation_compute_test.cc | 97 -
 lite/kernels/fpga/calib_compute.cc | 114 -
 lite/kernels/fpga/calib_compute.h | 51 -
 lite/kernels/fpga/conv_compute.cc | 71 -
 lite/kernels/fpga/conv_compute.h | 45 -
 lite/kernels/fpga/conv_compute_test.cc | 315 -
 lite/kernels/fpga/elementwise_compute.cc | 102 -
 lite/kernels/fpga/elementwise_compute.h | 56 -
 lite/kernels/fpga/elementwise_compute_test.cc | 286 -
 lite/kernels/fpga/fc_compute.cc | 65 -
 lite/kernels/fpga/fc_compute.h | 49 -
 lite/kernels/fpga/fc_compute_test.cc | 205 -
 lite/kernels/fpga/feed_compute.cc | 60 -
 lite/kernels/fpga/feed_compute.h | 42 -
 lite/kernels/fpga/fetch_compute.cc | 59 -
 lite/kernels/fpga/fetch_compute.h | 41 -
 lite/kernels/fpga/io_copy_compute.cc | 157 -
 lite/kernels/fpga/layout_compute.cc | 146 -
 lite/kernels/fpga/pooling_compute.cc | 65 -
 lite/kernels/fpga/pooling_compute.h | 44 -
 lite/kernels/fpga/pooling_compute_test.cc | 291 -
 lite/kernels/fpga/scale_compute.cc | 39 -
 lite/kernels/fpga/scale_compute.h | 35 -
 lite/kernels/fpga/softmax_compute.cc | 57 -
 lite/kernels/fpga/softmax_compute.h | 46 -
 lite/kernels/fpga/softmax_compute_test.cc | 136 -
 lite/kernels/host/CMakeLists.txt | 9 -
 lite/kernels/host/feed_compute.cc | 46 -
 lite/kernels/host/fetch_compute.cc | 53 -
 lite/kernels/host/multiclass_nms_compute.cc | 398 -
 lite/kernels/host/multiclass_nms_compute.h | 36 -
 .../host/multiclass_nms_compute_test.cc | 368 -
 lite/kernels/host/reshape_compute.cc | 138 -
 lite/kernels/host/reshape_compute.h | 36 -
 lite/kernels/host/reshape_compute_test.cc | 101 -
 lite/kernels/host/use_kernels.h | 21 -
 lite/kernels/npu/CMakeLists.txt | 9 -
 lite/kernels/npu/graph_compute.cc | 151 -
 lite/kernels/npu/graph_compute.h | 56 -
 lite/kernels/opencl/CMakeLists.txt | 49 -
 lite/kernels/opencl/conv_compute.cc | 296 -
 lite/kernels/opencl/conv_compute.h | 63 -
 lite/kernels/opencl/conv_compute_test.cc | 602 --
 .../opencl/depthwise_conv2d_compute.cc | 132 -
 .../opencl/depthwise_conv2d_compute_test.cc | 181 -
 .../kernels/opencl/elementwise_add_compute.cc | 107 -
 lite/kernels/opencl/elementwise_add_compute.h | 51 -
 .../opencl/elementwise_add_compute_test.cc | 251 -
 lite/kernels/opencl/fc_compute.cc | 126 -
 lite/kernels/opencl/fc_compute_test.cc | 200 -
 ...sion_elementwise_add_activation_compute.cc | 56 -
 lite/kernels/opencl/io_copy_compute.cc | 145 -
 lite/kernels/opencl/io_copy_compute_test.cc | 83 -
 lite/kernels/opencl/mul_compute.cc | 119 -
 lite/kernels/opencl/mul_compute_test.cc | 170 -
 lite/kernels/opencl/pool_compute.cc | 127 -
 lite/kernels/opencl/pool_compute_test.cc | 147 -
 lite/kernels/opencl/relu_compute.cc | 91 -
 lite/kernels/opencl/relu_compute_test.cc | 94 -
 lite/kernels/x86/CMakeLists.txt | 48 -
 lite/kernels/x86/activation_compute.cc | 127 -
 lite/kernels/x86/batch_norm_compute.cc | 34 -
 lite/kernels/x86/batch_norm_compute.h | 159 -
 lite/kernels/x86/batch_norm_compute_test.cc | 139 -
 lite/kernels/x86/concat_compute.cc | 25 -
 lite/kernels/x86/concat_compute.h | 71 -
 lite/kernels/x86/concat_compute_test.cc | 82 -
 lite/kernels/x86/conv_compute.cc | 39 -
 lite/kernels/x86/conv_compute.h | 167 -
 lite/kernels/x86/conv_compute_test.cc | 92 -
 lite/kernels/x86/dropout_compute.cc | 26 -
 lite/kernels/x86/dropout_compute.h | 82 -
 lite/kernels/x86/dropout_compute_test.cc | 78 -
 lite/kernels/x86/elementwise_compute.cc | 55 -
 lite/kernels/x86/elementwise_compute.h | 142 -
 lite/kernels/x86/elementwise_compute_test.cc | 88 -
 lite/kernels/x86/fc_compute.cc | 23 -
 lite/kernels/x86/fc_compute.h | 106 -
 lite/kernels/x86/fc_compute_test.cc | 100 -
 lite/kernels/x86/fill_constant_compute.cc | 59 -
 lite/kernels/x86/mean_compute.cc | 108 -
 lite/kernels/x86/mul_compute.cc | 44 -
 lite/kernels/x86/mul_compute.h | 159 -
 lite/kernels/x86/mul_compute_test.cc | 86 -
 lite/kernels/x86/pool_compute.cc | 25 -
 lite/kernels/x86/pool_compute.h | 87 -
 lite/kernels/x86/pool_compute_test.cc | 79 -
 lite/kernels/x86/relu_compute.cc | 25 -
 lite/kernels/x86/relu_compute.h | 52 -
 lite/kernels/x86/relu_compute_test.cc | 75 -
 lite/kernels/x86/reshape_compute.cc | 36 -
 lite/kernels/x86/reshape_compute.h | 79 -
 lite/kernels/x86/reshape_compute_test.cc | 156 -
 lite/kernels/x86/scale_compute.cc | 25 -
 lite/kernels/x86/scale_compute.h | 58 -
 lite/kernels/x86/scale_compute_test.cc | 76 -
 lite/kernels/x86/sequence_pool_compute.cc | 25 -
 lite/kernels/x86/sequence_pool_compute.h | 59 -
 .../kernels/x86/sequence_pool_compute_test.cc | 88 -
 lite/kernels/x86/sgd_compute.cc | 82 -
 lite/kernels/x86/shape_compute.cc | 25 -
 lite/kernels/x86/shape_compute.h | 45 -
 lite/kernels/x86/shape_compute_test.cc | 73 -
 lite/kernels/x86/slice_compute.cc | 25 -
 lite/kernels/x86/slice_compute.h | 145 -
 lite/kernels/x86/slice_compute_test.cc | 265 -
 lite/kernels/x86/softmax_compute.cc | 25 -
 lite/kernels/x86/softmax_compute.h | 82 -
 lite/kernels/x86/softmax_compute_test.cc | 84 -
 lite/kernels/x86/squeeze_compute.cc | 36 -
 lite/kernels/x86/squeeze_compute.h | 70 -
 lite/kernels/x86/squeeze_compute_test.cc | 142 -
 lite/kernels/x86/uniform_random_compute.cc | 70 -
 lite/model_parser/CMakeLists.txt | 34 -
 lite/model_parser/compatible_pb.cc | 286 -
 lite/model_parser/compatible_pb.h | 71 -
 lite/model_parser/compatible_pb_test.cc | 433 -
 lite/model_parser/cpp/CMakeLists.txt | 6 -
 lite/model_parser/cpp/block_desc.cc | 47 -
 lite/model_parser/cpp/block_desc.h | 75 -
 lite/model_parser/cpp/op_desc.cc | 122 -
 lite/model_parser/cpp/op_desc.h | 122 -
 lite/model_parser/cpp/program_desc.cc | 35 -
 lite/model_parser/cpp/program_desc.h | 57 -
 lite/model_parser/cpp/var_desc.cc | 15 -
 lite/model_parser/cpp/var_desc.h | 53 -
 lite/model_parser/desc_apis.h | 229 -
 lite/model_parser/model_parser.cc | 794 --
 lite/model_parser/model_parser.h | 108 -
 lite/model_parser/model_parser_test.cc | 138 -
 lite/model_parser/naive_buffer/CMakeLists.txt | 19 -
 lite/model_parser/naive_buffer/block_desc.cc | 103 -
 lite/model_parser/naive_buffer/block_desc.h | 86 -
 .../naive_buffer/combined_params_desc.cc | 15 -
 .../naive_buffer/combined_params_desc.h | 63 -
 .../model_parser/naive_buffer/naive_buffer.cc | 144 -
 lite/model_parser/naive_buffer/naive_buffer.h | 374 -
 .../naive_buffer/naive_buffer_test.cc | 178 -
 .../naive_buffer_wrapper_helper.h | 47 -
 .../naive_buffer/naive_buffer_wrapper_test.cc | 316 -
 lite/model_parser/naive_buffer/op_desc.cc | 129 -
 lite/model_parser/naive_buffer/op_desc.h | 234 -
 lite/model_parser/naive_buffer/param_desc.cc | 228 -
 lite/model_parser/naive_buffer/param_desc.h | 92 -
 .../model_parser/naive_buffer/program_desc.cc | 58 -
 lite/model_parser/naive_buffer/program_desc.h | 66 -
 .../naive_buffer/proto/CMakeLists.txt | 1 -
 .../naive_buffer/proto/framework.nb.cc | 15 -
 .../naive_buffer/proto/framework.nb.h | 203 -
 lite/model_parser/naive_buffer/var_desc.cc | 109 -
 lite/model_parser/naive_buffer/var_desc.h | 63 -
 lite/model_parser/pb/CMakeLists.txt | 6 -
 lite/model_parser/pb/block_desc.cc | 47 -
 lite/model_parser/pb/block_desc.h | 80 -
 lite/model_parser/pb/op_desc.cc | 132 -
 lite/model_parser/pb/op_desc.h | 215 -
 lite/model_parser/pb/program_desc.cc | 36 -
 lite/model_parser/pb/program_desc.h | 62 -
 lite/model_parser/pb/var_desc.cc | 317 -
 lite/model_parser/pb/var_desc.h | 125 -
 lite/model_parser/runtime.cc | 109 -
 lite/model_parser/runtime.h | 122 -
 lite/operators/CMakeLists.txt | 120 -
 lite/operators/activation_ops.cc | 123 -
 lite/operators/activation_ops.h | 63 -
 lite/operators/affine_channel_op.cc | 76 -
 lite/operators/affine_channel_op.h | 48 -
 lite/operators/anchor_generator_op.cc | 71 -
 lite/operators/anchor_generator_op.h | 49 -
 lite/operators/argmax_op.cc | 62 -
 lite/operators/argmax_op.h | 48 -
 lite/operators/assign_op.cc | 52 -
 lite/operators/assign_op.h | 46 -
 lite/operators/assign_value_op.cc | 62 -
 lite/operators/assign_value_op.h | 48 -
 lite/operators/axpy_op.cc | 63 -
 lite/operators/axpy_op.h | 48 -
 lite/operators/batch_norm_op.cc | 112 -
 lite/operators/batch_norm_op.h | 46 -
 lite/operators/batch_norm_op_test.cc | 139 -
 lite/operators/beam_search_decode_op.cc | 59 -
 lite/operators/beam_search_decode_op.h | 47 -
 lite/operators/beam_search_op.cc | 69 -
 lite/operators/beam_search_op.h | 47 -
 lite/operators/box_clip_op.cc | 61 -
 lite/operators/box_clip_op.h | 48 -
 lite/operators/box_coder_op.cc | 106 -
 lite/operators/box_coder_op.h | 45 -
 lite/operators/calib_once_op.cc | 30 -
 lite/operators/calib_once_op.h | 33 -
 lite/operators/calib_op.cc | 52 -
 lite/operators/calib_op.h | 59 -
 lite/operators/calib_op_test.cc | 62 -
 lite/operators/cast_op.cc | 52 -
 lite/operators/cast_op.h | 47 -
 lite/operators/compare_op.cc | 61 -
 lite/operators/compare_op.h | 47 -
 lite/operators/concat_op.cc | 77 -
 lite/operators/concat_op.h | 46 -
 lite/operators/concat_op_test.cc | 59 -
 lite/operators/conv_op.cc | 80 -
 lite/operators/conv_op.h | 107 -
 lite/operators/conv_transpose_op.cc | 99 -
 lite/operators/conv_transpose_op.h | 51 -
 lite/operators/crop_op.cc | 55 -
 lite/operators/crop_op.h | 46 -
 lite/operators/decode_bboxes_op.cc | 60 -
 lite/operators/decode_bboxes_op.h | 45 -
 lite/operators/density_prior_box_op.cc | 94 -
 lite/operators/density_prior_box_op.h | 46 -
 lite/operators/dropout_op.cc | 78 -
 lite/operators/elementwise_ops.cc | 97 -
 lite/operators/elementwise_ops.h | 66 -
 lite/operators/expand_op.cc | 57 -
 lite/operators/expand_op.h | 44 -
 lite/operators/fake_dequantize_max_abs.cc | 25 -
 lite/operators/fake_dequantize_max_abs.h | 64 -
 .../fake_quantize_moving_avg_max_abs.cc | 25 -
 .../fake_quantize_moving_avg_max_abs.h | 69 -
 lite/operators/fake_quantize_range_abs_max.cc | 25 -
 lite/operators/fake_quantize_range_abs_max.h | 69 -
 lite/operators/fc_op.cc | 107 -
 lite/operators/fc_op.h | 61 -
 lite/operators/fc_op_test.cc | 78 -
 lite/operators/feed_op.cc | 65 -
 lite/operators/fetch_op.cc | 60 -
 lite/operators/fill_constant_op.cc | 59 -
 lite/operators/flatten_op.cc | 99 -
 lite/operators/flatten_op.h | 62 -
 .../fusion_elementwise_activation_ops.cc | 107 -
 .../fusion_elementwise_activation_ops.h | 71 -
 .../fusion_elementwise_activation_ops_test.cc | 63 -
 lite/operators/generate_proposals_op.cc | 86 -
 lite/operators/generate_proposals_op.h | 49 -
 lite/operators/graph_op.cc | 52 -
 lite/operators/graph_op.h | 52 -
 lite/operators/gru_op.cc | 108 -
 lite/operators/gru_op.h | 46 -
 lite/operators/gru_unit_op.cc | 105 -
 lite/operators/gru_unit_op.h | 46 -
 lite/operators/im2sequence_op.cc | 77 -
 lite/operators/im2sequence_op.h | 47 -
 lite/operators/increment_op.cc | 51 -
 lite/operators/increment_op.h | 47 -
 lite/operators/interpolate_op.cc | 101 -
 lite/operators/interpolate_op.h | 47 -
 lite/operators/io_copy_once_op.cc | 30 -
 lite/operators/io_copy_once_op.h | 33 -
 lite/operators/io_copy_op.cc | 46 -
 lite/operators/io_copy_op.h | 42 -
 lite/operators/is_empty_op.cc | 40 -
 lite/operators/is_empty_op.h | 47 -
 lite/operators/layout_once_op.cc | 30 -
 lite/operators/layout_once_op.h | 33 -
 lite/operators/layout_op.cc | 46 -
 lite/operators/layout_op.h | 42 -
 lite/operators/lod_reset_op.cc | 60 -
 lite/operators/lod_reset_op.h | 47 -
 lite/operators/logical_op.cc | 80 -
 lite/operators/logical_op.h | 66 -
 lite/operators/lookup_table_op.cc | 75 -
 lite/operators/lookup_table_op.h | 46 -
 lite/operators/lrn_op.cc | 52 -
 lite/operators/lrn_op.h | 44 -
 lite/operators/matmul_op.cc | 165 -
 lite/operators/matmul_op.h | 50 -
 lite/operators/mean_op.cc | 100 -
 lite/operators/mul_op.cc | 122 -
 lite/operators/mul_op.h | 93 -
 lite/operators/multiclass_nms_op.cc | 80 -
 lite/operators/multiclass_nms_op.h | 45 -
 lite/operators/negative_op.cc | 51 -
 lite/operators/negative_op.h | 46 -
 lite/operators/norm_op.cc | 52 -
 lite/operators/norm_op.h | 47 -
 lite/operators/op_params.cc | 15 -
 lite/operators/op_params.h | 824 --
 lite/operators/pad2d_op.cc | 58 -
 lite/operators/pad2d_op.h | 46 -
 lite/operators/pool_op.cc | 90 -
 lite/operators/pool_op.h | 82 -
 lite/operators/pool_op_test.cc | 90 -
 lite/operators/power_op.cc | 53 -
 lite/operators/power_op.h | 47 -
 lite/operators/prior_box_op.cc | 77 -
 lite/operators/prior_box_op.h | 45 -
 lite/operators/read_from_array_op.cc | 47 -
 lite/operators/read_from_array_op.h | 47 -
 lite/operators/reduce_max_op.cc | 112 -
 lite/operators/reduce_max_op.h | 43 -
 lite/operators/reduce_mean_op.cc | 112 -
 lite/operators/reduce_mean_op.h | 43 -
 lite/operators/relu_op.cc | 49 -
 lite/operators/relu_op.h | 46 -
 lite/operators/reshape_op.cc | 182 -
 lite/operators/reshape_op.h | 63 -
 lite/operators/reshape_op_test.cc | 145 -
 lite/operators/roi_align_op.cc | 71 -
 lite/operators/roi_align_op.h | 48 -
 lite/operators/scale_op.cc | 49 -
 lite/operators/scale_op.h | 46 -
 lite/operators/scale_op_test.cc | 58 -
 lite/operators/sequence_expand_op.cc | 86 -
 lite/operators/sequence_expand_op.h | 46 -
 lite/operators/sequence_pool_op.cc | 55 -
 lite/operators/sequence_pool_op.h | 43 -
 lite/operators/sequence_softmax_op.cc | 50 -
 lite/operators/sequence_softmax_op.h | 47 -
 lite/operators/sgd_op.cc | 55 -
 lite/operators/sgd_op.h | 50 -
 lite/operators/shape_op.cc | 49 -
 lite/operators/shape_op.h | 44 -
 lite/operators/shuffle_channel_op.cc | 52 -
 lite/operators/shuffle_channel_op.h | 50 -
 lite/operators/slice_op.cc | 92 -
 lite/operators/slice_op.h | 47 -
 lite/operators/softmax_op.cc | 59 -
 lite/operators/softmax_op.h | 46 -
 lite/operators/softmax_op_test.cc | 54 -
 lite/operators/split_op.cc | 82 -
 lite/operators/split_op.h | 46 -
 lite/operators/squeeze_op.cc | 133 -
 lite/operators/squeeze_op.h | 61 -
 lite/operators/stack_op.cc | 62 -
 lite/operators/stack_op.h | 47 -
 lite/operators/topk_op.cc | 59 -
 lite/operators/topk_op.h | 46 -
 lite/operators/transpose_op.cc | 165 -
 lite/operators/transpose_op.h | 66 -
 lite/operators/transpose_op_test.cc | 93 -
 lite/operators/uniform_random_op.cc | 45 -
 lite/operators/uniform_random_op.h | 50 -
 lite/operators/while_op.cc | 55 -
 lite/operators/while_op.h | 48 -
 lite/operators/write_to_array_op.cc | 48 -
 lite/operators/write_to_array_op.h | 47 -
 lite/operators/yolo_box_op.cc | 70 -
 lite/operators/yolo_box_op.h | 46 -
 lite/tests/CMakeLists.txt | 1 -
 lite/tests/README.md | 1 -
 lite/tests/kernels/CMakeLists.txt | 54 -
 lite/tests/kernels/activation_compute_test.cc | 557 -
 .../kernels/affine_channel_compute_test.cc | 162 -
 .../kernels/anchor_generator_compute_test.cc | 177 -
 lite/tests/kernels/argmax_compute_test.cc | 130 -
 lite/tests/kernels/assign_compute_test.cc | 80 -
 .../kernels/assign_value_compute_test.cc | 121 -
 lite/tests/kernels/axpy_compute_test.cc | 136 -
 .../kernels/bilinear_interp_compute_test.cc | 282 -
 lite/tests/kernels/box_clip_compute_test.cc | 97 -
 lite/tests/kernels/box_coder_compute_test.cc | 212 -
 lite/tests/kernels/cast_compute_test.cc | 89 -
 lite/tests/kernels/compare_compute_test.cc | 243 -
 .../kernels/conv2d_transpose_compute_test.cc | 465 -
 lite/tests/kernels/crop_compute_test.cc | 129 -
 .../kernels/decode_bboxes_compute_test.cc | 225 -
 .../tests/kernels/elementwise_compute_test.cc | 665 --
 lite/tests/kernels/expand_compute_test.cc | 135 -
 lite/tests/kernels/fc_compute_test.cc | 201 -
 lite/tests/kernels/fill_data.h | 33 -
 .../generate_proposals_compute_test.cc | 183 -
 lite/tests/kernels/gru_unit_test.cc | 363 -
 .../tests/kernels/im2sequence_compute_test.cc | 249 -
 lite/tests/kernels/increment_compute_test.cc | 94 -
 lite/tests/kernels/logical_compute_test.cc | 106 -
 lite/tests/kernels/lrn_compute_test.cc | 206 -
 lite/tests/kernels/matmul_compute_test.cc | 592 -
 .../kernels/nearest_interp_compute_test.cc | 192 -
 lite/tests/kernels/negative_compute_test.cc | 80 -
 lite/tests/kernels/norm_compute_test.cc | 110 -
 lite/tests/kernels/pad2d_compute_test.cc | 182 -
 lite/tests/kernels/power_compute_test.cc | 99 -
 lite/tests/kernels/prior_box_compute_test.cc | 752 --
 .../kernels/read_from_array_compute_test.cc | 105 -
 lite/tests/kernels/reduce_max_compute_test.cc | 347 -
 .../tests/kernels/reduce_mean_compute_test.cc | 346 -
 lite/tests/kernels/roi_align_compute_test.cc | 133 -
 lite/tests/kernels/scale_compute_test.cc | 125 -
 .../kernels/sequence_expand_compute_test.cc | 188 -
 .../kernels/sequence_pool_compute_test.cc | 195 -
 .../kernels/sequence_softmax_compute_test.cc | 123 -
 lite/tests/kernels/shape_compute_test.cc | 87 -
 .../kernels/shuffle_channel_compute_test.cc | 110 -
 lite/tests/kernels/slice_compute_test.cc | 190 -
 lite/tests/kernels/squeeze_compute_test.cc | 253 -
 lite/tests/kernels/stack_compute_test.cc | 116 -
 lite/tests/kernels/test_funcs.h | 191 -
 lite/tests/kernels/test_sgemm.cc | 353 -
 lite/tests/kernels/topk_compute_test.cc | 119 -
 .../kernels/write_to_array_compute_test.cc | 116 -
 lite/tests/kernels/yolo_box_compute_test.cc | 254 -
 lite/tools/CMakeLists.txt | 1 -
 lite/tools/Dockerfile.mobile | 96 -
 lite/tools/benchmark.sh | 58 -
 lite/tools/build.sh | 272 -
 lite/tools/build_fpga.sh | 26 -
 lite/tools/build_npu.sh | 178 -
 lite/tools/ci_build.sh | 955 --
 lite/tools/cmake_tools/ast.py | 321 -
 .../create_fake_kernel_registry.py | 104 -
 .../cmake_tools/parse_kernel_registry.py | 50 -
 lite/tools/cmake_tools/parse_op_registry.py | 49 -
 lite/tools/cmake_tools/utils.py | 18 -
 lite/tools/debug/CMakeLists.txt | 15 -
 lite/tools/debug/analysis_tool.py | 401 -
 lite/tools/debug/check_model.sh | 182 -
 lite/tools/debug/debug_utils.cc | 15 -
lite/tools/debug/debug_utils.h | 337 - lite/tools/debug/model_debug_tool.cc | 112 - lite/tools/gitlab_review.sh | 75 - lite/tools/mobile_readme.md | 135 - lite/tools/prepare_benchmark.sh | 46 - lite/tools/python/lite_test.py | 103 - lite/tools/search_support_ops.py | 66 - lite/utils/CMakeLists.txt | 26 - lite/utils/all.h | 28 - lite/utils/any.cc | 23 - lite/utils/any.h | 71 - lite/utils/check.h | 41 - lite/utils/container.h | 51 - lite/utils/cp_logging.cc | 19 - lite/utils/cp_logging.h | 21 - lite/utils/factory.h | 100 - lite/utils/hash.h | 28 - lite/utils/io.h | 56 - lite/utils/logging.cc | 63 - lite/utils/logging.h | 185 - lite/utils/logging_test.cc | 31 - lite/utils/macros.h | 55 - lite/utils/paddle_enforce.h | 39 - lite/utils/replace_stl/stream.cc | 105 - lite/utils/replace_stl/stream.h | 76 - lite/utils/string.cc | 19 - lite/utils/string.h | 97 - lite/utils/varient.h | 151 - lite/utils/varient_test.cc | 58 - .../MobileNetDemo.xcodeproj/project.pbxproj | 504 - .../contents.xcworkspacedata | 7 - .../xcshareddata/IDEWorkspaceChecks.plist | 8 - .../MobileNetDemo/AppDelegate.swift | 46 - .../AppIcon.appiconset/Contents.json | 98 - .../Assets.xcassets/Contents.json | 6 - .../Base.lproj/LaunchScreen.storyboard | 25 - .../MobileNetDemo/Base.lproj/Main.storyboard | 166 - metal/MobileNetDemo/MobileNetDemo/Info.plist | 47 - .../MobileNetDemo/MobileNet.swift | 76 - .../MobileNetDemo/MobilenetPreProcess.metal | 38 - .../MobileNetDemo/ViewController.swift | 94 - .../project.pbxproj | 457 - .../contents.xcworkspacedata | 7 - .../xcshareddata/IDEWorkspaceChecks.plist | 8 - .../xcschemes/PaddleMobileTest.xcscheme | 91 - .../PaddleMobileTest/AppDelegate.swift | 46 - .../AppIcon.appiconset/Contents.json | 98 - .../Assets.xcassets/Contents.json | 6 - .../Base.lproj/LaunchScreen.storyboard | 25 - .../Base.lproj/Main.storyboard | 45 - .../PaddleMobileTest/Info.plist | 52 - .../PaddleMobileTest/TestViewController.swift | 478 - .../PaddleMobileTest/ViewController.swift | 122 - metal/Podfile | 40 - metal/README.md | 12 - .../project.pbxproj | 742 -- .../contents.xcworkspacedata | 7 - .../xcshareddata/IDEWorkspaceChecks.plist | 8 - .../UserInterfaceState.xcuserstate | Bin 5181 -> 0 bytes .../xcschemes/paddle-mobile-demo.xcscheme | 91 - .../paddle-mobile-demo/AppDelegate.swift | 51 - .../AppIcon.appiconset/Contents.json | 98 - .../Assets.xcassets/Contents.json | 6 - .../paddle-mobile.imageset/Contents.json | 21 - .../paddle-mobile.imageset/paddle-mobile.png | Bin 5331 -> 0 bytes .../Base.lproj/LaunchScreen.storyboard | 25 - .../Base.lproj/Main.storyboard | 325 - .../paddle-mobile-demo/Info.plist | 47 - .../paddle-mobile-demo/MetalHelper.swift | 31 - .../MultiPredictViewController.swift | 66 - .../Net/BufferToTexture.metal | 35 - .../paddle-mobile-demo/Net/CPUCompute.h | 44 - .../paddle-mobile-demo/Net/CPUCompute.mm | 318 - .../paddle-mobile-demo/Net/Genet.swift | 61 - .../paddle-mobile-demo/Net/MobileNet.swift | 75 - .../Net/MobileNetCombined.swift | 70 - .../paddle-mobile-demo/Net/MobileNetSSD.swift | 64 - .../Net/MobilenetSSD_AR.swift | 62 - .../Net/PreProcessKernel.metal | 117 - .../paddle-mobile-demo/Net/YoloNet.swift | 47 - .../paddle-mobile-demo/OC/ImageTool.h | 22 - .../paddle-mobile-demo/OC/ImageTool.m | 38 - .../OCDemo/LoadPointerViewController.h | 23 - .../OCDemo/LoadPointerViewController.m | 116 - .../OCDemo/OCDemoViewController.h | 22 - .../OCDemo/OCDemoViewController.m | 19 - .../OCInterface/PaddleMobileGPU.h | 106 - .../OCInterface/PaddleMobileGPU.m | 107 - .../OCInterface/SuperResolutionNet.swift 
.../VideoCapture/FPSCounter.swift | 31 -
.../VideoCapture/VideoCapture.swift | 218 -
.../paddle-mobile-demo/ViewController.swift | 302 -
.../metal/BatchNormKernel.metal | 42 -
.../metal/BatchNormRelu.metal | 36 -
.../metal/BilinearInterp.inc.metal | 49 -
.../metal/BilinearInterp.metal | 29 -
.../metal/BoxCoder.inc.metal | 54 -
.../paddle-mobile-demo/metal/BoxCoder.metal | 23 -
.../paddle-mobile-demo/metal/Common.metal | 120 -
.../metal/ConcatKernel.inc.metal | 318 -
.../metal/ConcatKernel.metal | 171 -
.../metal/ConvAddBNReluKernel.metal | 310 -
.../metal/ConvAddMetal.metal | 622 --
.../metal/ConvAddPrelu.inc.metal | 447 -
.../metal/ConvAddPreluKernel.metal | 65 -
.../metal/ConvBNReluKernel.metal | 297 -
.../paddle-mobile-demo/metal/ConvKernel.metal | 280 -
.../metal/ConvTransposeKernel.metal | 174 -
.../metal/Elementwise.metal | 100 -
.../metal/ElementwiseAddPreluKernel.inc.metal | 91 -
.../metal/ElementwiseAddPreluKernel.metal | 75 -
.../metal/FetchKernel.inc.metal | 46 -
.../metal/FetchKernel.metal | 40 -
.../paddle-mobile-demo/metal/Kernels.metal | 69 -
.../paddle-mobile-demo/metal/Macro.metal | 29 -
.../metal/NMSFetchResultKernel.metal | 80 -
.../metal/PoolKernel.inc.metal | 44 -
.../paddle-mobile-demo/metal/PoolKernel.metal | 36 -
.../metal/PreluKernel.metal | 151 -
.../metal/PriorBoxKernel.metal | 367 -
.../paddle-mobile-demo/metal/ReluKernel.metal | 41 -
.../metal/ReshapeKernel.inc.metal | 66 -
.../metal/ReshapeKernel.metal | 150 -
.../metal/ResizeBilinear.metal | 75 -
.../paddle-mobile-demo/metal/Shape.metal | 21 -
.../metal/Softmax.inc.metal | 61 -
.../paddle-mobile-demo/metal/Softmax.metal | 29 -
.../paddle-mobile-demo/metal/Split.inc.metal | 122 -
.../paddle-mobile-demo/metal/Split.metal | 64 -
.../metal/TransposeKernel.inc.metal | 60 -
.../metal/TransposeKernel.metal | 63 -
.../paddle-mobile-demo-Bridging-Header.h | 6 -
.../project.pbxproj | 407 -
.../contents.xcworkspacedata | 7 -
.../xcshareddata/IDEWorkspaceChecks.plist | 8 -
.../xcschemes/paddle-mobile-metallib.xcscheme | 80 -
.../ActivationKernel.metal | 64 -
.../BatchNormKernel.metal | 42 -
.../BatchNormRelu.metal | 28 -
.../BilinearInterp.inc.metal | 49 -
.../BilinearInterp.metal | 29 -
.../paddle-mobile-metallib/BoxCoder.inc.metal | 54 -
.../paddle-mobile-metallib/BoxCoder.metal | 23 -
.../BufferToTexture.metal | 67 -
.../paddle-mobile-metallib/Common.metal | 136 -
.../ConcatKernel.inc.metal | 318 -
.../paddle-mobile-metallib/ConcatKernel.metal | 219 -
.../ConvAddBNReluKernel.metal | 310 -
.../ConvAddPrelu.inc.metal | 447 -
.../ConvAddPreluKernel.metal | 65 -
.../ConvAddReluMetal.metal | 889 --
.../ConvBNReluKernel.metal | 297 -
.../ConvTransposeKernel.metal | 174 -
.../paddle-mobile-metallib/Elementwise.metal | 90 -
.../ElementwiseAddPreluKernel.inc.metal | 91 -
.../ElementwiseAddPreluKernel.metal | 65 -
.../FetchKernel.inc.metal | 60 -
.../paddle-mobile-metallib/FetchKernel.metal | 40 -
.../paddle-mobile-metallib/Kernels.metal | 69 -
.../paddle-mobile-metallib/Macro.metal | 29 -
.../NMSFetchResultKernel.metal | 80 -
.../NearestInterpKernel.metal | 50 -
.../PoolKernel.inc.metal | 50 -
.../paddle-mobile-metallib/PoolKernel.metal | 36 -
.../paddle-mobile-metallib/PreluKernel.metal | 151 -
.../PriorBoxKernel.metal | 367 -
.../paddle-mobile-metallib/ReluKernel.metal | 104 -
.../ReshapeKernel.inc.metal | 66 -
.../ReshapeKernel.metal | 150 -
.../ResizeBilinear.metal | 75 -
.../paddle-mobile-metallib/Scale.metal | 30 -
.../paddle-mobile-metallib/ScaleKernel.metal | 82 -
.../paddle-mobile-metallib/Shape.metal | 21 -
.../paddle-mobile-metallib/SliceKernel.metal | 75 -
.../paddle-mobile-metallib/Softmax.inc.metal | 61 -
.../paddle-mobile-metallib/Softmax.metal | 29 -
.../paddle-mobile-metallib/Split.inc.metal | 122 -
.../paddle-mobile-metallib/Split.metal | 64 -
.../TransposeKernel.inc.metal | 60 -
.../TransposeKernel.metal | 63 -
.../project.pbxproj | 478 -
.../contents.xcworkspacedata | 7 -
.../xcshareddata/IDEWorkspaceChecks.plist | 8 -
.../UserInterfaceState.xcuserstate | Bin 5178 -> 0 bytes
.../paddle-mobile-unit-test/AppDelegate.swift | 50 -
.../AppIcon.appiconset/Contents.json | 98 -
.../Assets.xcassets/Contents.json | 6 -
.../Base.lproj/LaunchScreen.storyboard | 25 -
.../Base.lproj/Main.storyboard | 24 -
.../paddle-mobile-unit-test/Info.plist | 45 -
.../ViewController.swift | 35 -
.../paddle-mobile.xcodeproj/project.pbxproj | 913 --
.../contents.xcworkspacedata | 7 -
.../xcshareddata/IDEWorkspaceChecks.plist | 8 -
.../UserInterfaceState.xcuserstate | Bin 9571 -> 0 bytes
.../xcschemes/paddle-mobile.xcscheme | 80 -
.../paddle-mobile/API/GlobalConfig.swift | 40 -
.../paddle-mobile/paddle-mobile/API/Net.swift | 99 -
.../paddle-mobile/API/Runner.swift | 415 -
metal/paddle-mobile/paddle-mobile/Info.plist | 24 -
.../paddle-mobile/Src/Common/Errors.swift | 56 -
.../paddle-mobile/Src/Common/Extensions.swift | 124 -
.../Src/Common/MetalExtension.swift | 666 --
.../Src/Common/PaddleMobileUnitTest.swift | 362 -
.../paddle-mobile/Src/Common/Tools.swift | 56 -
.../paddle-mobile/Src/Common/Types.swift | 224 -
.../paddle-mobile/Src/Framework/Dim.swift | 56 -
.../Src/Framework/Executor.swift | 168 -
.../paddle-mobile/Src/Framework/Loader.swift | 297 -
.../paddle-mobile/Src/Framework/Tensor.swift | 585 -
.../paddle-mobile/Src/Framework/Texture.swift | 262 -
.../paddle-mobile/Src/Framework/Utils.swift | 29 -
.../Src/Operators/Base/OpCreator.swift | 84 -
.../Src/Operators/Base/OpParam.swift | 159 -
.../Src/Operators/Base/Operator.swift | 219 -
.../Src/Operators/BatchNormOp.swift | 57 -
.../Src/Operators/BilinearInterpOp.swift | 58 -
.../Src/Operators/BoxcoderOp.swift | 79 -
.../Src/Operators/CNNMPSConvOp.swift | 75 -
.../Src/Operators/ConcatOp.swift | 75 -
.../Src/Operators/ConvAddAddPreluOp.swift | 105 -
.../Operators/ConvAddBatchNormReluOp.swift | 125 -
.../Src/Operators/ConvAddOp.swift | 70 -
.../Src/Operators/ConvAddPreluOp.swift | 97 -
.../Src/Operators/ConvAddReluOp.swift | 121 -
.../Src/Operators/ConvBNReluOp.swift | 111 -
.../paddle-mobile/Src/Operators/ConvOp.swift | 75 -
.../Src/Operators/ConvReluOp.swift | 73 -
.../Src/Operators/ConvTransposeOp.swift | 53 -
.../Src/Operators/DepthwiseConvOp.swift | 55 -
.../Src/Operators/DwConvBNReluOp.swift | 70 -
.../Src/Operators/ElementwiseAddOp.swift | 88 -
.../Src/Operators/ElementwiseAddPreluOp.swift | 108 -
.../paddle-mobile/Src/Operators/ExpOp.swift | 47 -
.../paddle-mobile/Src/Operators/FeedOp.swift | 67 -
.../paddle-mobile/Src/Operators/FetchOp.swift | 50 -
.../Src/Operators/FlattenOp.swift | 78 -
.../Src/Operators/Kernels/Base/Kernel.swift | 238 -
.../Operators/Kernels/BatchNormKernel.swift | 68 -
.../Kernels/BatchNormReluKernel.swift | 91 -
.../Kernels/BilinearInterpKernel.swift | 69 -
.../Operators/Kernels/BoxcoderKernel.swift | 66 -
.../Src/Operators/Kernels/CNNConvKernel.swift | 176 -
.../Src/Operators/Kernels/Concat.swift | 31 -
.../Src/Operators/Kernels/ConcatKernel.swift | 164 -
.../Kernels/ConvAddAddPreluKernel.swift | 175 -
.../Kernels/ConvAddBatchNormReluKernel.swift | 217 -
.../Src/Operators/Kernels/ConvAddKernel.swift | 27 -
.../Kernels/ConvAddPreluKernel.swift | 175 -
.../Operators/Kernels/ConvAddReluKernel.swift | 394 -
.../Operators/Kernels/ConvBNReluKernel.swift | 214 -
.../Src/Operators/Kernels/ConvKernel.swift | 207 -
.../Operators/Kernels/ConvReluKernel.swift | 27 -
.../Kernels/ConvTransposeKernel.swift | 102 -
.../Kernels/ElementwiseAddKernel.swift | 100 -
.../Kernels/ElementwiseAddPreluKernel.swift | 77 -
.../Src/Operators/Kernels/ExpKernel.swift | 52 -
.../Src/Operators/Kernels/FetchKernel.swift | 92 -
.../Src/Operators/Kernels/FlattenKernel.swift | 149 -
.../Operators/Kernels/LeakyReluKernel.swift | 58 -
.../Kernels/MulticlassNMSKernel.swift | 70 -
.../Kernels/NearestInterpKernel.swift | 58 -
.../Src/Operators/Kernels/PoolKernel.swift | 84 -
.../Src/Operators/Kernels/PreluKernel.swift | 67 -
.../Operators/Kernels/PriorBoxKernel.swift | 162 -
.../Src/Operators/Kernels/Relu6Kernel.swift | 58 -
.../Src/Operators/Kernels/ReluKernel.swift | 51 -
.../Src/Operators/Kernels/ReshapeKernel.swift | 97 -
.../Kernels/ResizeBilinearKernel.swift | 64 -
.../Src/Operators/Kernels/Scale.swift | 36 -
.../Src/Operators/Kernels/ScaleOpKernel.swift | 74 -
.../Src/Operators/Kernels/ShapeKernel.swift | 44 -
.../Src/Operators/Kernels/SigmoidKernel.swift | 52 -
.../Src/Operators/Kernels/SliceKernel.swift | 101 -
.../Src/Operators/Kernels/SoftmaxKernel.swift | 66 -
.../Src/Operators/Kernels/SplitKernel.swift | 108 -
.../Kernels/Texture2DTo2DArrayKernel.swift | 55 -
.../Operators/Kernels/TransposeKernel.swift | 91 -
.../Kernels/metal/BatchNormRelu.metal | 36 -
.../Kernels/metal/ResizeBilinear.metal | 75 -
.../Src/Operators/LeakyReluOp.swift | 49 -
.../Src/Operators/MulticlassNMSOp.swift | 68 -
.../Src/Operators/NearestInterpOp.swift | 52 -
.../paddle-mobile/Src/Operators/PoolOp.swift | 71 -
.../paddle-mobile/Src/Operators/PreluOp.swift | 65 -
.../Src/Operators/PriorBoxOp.swift | 76 -
.../paddle-mobile/Src/Operators/Relu6Op.swift | 52 -
.../paddle-mobile/Src/Operators/ReluOp.swift | 48 -
.../Src/Operators/ReshapeOp.swift | 73 -
.../Src/Operators/ResizeBilinearOp.swift | 47 -
.../paddle-mobile/Src/Operators/ScaleOp.swift | 53 -
.../paddle-mobile/Src/Operators/ShapeOp.swift | 49 -
.../Src/Operators/SigmoidOp.swift | 47 -
.../paddle-mobile/Src/Operators/SliceOp.swift | 63 -
.../Src/Operators/SoftmaxOp.swift | 55 -
.../paddle-mobile/Src/Operators/SplitOp.swift | 69 -
.../Src/Operators/TransposeOp.swift | 49 -
.../paddle-mobile/Src/Program/Attribute.swift | 84 -
.../Src/Program/Framework.pbobjc.h | 599 -
.../Src/Program/Framework.pbobjc.m | 1417 ---
.../Src/Program/MemoryOptimze.swift | 206 -
.../Src/Program/PMBlockDesc.swift | 67 -
.../paddle-mobile/Src/Program/PMOpDesc.swift | 85 -
.../Src/Program/PMProgramDesc.swift | 44 -
.../paddle-mobile/Src/Program/PMVarDesc.swift | 104 -
.../paddle-mobile/Src/Program/Program.swift | 31 -
.../Src/Program/ProgramOptimize.swift | 308 -
.../paddle-mobile/Src/Program/Scope.swift | 55 -
.../Src/Program/TensorDesc.swift | 89 -
.../Src/Program/framework.pb.swift | 1820 ----
.../paddle-mobile/paddle_mobile.h | 24 -
mobile.md | 7 +
mobile/.clang-format | 5 -
mobile/.clang-tidy | 67 -
mobile/.gitignore | 103 -
mobile/.pre-commit-config.yaml | 69 -
mobile/.travis.yml | 36 -
mobile/.travis/pre-commit-job.sh | 21 -
mobile/CMakeLists.txt | 293 -
mobile/CONTRIBUTING.md | 234 -
mobile/Dockerfile | 38 -
mobile/LICENSE | 204 -
mobile/README.md | 137 -
mobile/benchmark/arm_benchmark.md | 36 -
mobile/benchmark/metal_benchmark.md | 10 -
mobile/demo/ReadMe.md | 10 -
mobile/demo/getDemo.sh | 8 -
mobile/doc/build.md | 63 -
mobile/doc/design_doc.md | 171 -
mobile/doc/development_android.md | 189 -
mobile/doc/development_android_GPU.md | 77 -
mobile/doc/development_arm_linux.md | 62 -
mobile/doc/development_fpga.md | 5 -
mobile/doc/development_ios.md | 85 -
mobile/doc/quantification.md | 33 -
mobile/src/common/common.h | 31 -
mobile/src/common/enforce.h | 73 -
mobile/src/common/log.h | 235 -
mobile/src/common/threadpool.h | 126 -
mobile/src/common/type_define.h | 187 -
mobile/src/common/types.cpp | 260 -
mobile/src/common/types.h | 268 -
mobile/src/common/util.cpp | 46 -
mobile/src/common/util.h | 26 -
mobile/src/common/variant.h | 106 -
mobile/src/fpga/KD/alignment.h | 32 -
mobile/src/fpga/KD/context.hpp | 55 -
mobile/src/fpga/KD/dl_engine.cpp | 15 -
mobile/src/fpga/KD/dl_engine.hpp | 33 -
mobile/src/fpga/KD/float16.hpp | 506 -
mobile/src/fpga/KD/layout.hpp | 99 -
mobile/src/fpga/KD/llapi/bias_scale.cpp | 100 -
mobile/src/fpga/KD/llapi/bias_scale.h | 29 -
mobile/src/fpga/KD/llapi/config.h | 19 -
mobile/src/fpga/KD/llapi/filter.cpp | 346 -
mobile/src/fpga/KD/llapi/filter.h | 54 -
mobile/src/fpga/KD/llapi/image.cpp | 149 -
mobile/src/fpga/KD/llapi/image.h | 38 -
mobile/src/fpga/KD/llapi/zynqmp_api.cpp | 384 -
mobile/src/fpga/KD/llapi/zynqmp_api.h | 329 -
mobile/src/fpga/KD/pe.hpp | 45 -
mobile/src/fpga/KD/pe_params.hpp | 179 -
mobile/src/fpga/KD/pes/concat_pe.hpp | 70 -
mobile/src/fpga/KD/pes/conv_pe.hpp | 96 -
mobile/src/fpga/KD/pes/conv_process.hpp | 374 -
mobile/src/fpga/KD/pes/depthwise_conv_pe.hpp | 98 -
mobile/src/fpga/KD/pes/elementwise_add_pe.hpp | 74 -
mobile/src/fpga/KD/pes/fully_connected_pe.hpp | 98 -
mobile/src/fpga/KD/pes/input_pe.hpp | 53 -
mobile/src/fpga/KD/pes/math_func_neon.h | 330 -
mobile/src/fpga/KD/pes/output_pe.hpp | 52 -
mobile/src/fpga/KD/pes/pooling_pe.hpp | 72 -
mobile/src/fpga/KD/pes/softmax_pe.cpp | 162 -
mobile/src/fpga/KD/pes/softmax_pe.hpp | 44 -
mobile/src/fpga/KD/shape.hpp | 112 -
mobile/src/fpga/KD/tensor.hpp | 281 -
mobile/src/fpga/KD/tensor_util.cpp | 31 -
mobile/src/fpga/KD/tensor_util.hpp | 25 -
mobile/src/fpga/V1/api.cpp | 1021 --
mobile/src/fpga/V1/api.h | 102 -
mobile/src/fpga/V1/bias_scale.cpp | 102 -
mobile/src/fpga/V1/bias_scale.h | 29 -
mobile/src/fpga/V1/deconv_bias_scale.cpp | 48 -
mobile/src/fpga/V1/deconv_bias_scale.h | 26 -
mobile/src/fpga/V1/deconv_filter.cpp | 280 -
mobile/src/fpga/V1/deconv_filter.h | 39 -
mobile/src/fpga/V1/filter.cpp | 362 -
mobile/src/fpga/V1/filter.h | 50 -
mobile/src/fpga/V1/image.cpp | 138 -
mobile/src/fpga/V1/image.h | 76 -
mobile/src/fpga/V1/pe.cpp | 1180 --
mobile/src/fpga/V2/api.cpp | 1011 --
mobile/src/fpga/V2/api.h | 94 -
mobile/src/fpga/V2/bias_scale.cpp | 102 -
mobile/src/fpga/V2/bias_scale.h | 29 -
mobile/src/fpga/V2/deconv_bias_scale.cpp | 48 -
mobile/src/fpga/V2/deconv_bias_scale.h | 26 -
mobile/src/fpga/V2/deconv_filter.cpp | 280 -
mobile/src/fpga/V2/deconv_filter.h | 39 -
mobile/src/fpga/V2/filter.cpp | 362 -
mobile/src/fpga/V2/filter.h | 50 -
mobile/src/fpga/V2/image.cpp | 146 -
mobile/src/fpga/V2/image.h | 71 -
mobile/src/fpga/V2/pe.cpp | 1175 --
mobile/src/fpga/common/config.h | 18 -
mobile/src/fpga/common/driver.cpp | 295 -
mobile/src/fpga/common/driver.h | 141 -
mobile/src/fpga/common/fpga_common.cpp | 214 -
mobile/src/fpga/common/fpga_common.h | 330 -
mobile/src/fpga/common/pe.h | 35 -
mobile/src/framework/CMakeLists.txt | 0
mobile/src/framework/attribute.cpp | 40 -
mobile/src/framework/attribute.h | 183 -
mobile/src/framework/cl/cl_deleter.h | 59 -
mobile/src/framework/cl/cl_engine.cpp | 136 -
mobile/src/framework/cl/cl_engine.h | 256 -
mobile/src/framework/cl/cl_half.cpp | 518 -
mobile/src/framework/cl/cl_half.h | 32 -
mobile/src/framework/cl/cl_helper.h | 91 -
mobile/src/framework/cl/cl_image.cpp | 156 -
mobile/src/framework/cl/cl_image.h | 312 -
.../src/framework/cl/cl_image_converter.cpp | 510 -
mobile/src/framework/cl/cl_image_converter.h | 121 -
mobile/src/framework/cl/cl_scope.h | 125 -
mobile/src/framework/cl/cl_tensor.h | 193 -
mobile/src/framework/cl/cl_tool.cpp | 84 -
mobile/src/framework/cl/cl_tool.h | 34 -
mobile/src/framework/context.cpp | 605 --
mobile/src/framework/context.h | 81 -
mobile/src/framework/data_layout.h | 63 -
mobile/src/framework/data_type.cpp | 106 -
mobile/src/framework/data_type.h | 80 -
mobile/src/framework/ddim.cpp | 327 -
mobile/src/framework/ddim.h | 192 -
mobile/src/framework/dim.h | 335 -
mobile/src/framework/executor.cpp | 1102 --
mobile/src/framework/executor.h | 124 -
mobile/src/framework/framework.pb-c.cpp | 1465 ---
mobile/src/framework/framework.pb-c.h | 615 --
mobile/src/framework/framework.proto | 196 -
mobile/src/framework/load_ops.h | 379 -
mobile/src/framework/loader.cpp | 290 -
mobile/src/framework/loader.h | 65 -
mobile/src/framework/lod_tensor.cpp | 192 -
mobile/src/framework/lod_tensor.h | 234 -
mobile/src/framework/mixed_vector.h | 271 -
mobile/src/framework/op_info.h | 96 -
mobile/src/framework/op_kernel_type.h | 60 -
mobile/src/framework/op_proto_maker.h | 22 -
mobile/src/framework/op_registry.h | 125 -
mobile/src/framework/operator.cpp | 154 -
mobile/src/framework/operator.h | 203 -
mobile/src/framework/program/block_desc.cpp | 44 -
mobile/src/framework/program/block_desc.h | 86 -
mobile/src/framework/program/op_desc.cpp | 100 -
mobile/src/framework/program/op_desc.h | 78 -
.../program-optimize/fusion_op_register.h | 82 -
.../program/program-optimize/node.cpp | 281 -
.../framework/program/program-optimize/node.h | 81 -
.../program-optimize/program_optimize.cpp | 300 -
.../program-optimize/program_optimize.h | 45 -
mobile/src/framework/program/program.h | 40 -
mobile/src/framework/program/program_desc.cpp | 118 -
mobile/src/framework/program/program_desc.h | 62 -
mobile/src/framework/program/tensor_desc.h | 75 -
mobile/src/framework/program/var_desc.h | 80 -
mobile/src/framework/scope.cpp | 155 -
mobile/src/framework/scope.h | 113 -
mobile/src/framework/selected_rows.cpp | 127 -
mobile/src/framework/selected_rows.h | 138 -
mobile/src/framework/tensor.h | 355 -
mobile/src/framework/tensor_base.h | 147 -
mobile/src/framework/tensor_util.cpp | 30 -
mobile/src/framework/tensor_util.h | 39 -
mobile/src/framework/type_trait.h | 44 -
mobile/src/framework/variable.h | 96 -
mobile/src/framework/zynqmp/ztensor.hpp | 312 -
mobile/src/io/api.cc | 85 -
mobile/src/io/api_paddle_mobile.cc | 259 -
mobile/src/io/api_paddle_mobile.h | 53 -
mobile/src/io/ios_io/PaddleMobileCPU.h | 184 -
mobile/src/io/ios_io/PaddleMobileCPU.mm | 410 -
mobile/src/io/jni/PML.java | 66 -
mobile/src/io/jni/paddle_mobile_jni.cpp | 465 -
mobile/src/io/jni/paddle_mobile_jni.h | 91 -
mobile/src/io/loader.h | 49 -
mobile/src/io/opencl_interface.cpp | 35 -
mobile/src/io/opencl_interface.h | 27 -
mobile/src/io/paddle_inference_api.h | 178 -
mobile/src/io/paddle_mobile.cpp | 545 -
mobile/src/io/paddle_mobile.h | 124 -
mobile/src/io/paddle_mobile_wrap.cpp | 361 -
mobile/src/io/paddle_mobile_wrap.h | 97 -
mobile/src/io/paddle_test_inference_api.cpp | 36 -
mobile/src/io/paddle_test_inference_api.h | 35 -
mobile/src/memory/t_malloc.cpp | 92 -
mobile/src/memory/t_malloc.h | 63 -
mobile/src/operators/activation_op.cpp | 105 -
mobile/src/operators/activation_op.h | 47 -
mobile/src/operators/assign_op.cpp | 39 -
mobile/src/operators/assign_op.h | 33 -
mobile/src/operators/assign_value_op.cpp | 37 -
mobile/src/operators/assign_value_op.h | 33 -
mobile/src/operators/batchnorm_op.cpp | 44 -
mobile/src/operators/batchnorm_op.h | 48 -
.../src/operators/beam_search_decode_op.cpp | 34 -
mobile/src/operators/beam_search_decode_op.h | 32 -
mobile/src/operators/beam_search_op.cpp | 34 -
mobile/src/operators/beam_search_op.h | 31 -
mobile/src/operators/bilinear_interp_op.cpp | 55 -
mobile/src/operators/bilinear_interp_op.h | 48 -
mobile/src/operators/box_coder_op.cpp | 64 -
mobile/src/operators/box_coder_op.h | 49 -
mobile/src/operators/cast_op.cpp | 36 -
mobile/src/operators/cast_op.h | 45 -
mobile/src/operators/compare_op.cpp | 45 -
mobile/src/operators/compare_op.h | 34 -
mobile/src/operators/concat_op.cpp | 77 -
mobile/src/operators/concat_op.h | 45 -
mobile/src/operators/conditional_block_op.cpp | 34 -
mobile/src/operators/conditional_block_op.h | 34 -
.../tensor_array_read_write_op.cpp | 43 -
.../controlflow/tensor_array_read_write_op.h | 34 -
mobile/src/operators/controlflow/while_op.cpp | 36 -
mobile/src/operators/controlflow/while_op.h | 30 -
mobile/src/operators/conv_op.cpp | 67 -
mobile/src/operators/conv_op.h | 45 -
mobile/src/operators/conv_transpose_op.cpp | 36 -
mobile/src/operators/conv_transpose_op.h | 97 -
mobile/src/operators/crf_op.cpp | 55 -
mobile/src/operators/crf_op.h | 46 -
mobile/src/operators/depthwise_conv_op.cpp | 62 -
mobile/src/operators/depthwise_conv_op.h | 43 -
mobile/src/operators/dequantize_op.cpp | 36 -
mobile/src/operators/dequantize_op.h | 46 -
mobile/src/operators/detection_ops.cpp | 145 -
mobile/src/operators/detection_ops.h | 46 -
mobile/src/operators/dropout_op.cpp | 40 -
mobile/src/operators/dropout_op.h | 49 -
mobile/src/operators/elementwise_add_op.cpp | 44 -
mobile/src/operators/elementwise_add_op.h | 47 -
mobile/src/operators/elementwise_mul_op.cpp | 39 -
mobile/src/operators/elementwise_mul_op.h | 51 -
mobile/src/operators/elementwise_sub_op.cpp | 38 -
mobile/src/operators/elementwise_sub_op.h | 51 -
mobile/src/operators/exp_op.cpp | 36 -
mobile/src/operators/exp_op.h | 30 -
mobile/src/operators/feed_op.cpp | 47 -
mobile/src/operators/feed_op.h | 45 -
mobile/src/operators/fetch_op.cpp | 39 -
mobile/src/operators/fetch_op.h | 44 -
.../fill_constant_batch_size_like_op.cpp | 25 -
.../fill_constant_batch_size_like_op.h | 96 -
mobile/src/operators/fill_constant_op.cpp | 27 -
mobile/src/operators/fill_constant_op.h | 79 -
mobile/src/operators/flatten2_op.cpp | 48 -
mobile/src/operators/flatten2_op.h | 34 -
mobile/src/operators/flatten_op.cpp | 52 -
mobile/src/operators/flatten_op.h | 71 -
.../src/operators/fusion_conv_add_bn_op.cpp | 61 -
mobile/src/operators/fusion_conv_add_bn_op.h | 76 -
.../operators/fusion_conv_add_bn_relu_op.cpp | 64 -
.../operators/fusion_conv_add_bn_relu_op.h | 77 -
mobile/src/operators/fusion_conv_add_op.cpp | 64 -
mobile/src/operators/fusion_conv_add_op.h | 66 -
.../src/operators/fusion_conv_add_relu_op.cpp | 62 -
.../src/operators/fusion_conv_add_relu_op.h | 68 -
.../operators/fusion_conv_bn_add_relu_op.cpp | 65 -
.../operators/fusion_conv_bn_add_relu_op.h | 83 -
mobile/src/operators/fusion_conv_bn_op.cpp | 61 -
mobile/src/operators/fusion_conv_bn_op.h | 72 -
.../src/operators/fusion_conv_bn_relu_op.cpp | 64 -
mobile/src/operators/fusion_conv_bn_relu_op.h | 74 -
mobile/src/operators/fusion_conv_relu_op.cpp | 64 -
mobile/src/operators/fusion_conv_relu_op.h | 66 -
.../src/operators/fusion_deconv_add_bn_op.cpp | 32 -
.../src/operators/fusion_deconv_add_bn_op.h | 116 -
.../fusion_deconv_add_bn_relu_op.cpp | 33 -
.../operators/fusion_deconv_add_bn_relu_op.h | 118 -
mobile/src/operators/fusion_deconv_add_op.cpp | 32 -
mobile/src/operators/fusion_deconv_add_op.h | 108 -
.../operators/fusion_deconv_add_relu_op.cpp | 33 -
.../src/operators/fusion_deconv_add_relu_op.h | 110 -
.../operators/fusion_deconv_bn_relu_op.cpp | 32 -
.../src/operators/fusion_deconv_bn_relu_op.h | 115 -
.../src/operators/fusion_deconv_relu_op.cpp | 31 -
mobile/src/operators/fusion_deconv_relu_op.h | 107 -
.../operators/fusion_dequant_add_bn_op.cpp | 38 -
.../src/operators/fusion_dequant_add_bn_op.h | 75 -
.../fusion_dequant_add_bn_relu_op.cpp | 40 -
.../operators/fusion_dequant_add_bn_relu_op.h | 77 -
.../fusion_dequant_add_bn_relu_quant_op.cpp | 62 -
.../fusion_dequant_add_bn_relu_quant_op.h | 123 -
mobile/src/operators/fusion_dequant_bn_op.cpp | 54 -
mobile/src/operators/fusion_dequant_bn_op.h | 101 -
.../src/operators/fusion_dequant_bn_relu_op.h | 74 -
.../operators/fusion_dwconv_bn_relu_op.cpp | 63 -
.../src/operators/fusion_dwconv_bn_relu_op.h | 76 -
.../fusion_elementwise_add_relu_op.cpp | 44 -
.../fusion_elementwise_add_relu_op.h | 68 -
mobile/src/operators/fusion_fc_op.cpp | 70 -
mobile/src/operators/fusion_fc_op.h | 64 -
mobile/src/operators/fusion_fc_relu_op.cpp | 67 -
mobile/src/operators/fusion_fc_relu_op.h | 66 -
.../operators/fusion_instancenorm_relu_op.cpp | 39 -
.../operators/fusion_instancenorm_relu_op.h | 68 -
mobile/src/operators/gru_op.cpp | 66 -
mobile/src/operators/gru_op.h | 46 -
mobile/src/operators/gru_unit_op.cpp | 69 -
mobile/src/operators/gru_unit_op.h | 44 -
mobile/src/operators/im2sequence_op.cpp | 55 -
mobile/src/operators/im2sequence_op.h | 48 -
mobile/src/operators/increment_op.cpp | 49 -
mobile/src/operators/increment_op.h | 48 -
mobile/src/operators/instancenorm_op.cpp | 39 -
mobile/src/operators/instancenorm_op.h | 48 -
mobile/src/operators/is_empty_op.cpp | 44 -
mobile/src/operators/is_empty_op.h | 47 -
.../src/operators/kernel/activation_kernel.h | 44 -
.../kernel/arm/activation_kernel.cpp | 116 -
.../kernel/arm/anchor_generator_kernel.cpp | 37 -
.../operators/kernel/arm/assign_kernel.cpp | 39 -
.../kernel/arm/assign_value_kernel.cpp | 73 -
.../operators/kernel/arm/batchnorm_kernel.cpp | 36 -
.../kernel/arm/beam_search_decode_kernel.cpp | 278 -
.../kernel/arm/beam_search_kernel.cpp | 262 -
.../kernel/arm/bilinear_interp_kernel.cpp | 37 -
.../operators/kernel/arm/box_coder_kernel.cpp | 36 -
.../src/operators/kernel/arm/cast_kernel.cpp | 84 -
.../operators/kernel/arm/compare_kernel.cpp | 274 -
.../operators/kernel/arm/concat_kernel.cpp | 41 -
.../kernel/arm/conditional_block_kernel.cpp | 100 -
.../convolution/conv_add_bn_relu_kernel.cpp | 178 -
.../arm/convolution/conv_add_kernel.cpp | 79 -
.../arm/convolution/conv_add_relu_kernel.cpp | 77 -
.../convolution/conv_bn_add_relu_kernel.cpp | 96 -
.../arm/convolution/conv_bn_relu_kernel.cpp | 146 -
.../kernel/arm/convolution/conv_common.cpp | 113 -
.../kernel/arm/convolution/conv_common.h | 25 -
.../kernel/arm/convolution/conv_kernel.cpp | 75 -
.../arm/convolution/conv_relu_kernel.cpp | 66 -
.../arm/convolution/conv_transpose_kernel.cpp | 39 -
.../arm/convolution/dwconv_bn_relu_kernel.cpp | 95 -
.../src/operators/kernel/arm/crf_kernel.cpp | 39 -
.../kernel/arm/density_prior_box_kernel.cpp | 37 -
.../kernel/arm/dequantize_bn_kernel.cpp | 340 -
.../kernel/arm/dequantize_kernel.cpp | 81 -
.../operators/kernel/arm/dropout_kernel.cpp | 51 -
.../kernel/arm/elementwise_add_kernel.cpp | 43 -
.../kernel/arm/elementwise_mul_kernel.cpp | 38 -
.../kernel/arm/elementwise_sub_kernel.cpp | 38 -
.../src/operators/kernel/arm/exp_kernel.cpp | 47 -
.../src/operators/kernel/arm/feed_kernel.cpp | 35 -
.../src/operators/kernel/arm/fetch_kernel.cpp | 31 -
.../operators/kernel/arm/flatten_kernel.cpp | 36 -
.../operators/kernel/arm/fusion_fc_kernel.cpp | 75 -
.../src/operators/kernel/arm/gru_kernel.cpp | 39 -
.../operators/kernel/arm/gru_unit_kernel.cpp | 38 -
.../kernel/arm/im2sequence_kernel.cpp | 87 -
.../operators/kernel/arm/increment_kernel.cpp | 36 -
.../operators/kernel/arm/is_empty_kernel.cpp | 37 -
.../operators/kernel/arm/lod_reset_kernel.cpp | 68 -
.../operators/kernel/arm/logical_kernel.cpp | 125 -
.../operators/kernel/arm/lookup_kernel.cpp | 36 -
.../src/operators/kernel/arm/lrn_kernel.cpp | 36 -
.../src/operators/kernel/arm/mul_kernel.cpp | 39 -
.../kernel/arm/multiclass_nms_kernel.cpp | 37 -
.../kernel/arm/nearest_interp_kernel.cpp | 88 -
.../src/operators/kernel/arm/norm_kernel.cpp | 36 -
.../operators/kernel/arm/one_hot_kernel.cpp | 61 -
.../src/operators/kernel/arm/pad2d_kernel.cpp | 45 -
.../arm/polygon_box_transform_kernel.cpp | 38 -
.../src/operators/kernel/arm/pool_kernel.cpp | 36 -
.../src/operators/kernel/arm/prelu_kernel.cpp | 122 -
.../operators/kernel/arm/prior_box_kernel.cpp | 36 -
.../operators/kernel/arm/proposal_kernel.cpp | 36 -
.../kernel/arm/psroi_pool_kernel.cpp | 36 -
.../operators/kernel/arm/quantize_kernel.cpp | 221 -
.../operators/kernel/arm/reshape2_kernel.cpp | 36 -
.../operators/kernel/arm/reshape_kernel.cpp | 36 -
.../operators/kernel/arm/resize_kernel.cpp | 124 -
.../kernel/arm/roi_perspective_kernel.cpp | 291 -
.../src/operators/kernel/arm/scale_kernel.cpp | 88 -
.../kernel/arm/sequence_expand_kernel.cpp | 115 -
.../kernel/arm/sequence_pool_kernel.cpp | 215 -
.../kernel/arm/sequence_softmax_kernel.cpp | 44 -
.../src/operators/kernel/arm/shape_kernel.cpp | 36 -
.../src/operators/kernel/arm/slice_kernel.cpp | 86 -
.../operators/kernel/arm/softmax_kernel.cpp | 38 -
.../src/operators/kernel/arm/split_kernel.cpp | 36 -
.../src/operators/kernel/arm/sum_kernel.cpp | 37 -
.../arm/tensor_array_read_write_kernel.cpp | 66 -
.../src/operators/kernel/arm/top_k_kernel.cpp | 68 -
.../kernel/arm/transpose2_kernel.cpp | 146 -
.../operators/kernel/arm/transpose_kernel.cpp | 35 -
.../src/operators/kernel/arm/while_kernel.cpp | 128 -
mobile/src/operators/kernel/assign_kernel.h | 53 -
.../operators/kernel/assign_value_kernel.h | 53 -
.../src/operators/kernel/batchnorm_kernel.h | 36 -
.../kernel/beam_search_decode_kernel.h | 58 -
.../src/operators/kernel/beam_search_kernel.h | 74 -
.../operators/kernel/bilinear_interp_kernel.h | 38 -
.../src/operators/kernel/box_coder_kernel.h | 38 -
.../central-arm-func/activation_arm_func.h | 107 -
.../central-arm-func/batchnorm_arm_func.h | 83 -
.../bilinear_interp_arm_func.h | 91 -
.../central-arm-func/box_coder_arm_func.h | 142 -
.../kernel/central-arm-func/concat_arm_func.h | 90 -
.../central-arm-func/conv_add_arm_func.h | 151 -
.../conv_add_bn_relu_arm_func.h | 143 -
.../central-arm-func/conv_add_relu_arm_func.h | 154 -
.../kernel/central-arm-func/conv_arm_func.cpp | 377 -
.../kernel/central-arm-func/conv_arm_func.h | 58 -
.../conv_bn_add_relu_arm_func.h | 148 -
.../central-arm-func/conv_bn_relu_arm_func.h | 146 -
.../conv_transpose_arm_func.h | 111 -
.../kernel/central-arm-func/crf_arm_func.h | 118 -
.../density_prior_box_arm_func.h | 161 -
.../dwconv_bn_relu_arm_func.h | 144 -
.../elementwise_add_arm_func.h | 78 -
.../elementwise_mul_arm_func.h | 45 -
.../elementwise_sub_arm_func.h | 65 -
.../central-arm-func/flatten_arm_func.h | 50 -
.../central-arm-func/fusion_fc_arm_func.h | 75 -
.../kernel/central-arm-func/gru_arm_func.h | 107 -
.../central-arm-func/gru_unit_arm_func.h | 72 -
.../central-arm-func/increment_arm_func.h | 39 -
.../kernel/central-arm-func/lookup_arm_func.h | 58 -
.../kernel/central-arm-func/lrn_arm_func.h | 47 -
.../kernel/central-arm-func/mul_arm_func.h | 59 -
.../multiclass_nms_arm_func.h | 307 -
.../kernel/central-arm-func/norm_arm_func.h | 106 -
.../polygon_box_transform_arm_func.h | 53 -
.../kernel/central-arm-func/pool_arm_func.h | 91 -
.../central-arm-func/prior_box_arm_func.h | 199 -
.../central-arm-func/reshape2_arm_func.h | 59 -
.../central-arm-func/reshape_arm_func.h | 56 -
.../kernel/central-arm-func/shape_arm_func.h | 38 -
.../central-arm-func/softmax_arm_func.h | 32 -
.../kernel/central-arm-func/split_arm_func.h | 86 -
.../kernel/central-arm-func/sum_arm_func.h | 153 -
.../central-arm-func/transpose_arm_func.h | 70 -
.../operators/kernel/cl/batchnorm_kernel.cpp | 111 -
.../operators/kernel/cl/box_coder_kernel.cpp | 78 -
.../kernel/cl/cl-kernel-func/conv_func.cpp | 760 --
.../kernel/cl/cl-kernel-func/conv_func.h | 77 -
.../kernel/cl/cl_kernel/batchnorm_kernel.cl | 37 -
.../kernel/cl/cl_kernel/box_coder_kernel.cl | 147 -
.../kernel/cl/cl_kernel/channel_add_kernel.cl | 51 -
.../operators/kernel/cl/cl_kernel/cl_common.h | 34 -
.../kernel/cl/cl_kernel/concat_kernel.cl | 291 -
.../kernel/cl/cl_kernel/conv_kernel.cl | 15 -
.../kernel/cl/cl_kernel/conv_kernel.inc.cl | 2801 -----
.../cl/cl_kernel/conv_transpose_kernel.cl | 443 -
.../cl/cl_kernel/density_prior_box_kernel.cl | 114 -
.../depthwise_conv_add_bn_relu_kernel.cl | 18 -
.../cl/cl_kernel/depthwise_conv_kernel.cl | 15 -
.../kernel/cl/cl_kernel/dropout_kernel.cl | 42 -
.../cl/cl_kernel/elementwise_add_kernel.cl | 27 -
.../kernel/cl/cl_kernel/exp_kernel.cl | 34 -
.../kernel/cl/cl_kernel/feed_kernel.cl | 62 -
.../kernel/cl/cl_kernel/fetch_kernel.cl | 69 -
.../kernel/cl/cl_kernel/flatten2_kernel.cl | 48 -
.../cl/cl_kernel/instancenorm_kernel.cl | 119 -
.../kernel/cl/cl_kernel/leakyrelu_kernel.cl | 38 -
.../kernel/cl/cl_kernel/lrn_kernel.cl | 136 -
.../cl/cl_kernel/nearest_interp_kernel.cl | 37 -
.../kernel/cl/cl_kernel/pad2d_kernel.cl | 57 -
.../kernel/cl/cl_kernel/pool_kernel.cl | 95 -
.../kernel/cl/cl_kernel/prior_box_kernel.cl | 129 -
.../src/operators/kernel/cl/cl_kernel/relu.cl | 58 -
.../operators/kernel/cl/cl_kernel/relu6.cl | 32 -
.../operators/kernel/cl/cl_kernel/reshape.cl | 202 -
.../kernel/cl/cl_kernel/scale_kernel.cl | 35 -
.../operators/kernel/cl/cl_kernel/sigmoid.cl | 34 -
.../kernel/cl/cl_kernel/slice_kernel.cl | 77 -
.../operators/kernel/cl/cl_kernel/softmax.cl | 92 -
.../kernel/cl/cl_kernel/tanh_kernel.cl | 31 -
.../kernel/cl/cl_kernel/transpose_kernel.cl | 169 -
.../src/operators/kernel/cl/concat_kernel.cpp | 196 -
.../kernel/cl/conv_add_bn_relu_kernel.cpp | 230 -
.../operators/kernel/cl/conv_add_kernel.cpp | 142 -
.../kernel/cl/conv_add_relu_kernel.cpp | 144 -
.../kernel/cl/conv_bn_add_relu_kernel.cpp | 184 -
.../kernel/cl/conv_bn_relu_kernel.cpp | 183 -
.../src/operators/kernel/cl/conv_kernel.cpp | 130 -
.../operators/kernel/cl/conv_relu_kernel.cpp | 136 -
.../kernel/cl/conv_transpose_kernel.cpp | 71 -
.../kernel/cl/density_prior_box_kernel.cpp | 156 -
.../kernel/cl/depthwise_conv_kernel.cpp | 96 -
.../operators/kernel/cl/dropout_kernel.cpp | 59 -
.../kernel/cl/dwconv_bn_relu_kernel.cpp | 176 -
.../kernel/cl/elementwise_add_kernel.cpp | 129 -
mobile/src/operators/kernel/cl/exp_kernel.cpp | 52 -
.../src/operators/kernel/cl/feed_kernel.cpp | 78 -
.../src/operators/kernel/cl/fetch_kernel.cpp | 101 -
.../operators/kernel/cl/flatten2_kernel.cpp | 79 -
.../operators/kernel/cl/fusion_fc_kernel.cpp | 123 -
mobile/src/operators/kernel/cl/gen_code.py | 208 -
.../kernel/cl/instancenorm_kernel.cpp | 94 -
.../kernel/cl/instancenorm_relu_kernel.cpp | 95 -
.../operators/kernel/cl/leakyrelu_kernel.cpp | 59 -
mobile/src/operators/kernel/cl/lrn_kernel.cpp | 79 -
mobile/src/operators/kernel/cl/mul_kernel.cpp | 88 -
.../kernel/cl/multiclass_nms_kernel.cpp | 340 -
.../kernel/cl/nearest_interp_kernel.cpp | 73 -
.../src/operators/kernel/cl/pad2d_kernel.cpp | 94 -
.../src/operators/kernel/cl/pool_kernel.cpp | 99 -
.../operators/kernel/cl/prior_box_kernel.cpp | 216 -
.../src/operators/kernel/cl/relu6_kernel.cpp | 53 -
.../src/operators/kernel/cl/relu_kernel.cpp | 72 -
.../operators/kernel/cl/reshape2_kernel.cpp | 150 -
.../operators/kernel/cl/reshape_kernel.cpp | 106 -
.../src/operators/kernel/cl/scale_kernel.cpp | 62 -
.../operators/kernel/cl/sigmoid_kernel.cpp | 50 -
.../src/operators/kernel/cl/slice_kernel.cpp | 64 -
.../operators/kernel/cl/softmax_kernel.cpp | 65 -
.../src/operators/kernel/cl/split_kernel.cpp | 116 -
.../src/operators/kernel/cl/tanh_kernel.cpp | 51 -
.../operators/kernel/cl/transpose2_kernel.cpp | 213 -
.../operators/kernel/cl/transpose_kernel.cpp | 134 -
mobile/src/operators/kernel/compare_kernel.h | 32 -
mobile/src/operators/kernel/concat_kernel.h | 37 -
.../kernel/conditional_block_kernel.h | 70 -
.../src/operators/kernel/conv_add_bn_kernel.h | 44 -
.../kernel/conv_add_bn_relu_kernel.h | 49 -
mobile/src/operators/kernel/conv_add_kernel.h | 49 -
.../operators/kernel/conv_add_relu_kernel.h | 44 -
.../kernel/conv_bn_add_relu_kernel.h | 44 -
mobile/src/operators/kernel/conv_bn_kernel.h | 44 -
.../operators/kernel/conv_bn_relu_kernel.h | 48 -
mobile/src/operators/kernel/conv_kernel.h | 41 -
.../src/operators/kernel/conv_relu_kernel.h | 42 -
.../operators/kernel/conv_transpose_kernel.h | 39 -
mobile/src/operators/kernel/crf_kernel.h | 37 -
.../operators/kernel/deconv_add_bn_kernel.h | 39 -
.../kernel/deconv_add_bn_relu_kernel.h | 39 -
.../src/operators/kernel/deconv_add_kernel.h | 39 -
.../operators/kernel/deconv_add_relu_kernel.h | 39 -
.../operators/kernel/deconv_bn_relu_kernel.h | 39 -
.../src/operators/kernel/deconv_relu_kernel.h | 39 -
.../src/operators/kernel/dequant_bn_kernel.h | 48 -
.../src/operators/kernel/dequantize_kernel.h | 36 -
.../src/operators/kernel/detection_kernel.h | 232 -
mobile/src/operators/kernel/dropout_kernel.h | 35 -
.../operators/kernel/dwconv_bn_relu_kernel.h | 44 -
.../operators/kernel/elementwise_add_kernel.h | 39 -
.../kernel/elementwise_add_relu_kernel.h | 38 -
.../operators/kernel/elementwise_mul_kernel.h | 36 -
.../operators/kernel/elementwise_sub_kernel.h | 38 -
mobile/src/operators/kernel/exp_kernel.h | 24 -
mobile/src/operators/kernel/fc_relu_kernel.h | 37 -
mobile/src/operators/kernel/feed_kernel.h | 32 -
mobile/src/operators/kernel/fetch_kernel.h | 34 -
mobile/src/operators/kernel/flatten2_kernel.h | 28 -
mobile/src/operators/kernel/flatten_kernel.h | 37 -
.../kernel/fpga/KD/conv_add_bn_kernel.cpp | 47 -
.../kernel/fpga/KD/conv_add_kernel.cpp | 34 -
.../kernel/fpga/KD/conv_add_relu_kernel.cpp | 34 -
.../kernel/fpga/KD/conv_bn_kernel.cpp | 69 -
.../kernel/fpga/KD/conv_bn_relu_kernel.cpp | 76 -
.../fpga/KD/elementwise_add_relu_kernel.cpp | 60 -
.../operators/kernel/fpga/KD/feed_kernel.cpp | 65 -
.../operators/kernel/fpga/KD/fetch_kernel.cpp | 55 -
.../kernel/fpga/KD/fusion_fc_kernel.cpp | 56 -
.../operators/kernel/fpga/KD/pool_kernel.cpp | 62 -
.../kernel/fpga/KD/softmax_kernel.cpp | 55 -
.../fpga/V1/anchor_generator_kernel.cpp | 88 -
.../kernel/fpga/V1/concat_kernel.cpp | 69 -
.../kernel/fpga/V1/conv_add_bn_kernel.cpp | 86 -
.../fpga/V1/conv_add_bn_relu_kernel.cpp | 100 -
.../kernel/fpga/V1/conv_add_kernel.cpp | 63 -
.../kernel/fpga/V1/conv_add_relu_kernel.cpp | 63 -
.../kernel/fpga/V1/conv_bn_kernel.cpp | 75 -
.../kernel/fpga/V1/conv_bn_relu_kernel.cpp | 85 -
.../operators/kernel/fpga/V1/conv_kernel.cpp | 56 -
.../kernel/fpga/V1/conv_transpose_kernel.cpp | 89 -
.../kernel/fpga/V1/deconv_add_bn_kernel.cpp | 90 -
.../fpga/V1/deconv_add_bn_relu_kernel.cpp | 91 -
.../kernel/fpga/V1/deconv_add_kernel.cpp | 90 -
.../kernel/fpga/V1/deconv_add_relu_kernel.cpp | 91 -
.../kernel/fpga/V1/deconv_bn_relu_kernel.cpp | 108 -
.../kernel/fpga/V1/dropout_kernel.cpp | 34 -
.../kernel/fpga/V1/elementwise_add_kernel.cpp | 191 -
.../fpga/V1/elementwise_add_relu_kernel.cpp | 72 -
.../kernel/fpga/V1/elementwise_mul_kernel.cpp | 93 -
.../operators/kernel/fpga/V1/feed_kernel.cpp | 108 -
.../operators/kernel/fpga/V1/fetch_kernel.cpp | 127 -
.../kernel/fpga/V1/fusion_fc_kernel.cpp | 74 -
.../kernel/fpga/V1/fusion_fc_relu_kernel.cpp | 75 -
.../operators/kernel/fpga/V1/pad2d_kernel.cpp | 60 -
.../operators/kernel/fpga/V1/pool_kernel.cpp | 104 -
.../kernel/fpga/V1/proposal_kernel.cpp | 567 -
.../kernel/fpga/V1/psroi_pool_kernel.cpp | 284 -
.../operators/kernel/fpga/V1/relu_kernel.cpp | 35 -
.../kernel/fpga/V1/reshape2_kernel.cpp | 127 -
.../kernel/fpga/V1/reshape_kernel.cpp | 40 -
.../kernel/fpga/V1/roialign_pool_kernel.cpp | 296 -
.../kernel/fpga/V1/sigmoid_kernel.cpp | 54 -
.../operators/kernel/fpga/V1/slice_kernel.cpp | 63 -
.../kernel/fpga/V1/softmax_kernel.cpp | 138 -
.../operators/kernel/fpga/V1/split_kernel.cpp | 74 -
.../operators/kernel/fpga/V1/tanh_kernel.cpp | 79 -
.../kernel/fpga/V1/transpose2_kernel.cpp | 55 -
.../fpga/V2/anchor_generator_kernel.cpp | 87 -
.../kernel/fpga/V2/concat_kernel.cpp | 69 -
.../kernel/fpga/V2/conv_add_bn_kernel.cpp | 89 -
.../fpga/V2/conv_add_bn_relu_kernel.cpp | 104 -
.../kernel/fpga/V2/conv_add_kernel.cpp | 64 -
.../kernel/fpga/V2/conv_add_relu_kernel.cpp | 64 -
.../kernel/fpga/V2/conv_bn_kernel.cpp | 76 -
.../kernel/fpga/V2/conv_bn_relu_kernel.cpp | 93 -
.../operators/kernel/fpga/V2/conv_kernel.cpp | 58 -
.../kernel/fpga/V2/conv_transpose_kernel.cpp | 94 -
.../kernel/fpga/V2/deconv_add_bn_kernel.cpp | 98 -
.../fpga/V2/deconv_add_bn_relu_kernel.cpp | 98 -
.../kernel/fpga/V2/deconv_add_kernel.cpp | 98 -
.../kernel/fpga/V2/deconv_add_relu_kernel.cpp | 93 -
.../kernel/fpga/V2/deconv_bn_relu_kernel.cpp | 114 -
.../kernel/fpga/V2/dropout_kernel.cpp | 34 -
.../kernel/fpga/V2/elementwise_add_kernel.cpp | 71 -
.../fpga/V2/elementwise_add_relu_kernel.cpp | 69 -
.../kernel/fpga/V2/elementwise_mul_kernel.cpp | 93 -
.../operators/kernel/fpga/V2/feed_kernel.cpp | 64 -
.../operators/kernel/fpga/V2/fetch_kernel.cpp | 118 -
.../kernel/fpga/V2/fusion_fc_kernel.cpp | 75 -
.../kernel/fpga/V2/fusion_fc_relu_kernel.cpp | 76 -
.../operators/kernel/fpga/V2/pool_kernel.cpp | 106 -
.../kernel/fpga/V2/proposal_kernel.cpp | 501 -
.../kernel/fpga/V2/psroi_pool_kernel.cpp | 202 -
.../operators/kernel/fpga/V2/relu_kernel.cpp | 33 -
.../kernel/fpga/V2/reshape2_kernel.cpp | 128 -
.../kernel/fpga/V2/reshape_kernel.cpp | 40 -
.../kernel/fpga/V2/roialign_pool_kernel.cpp | 296 -
.../kernel/fpga/V2/sigmoid_kernel.cpp | 57 -
.../operators/kernel/fpga/V2/slice_kernel.cpp | 63 -
.../kernel/fpga/V2/softmax_kernel.cpp | 123 -
.../operators/kernel/fpga/V2/split_kernel.cpp | 74 -
.../operators/kernel/fpga/V2/tanh_kernel.cpp | 79 -
.../kernel/fpga/V2/transpose2_kernel.cpp | 55 -
.../src/operators/kernel/fusion_fc_kernel.h | 37 -
mobile/src/operators/kernel/gru_kernel.h | 37 -
mobile/src/operators/kernel/gru_unit_kernel.h | 35 -
.../src/operators/kernel/im2sequence_kernel.h | 38 -
.../src/operators/kernel/increment_kernel.h | 36 -
.../operators/kernel/instancenorm_kernel.h | 37 -
.../kernel/instancenorm_relu_kernel.h | 42 -
mobile/src/operators/kernel/is_empty_kernel.h | 36 -
mobile/src/operators/kernel/kernels.h | 36 -
mobile/src/operators/kernel/logical_kernel.h | 42 -
mobile/src/operators/kernel/lookup_kernel.h | 37 -
mobile/src/operators/kernel/lrn_kernel.h | 181 -
mobile/src/operators/kernel/mul_kernel.h | 38 -
.../operators/kernel/multiclass_nms_kernel.h | 37 -
.../operators/kernel/nearest_interp_kernel.h | 38 -
mobile/src/operators/kernel/norm_kernel.h | 36 -
mobile/src/operators/kernel/one_hot_kernel.h | 51 -
mobile/src/operators/kernel/pad2d_kernel.h | 54 -
.../kernel/polygon_box_transform_kernel.h | 36 -
mobile/src/operators/kernel/pool_kernel.h | 35 -
mobile/src/operators/kernel/prelu_kernel.h | 30 -
.../src/operators/kernel/prior_box_kernel.h | 114 -
mobile/src/operators/kernel/quantize_kernel.h | 36 -
mobile/src/operators/kernel/range_kernel.cpp | 49 -
mobile/src/operators/kernel/range_kernel.h | 71 -
.../operators/kernel/reduce_prod_kernel.cpp | 65 -
.../src/operators/kernel/reduce_prod_kernel.h | 65 -
mobile/src/operators/kernel/reshape2_kernel.h | 36 -
mobile/src/operators/kernel/reshape_kernel.h | 80 -
mobile/src/operators/kernel/resize_kernel.h | 82 -
mobile/src/operators/kernel/scale_kernel.h | 35 -
.../src/operators/kernel/sequence_kernels.h | 36 -
mobile/src/operators/kernel/shape_kernel.h | 37 -
mobile/src/operators/kernel/slice_kernel.h | 31 -
mobile/src/operators/kernel/softmax_kernel.h | 36 -
mobile/src/operators/kernel/split_kernel.h | 37 -
mobile/src/operators/kernel/sum_kernel.h | 35 -
mobile/src/operators/kernel/tanh_kernel.h | 37 -
.../kernel/tensor_array_read_write_kernel.h | 32 -
.../src/operators/kernel/transpose2_kernel.h | 37 -
.../src/operators/kernel/transpose_kernel.h | 37 -
mobile/src/operators/kernel/while_kernel.h | 47 -
mobile/src/operators/lod_reset_op.cpp | 41 -
mobile/src/operators/lod_reset_op.h | 32 -
mobile/src/operators/logical_op.cpp | 69 -
mobile/src/operators/logical_op.h | 42 -
mobile/src/operators/lookup_op.cpp | 66 -
mobile/src/operators/lookup_op.h | 46 -
mobile/src/operators/lrn_op.cpp | 39 -
mobile/src/operators/lrn_op.h | 46 -
mobile/src/operators/math/activation.h | 187 -
.../math/depthwise/faster_depthwise_conv3x3.h | 34 -
.../depthwise/faster_depthwise_conv3x3p1.cpp | 2011 ----
.../src/operators/math/depthwise_conv3x3.cpp | 1060 --
mobile/src/operators/math/depthwise_conv3x3.h | 47 -
.../operators/math/depthwise_conv3x3_int8.cpp | 1660 ---
.../src/operators/math/depthwise_conv5x5.cpp | 1106 --
mobile/src/operators/math/depthwise_conv5x5.h | 47 -
.../operators/math/depthwise_conv5x5_int8.cpp | 1041 --
mobile/src/operators/math/element_wise.h | 396 -
.../operators/math/elementwise_op_function.h | 178 -
mobile/src/operators/math/gemm.cpp | 3807 -------
mobile/src/operators/math/gemm.h | 492 -
mobile/src/operators/math/gemm/cblas.cc | 50 -
mobile/src/operators/math/gemm/cblas.h | 32 -
mobile/src/operators/math/gemm/executor.h | 266 -
mobile/src/operators/math/gemm/gemm1x1s1.cpp | 2221 ----
mobile/src/operators/math/gemm/gemm1x1s1.h | 81 -
mobile/src/operators/math/gemm/gemm_kernel.h | 792 --
mobile/src/operators/math/gemm/pack_kernel.h | 801 --
mobile/src/operators/math/gemm/strategy.h | 120 -
mobile/src/operators/math/gemm_int8.cpp | 2077 ----
mobile/src/operators/math/gemm_omp_int8.cpp | 453 -
mobile/src/operators/math/gpc.cpp | 2142 ----
mobile/src/operators/math/gpc.h | 222 -
mobile/src/operators/math/gru_compute.cpp | 56 -
mobile/src/operators/math/gru_compute.h | 40 -
mobile/src/operators/math/gru_cpu_kernel.h | 203 -
mobile/src/operators/math/im2col.cpp | 668 --
mobile/src/operators/math/im2col.h | 129 -
mobile/src/operators/math/math.h | 342 -
mobile/src/operators/math/math_function.cpp | 176 -
mobile/src/operators/math/math_function.h | 62 -
.../src/operators/math/math_function_int8.cpp | 109 -
mobile/src/operators/math/pad.cpp | 54 -
mobile/src/operators/math/pad.h | 32 -
mobile/src/operators/math/poly_util.cpp | 120 -
mobile/src/operators/math/poly_util.h | 70 -
mobile/src/operators/math/pooling.cpp | 82 -
mobile/src/operators/math/pooling.h | 199 -
mobile/src/operators/math/pooling2x2.cpp | 791 --
mobile/src/operators/math/pooling3x3.cpp | 1317 ---
mobile/src/operators/math/quantize.h | 108 -
.../operators/math/selected_rows_functor.h | 174 -
mobile/src/operators/math/sequence2batch.cpp | 60 -
mobile/src/operators/math/sequence2batch.h | 169 -
.../operators/math/slidingwindow_conv3x3.cpp | 5668 ----
.../operators/math/slidingwindow_conv3x3.h | 51 -
.../operators/math/slidingwindow_utils.cpp | 365 -
.../src/operators/math/slidingwindow_utils.h | 159 -
mobile/src/operators/math/softmax.cpp | 157 -
mobile/src/operators/math/softmax.h | 42 -
mobile/src/operators/math/transform.h | 55 -
mobile/src/operators/math/vol2col.cpp | 147 -
mobile/src/operators/math/vol2col.h | 94 -
.../math/winograd/winograd_transform.h | 42 -
.../math/winograd/winograd_transform_f6k3.cpp | 1681 ---
mobile/src/operators/mul_op.cpp | 67 -
mobile/src/operators/mul_op.h | 46 -
mobile/src/operators/multiclass_nms_op.cpp | 50 -
mobile/src/operators/multiclass_nms_op.h | 50 -
mobile/src/operators/nearest_interp_op.cpp | 56 -
mobile/src/operators/nearest_interp_op.h | 50 -
mobile/src/operators/norm_op.cpp | 51 -
mobile/src/operators/norm_op.h | 47 -
mobile/src/operators/one_hot_op.cpp | 43 -
mobile/src/operators/one_hot_op.h | 31 -
mobile/src/operators/op_param.cpp | 98 -
mobile/src/operators/op_param.h | 3566 ------
mobile/src/operators/pad2d_op.cpp | 46 -
mobile/src/operators/pad2d_op.h | 32 -
.../operators/polygon_box_transform_op.cpp | 45 -
.../src/operators/polygon_box_transform_op.h | 56 -
mobile/src/operators/pool_op.cpp | 73 -
mobile/src/operators/pool_op.h | 46 -
mobile/src/operators/prelu_op.cpp | 40 -
mobile/src/operators/prelu_op.h | 49 -
mobile/src/operators/prior_box_op.cpp | 101 -
mobile/src/operators/prior_box_op.h | 34 -
mobile/src/operators/quantize_op.cpp | 39 -
mobile/src/operators/quantize_op.h | 45 -
mobile/src/operators/range_op.cpp | 45 -
mobile/src/operators/range_op.h | 33 -
mobile/src/operators/reduce_prod_op.cpp | 86 -
mobile/src/operators/reduce_prod_op.h | 33 -
mobile/src/operators/reshape2_op.cpp | 100 -
mobile/src/operators/reshape2_op.h | 53 -
mobile/src/operators/reshape_op.cpp | 45 -
mobile/src/operators/reshape_op.h | 49 -
mobile/src/operators/resize_op.cpp | 36 -
mobile/src/operators/resize_op.h | 48 -
mobile/src/operators/scale_op.cpp | 38 -
mobile/src/operators/scale_op.h | 49 -
.../sequence_ops/sequence_expand_op.cpp | 56 -
.../sequence_ops/sequence_expand_op.h | 47 -
.../sequence_ops/sequence_pool_op.cpp | 38 -
.../operators/sequence_ops/sequence_pool_op.h | 46 -
.../sequence_ops/sequence_softmax_op.cpp | 39 -
.../sequence_ops/sequence_softmax_op.h | 47 -
mobile/src/operators/shape_op.cpp | 38 -
mobile/src/operators/shape_op.h | 47 -
mobile/src/operators/slice_op.cpp | 109 -
mobile/src/operators/slice_op.h | 49 -
mobile/src/operators/softmax_op.cpp | 40 -
mobile/src/operators/softmax_op.h | 45 -
mobile/src/operators/split_op.cpp | 93 -
mobile/src/operators/split_op.h | 46 -
mobile/src/operators/sum_op.cpp | 67 -
mobile/src/operators/sum_op.h | 49 -
mobile/src/operators/top_k_op.cpp | 44 -
mobile/src/operators/top_k_op.h | 45 -
mobile/src/operators/transpose2_op.cpp | 121 -
mobile/src/operators/transpose2_op.h | 52 -
mobile/src/operators/transpose_op.cpp | 62 -
mobile/src/operators/transpose_op.h | 48 -
mobile/src/pass/memory_optimize.cpp | 170 -
mobile/src/pass/memory_optimize.h | 62 -
mobile/src/pass/memory_optimize_super.cpp | 209 -
mobile/src/pass/memory_optimize_super.h | 70 -
mobile/src/pass/model_obfuscate.cpp | 36 -
mobile/src/pass/model_obfuscate.h | 36 -
mobile/src/pass/pass_base.h | 27 -
mobile/src/protobuf-c/protobuf-c.cpp | 2249 ----
mobile/src/protobuf-c/protobuf-c.h | 962 --
mobile/test/CMakeLists.txt | 542 -
mobile/test/common/test_enforce.cpp | 21 -
mobile/test/common/test_gemm_accuracy.cpp | 131 -
.../test/common/test_gemm_int8_accuracy.cpp | 346 -
mobile/test/common/test_gemm_perf.cpp | 164 -
mobile/test/common/test_lib_size.cpp | 21 -
mobile/test/common/test_lib_size.h | 97 -
mobile/test/common/test_log.cpp | 34 -
mobile/test/common/test_openmp.cpp | 29 -
mobile/test/executor_for_test.h | 138 -
mobile/test/fpga/test_concat_op.cpp | 87 -
mobile/test/fpga/test_densebox_combine.cpp | 49 -
mobile/test/fpga/test_format_data.cpp | 93 -
mobile/test/fpga/test_marker.cpp | 125 -
mobile/test/fpga/test_marker2.cpp | 181 -
mobile/test/fpga/test_marker_api.cpp | 241 -
mobile/test/fpga/test_mobilenet_api.cpp | 158 -
mobile/test/fpga/test_pe.cpp | 111 -
mobile/test/fpga/test_resnet50.cpp | 140 -
mobile/test/fpga/test_rfcn.cpp | 152 -
mobile/test/fpga/test_rfcn_api.cpp | 172 -
mobile/test/fpga/test_ssd.cpp | 46 -
mobile/test/fpga/test_tensor_quant.cpp | 45 -
mobile/test/fpga/test_yolo_api.cpp | 158 -
mobile/test/framework/test_inference_api.cpp | 62 -
mobile/test/framework/test_load.cpp | 34 -
mobile/test/framework/test_load_memory.cpp | 68 -
.../test_load_memory_inference_api.cpp | 80 -
mobile/test/framework/test_optimize.cpp | 33 -
mobile/test/net/test_alexnet.cpp | 59 -
mobile/test/net/test_benchmark.cpp | 79 -
mobile/test/net/test_eng.cpp | 50 -
mobile/test/net/test_genet_combine.cpp | 51 -
mobile/test/net/test_gesture.cpp | 97 -
mobile/test/net/test_googlenet.cpp | 85 -
mobile/test/net/test_googlenet_quali.cpp | 55 -
mobile/test/net/test_googlenetv1_combine.cpp | 60 -
mobile/test/net/test_inceptionv4.cpp | 59 -
mobile/test/net/test_mobilenet+ssd.cpp | 48 -
mobile/test/net/test_mobilenet.cpp | 60 -
mobile/test/net/test_mobilenet_025_fssd.cpp | 61 -
mobile/test/net/test_mobilenet_GPU.cpp | 64 -
mobile/test/net/test_mobilenet_combine.cpp | 59 -
.../test/net/test_multi_inference_predict.cpp | 104 -
mobile/test/net/test_net.cpp | 272 -
mobile/test/net/test_net_benchmark.cpp | 59 -
mobile/test/net/test_nlp.cpp | 94 -
mobile/test/net/test_ocr.cpp | 108 -
mobile/test/net/test_op_in_net.cpp | 125 -
mobile/test/net/test_resnet.cpp | 73 -
mobile/test/net/test_squeezenet.cpp | 49 -
mobile/test/net/test_super.cpp | 119 -
mobile/test/net/test_vgg16ssd.cpp | 46 -
mobile/test/net/test_wrap.cpp | 65 -
mobile/test/net/test_yolo.cpp | 50 -
mobile/test/net/test_yolo_combined.cpp | 53 -
mobile/test/net/test_yologpu.cpp | 190 -
mobile/test/operators/test_batchnorm_op.cpp | 122 -
mobile/test/operators/test_box_coder_op.cpp | 196 -
mobile/test/operators/test_cast_op.cpp | 126 -
mobile/test/operators/test_concat_op.cpp | 136 -
.../test/operators/test_conv_add_relu_op.cpp | 45 -
.../test/operators/test_conv_bn_relu_op.cpp | 172 -
mobile/test/operators/test_conv_gpu.cpp | 199 -
mobile/test/operators/test_conv_op.cpp | 358 -
.../test/operators/test_depthwise_conv_op.cpp | 45 -
mobile/test/operators/test_dequantize_op.cpp | 76 -
.../test/operators/test_dwconv_bn_relu_op.cpp | 145 -
.../operators/test_elementwise_add_op.cpp | 62 -
.../operators/test_elementwise_sub_op.cpp | 157 -
.../test/operators/test_fill_constant_op.cpp | 112 -
.../test_fusion_conv_add_bn_relu_op.cpp | 63 -
mobile/test/operators/test_fusion_fc_op.cpp | 166 -
mobile/test/operators/test_gru_op.cpp | 100 -
mobile/test/operators/test_im2sequence_op.cpp | 137 -
mobile/test/operators/test_increment_op.cpp | 75 -
mobile/test/operators/test_is_empty_op.cpp | 69 -
mobile/test/operators/test_leaky_relu_op.cpp | 80 -
mobile/test/operators/test_less_than_op.cpp | 122 -
mobile/test/operators/test_log_op.cpp | 80 -
mobile/test/operators/test_logical_and_op.cpp | 84 -
mobile/test/operators/test_logical_not_op.cpp | 76 -
mobile/test/operators/test_logical_or_op.cpp | 84 -
mobile/test/operators/test_logical_xor_op.cpp | 86 -
mobile/test/operators/test_lrn_op.cpp | 83 -
mobile/test/operators/test_mul_op.cpp | 102 -
.../test/operators/test_multiclass_nms_op.cpp | 162 -
.../test_polygon_box_transform_op.cpp | 125 -
mobile/test/operators/test_pool_op.cpp | 231 -
mobile/test/operators/test_prelu_op.cpp | 58 -
mobile/test/operators/test_prior_box_op.cpp | 152 -
mobile/test/operators/test_quantize_op.cpp | 153 -
mobile/test/operators/test_relu6_op.cpp | 83 -
mobile/test/operators/test_relu_op.cpp | 82 -
mobile/test/operators/test_reshape2_op.cpp | 142 -
mobile/test/operators/test_reshape_op.cpp | 47 -
mobile/test/operators/test_resize_op.cpp | 47 -
mobile/test/operators/test_scale_op.cpp | 18 -
.../operators/test_sequence_expand_op.cpp | 97 -
.../test/operators/test_sequence_pool_op.cpp | 293 -
.../operators/test_sequence_softmax_op.cpp | 100 -
mobile/test/operators/test_sigmoid_op.cpp | 80 -
mobile/test/operators/test_slice_op.cpp | 18 -
mobile/test/operators/test_softmax_op.cpp | 100 -
mobile/test/operators/test_sum_op.cpp | 131 -
mobile/test/operators/test_tanh_op.cpp | 81 -
mobile/test/operators/test_topk_op.cpp | 139 -
mobile/test/operators/test_transpose2_op.cpp | 143 -
mobile/test/operators/test_transpose_op.cpp | 49 -
mobile/test/test_helper.h | 147 -
mobile/test/test_include.h | 39 -
.../third_party/opencl/OpenCL-Headers/CL/cl.h | 1782 ---
.../opencl/OpenCL-Headers/CL/cl_d3d10.h | 130 -
.../opencl/OpenCL-Headers/CL/cl_d3d11.h | 130 -
.../OpenCL-Headers/CL/cl_dx9_media_sharing.h | 131 -
.../CL/cl_dx9_media_sharing_intel.h | 181 -
.../opencl/OpenCL-Headers/CL/cl_egl.h | 136 -
.../opencl/OpenCL-Headers/CL/cl_ext.h | 723 --
.../opencl/OpenCL-Headers/CL/cl_ext_intel.h | 428 -
.../opencl/OpenCL-Headers/CL/cl_gl.h | 175 -
.../opencl/OpenCL-Headers/CL/cl_gl_ext.h | 74 -
.../opencl/OpenCL-Headers/CL/cl_platform.h | 1460 ---
.../CL/cl_va_api_media_sharing_intel.h | 171 -
.../opencl/OpenCL-Headers/CL/cl_version.h | 86 -
.../opencl/OpenCL-Headers/CL/opencl.h | 58 -
.../third_party/opencl/OpenCL-Headers/LICENSE | 25 -
.../opencl/OpenCL-Headers/README.md | 50 -
.../android-cmake/android.toolchain.cmake | 784 --
.../android-debug-script/push2android.sh | 42 -
.../android-debug-script/run_on_android.sh | 37 -
mobile/tools/arm-platform.cmake | 9 -
mobile/tools/build.sh | 225 -
mobile/tools/ci_build.sh | 270 -
mobile/tools/ci_run_test.sh | 43 -
mobile/tools/docker_build_fpga.sh | 7 -
mobile/tools/ios-cmake/ios.toolchain.cmake | 216 -
mobile/tools/net-detail.awk | 91 -
mobile/tools/net.awk | 27 -
mobile/tools/op.cmake | 753 --
.../tools/pre-commit.hooks/clang-format.hook | 23 -
mobile/tools/pre-commit.hooks/clang-tidy.hook | 18 -
mobile/tools/pre-commit.hooks/copyright.hook | 124 -
mobile/tools/pre-commit.hooks/cpplint.hook | 13 -
mobile/tools/prepare_images_and_models.sh | 20 -
mobile/tools/profile_show.sh | 138 -
mobile/tools/python/caffetools/run.py | 30 -
mobile/tools/python/fluidtools/.gitignore | 5 -
mobile/tools/python/fluidtools/run.py | 639 --
mobile/tools/python/fluidtools/test_wrap.py | 546 -
mobile/tools/python/imagetools/README.md | 24 -
mobile/tools/python/imagetools/imagetools.py | 71 -
mobile/tools/python/imagetools/img2nchw.py | 88 -
mobile/tools/python/imagetools/img2nhwc.py | 34 -
.../tools/python/imagetools/numpy2binary.py | 60 -
mobile/tools/python/misc/.gitignore | 4 -
mobile/tools/python/misc/fluidtools.py | 175 -
mobile/tools/python/misc/ios-test-server.py | 126 -
mobile/tools/python/misc/restore-git.py | 54 -
.../python/misc/test-fluid-op-feature.py | 13 -
mobile/tools/python/modeltools/.gitignore | 109 -
.../tools/python/modeltools/core/__init__.py | 0
.../python/modeltools/core/framework.proto | 176 -
.../python/modeltools/core/framework_pb2.py | 1141 --
.../tools/python/modeltools/core/op_types.py | 93 -
.../python/modeltools/mobilenet/__init__.py | 0
.../mobilenet/converter_mobilenet.py | 509 -
.../python/modeltools/mobilenet/swicher.py | 119 -
.../tools/python/modeltools/tools/__init__.py | 0
.../modeltools/tools/float2halffloat.py | 70 -
.../tools/python/modeltools/tools/loader.py | 11 -
.../python/modeltools/tools/model_combine.py | 19 -
.../python/modeltools/tools/model_reader.py | 30 -
.../tools/python/modeltools/yolo/__init__.py | 0
.../tools/python/modeltools/yolo/mdl2fluid.py | 333 -
.../tools/python/modeltools/yolo/swicher.py | 115 -
mobile/tools/quantification/CMakeLists.txt | 12 -
mobile/tools/quantification/README.md | 37 -
mobile/tools/quantification/convert.cpp | 438 -
.../quantification/src/block_desc_local.cpp | 48 -
.../quantification/src/block_desc_local.h | 56 -
mobile/tools/quantification/src/enforce.h | 67 -
.../tools/quantification/src/framework.pb-c.c | 1403 ---
.../tools/quantification/src/framework.pb-c.h | 579 -
.../tools/quantification/src/program_desc.cpp | 30 -
.../tools/quantification/src/program_desc.h | 41 -
mobile/tools/quantification/src/protobuf-c.c | 2098 ----
mobile/tools/quantification/src/protobuf-c.h | 921 --
mobile/tools/quantification/src/tensor_desc.h | 72 -
mobile/tools/quantification/src/var_desc.h | 80 -
mobile/tools/shell/change_mobile_namespace.sh | 39 -
mobile/tools/shell/check-bitcode.sh | 34 -
mobile/tools/shell/check-filename.sh | 41 -
.../tools/shell/generate-include/.gitignore | 2 -
.../generate-include/check_include_diff.sh | 30 -
mobile/tools/shell/generate-include/main.cpp | 6 -
mobile/tools/shell/generate-include/parse.py | 21 -
mobile/tools/shell/generate-include/run.sh | 9 -
mobile/tools/shell/merge.sh | 60 -
mobile/tools/shell/prune_static_library.sh | 41 -
mobile/tools/shell/restore-private-repo.sh | 5 -
.../tools/toolchains/arm-android-neon.cmake | 5 -
.../tools/toolchains/arm-linux-gnueabi.cmake | 16 -
.../toolchains/arm-linux-gnueabihf.cmake | 10 -
model_optimize_tool.md | 61 +
model_quantization.md | 305 +
npu.md | 125 +
opencl.md | 107 +
paddle-mobile.md | 1 +
roadmap.md | 28 +
source_compile.md | 286 +
source_compile.md.toc.2019-08-29_160045 | 20 +
support_operation_list.md | 196 +
tech_highlights.md | 45 +
third-party/gflags | 1 -
third-party/googletest | 1 -
third-party/protobuf-host | 1 -
third-party/protobuf-mobile | 1 -
tools/codestyle/.gitignore | 1 -
tools/codestyle/clang_format.hook | 15 -
tools/codestyle/copyright.hook | 121 -
tools/codestyle/cpplint_pre_commit.hook | 27 -
tools/codestyle/docstring_checker.py | 349 -
tools/codestyle/pylint_pre_commit.hook | 19 -
tools/codestyle/test_docstring_checker.py | 232 -
tools/document_preview.sh | 13 -
tutorial.md | 74 +
web/.editorconfig | 9 -
web/.gitignore | 78 -
web/.npmrc | 1 -
web/README.md | 46 -
web/README_cn.md | 43 -
web/demo/index.es6 | 457 -
web/demo/index.html | 39 -
web/demo/videoDemo.es6 | 57 -
web/demo/videoDemo.html | 36 -
web/package.json | 32 -
web/scripts/build.sh | 8 -
web/src/banana.jpeg | Bin 30262 -> 0 bytes
web/src/executor/camera.es6 | 142 -
web/src/executor/executor.es6 | 105 -
web/src/executor/loader.es6 | 423 -
web/src/executor/postProcess.es6 | 262 -
web/src/executor/runner.es6 | 153 -
web/src/factory/fshader/factory.es6 | 83 -
web/src/factory/fshader/ops.es6 | 166 -
web/src/feed/ImageFeed.es6 | 237 -
web/src/feed/dataFeed.es6 | 42 -
web/src/feed/io.es6 | 854 --
web/src/gpu/gpu.es6 | 550 -
web/src/index.es6 | 20 -
web/src/index.html | 13 -
web/src/runtime/runtime.es6 | 98 -
web/src/shader/atom/common_func.es6 | 35 -
web/src/shader/atom/common_params.es6 | 18 -
.../atom/getArrayIndexFromTensorPos.es6 | 15 -
.../atom/getArrayIndexFromTexturePos.es6 | 14 -
web/src/shader/atom/getOutputTensorPos.es6 | 16 -
.../shader/atom/getPixelsFromTexturePos.es6 | 9 -
.../atom/getRangePowSumFromArrayIndex.es6 | 15 -
.../shader/atom/getRangeSumFromArrayIndex.es6 | 15 -
.../atom/getTensorPosFromArrayIndex.es6 | 18 -
.../atom/getTexturePosFromArrayIndex.es6 | 25 -
web/src/shader/atom/getValueFromTensorPos.es6 | 32 -
.../atom/getValueFromTensorPosPacked.es6 | 26 -
.../shader/atom/getValueFromTexturePos.es6 | 22 -
web/src/shader/atom/moveTexture2PosToReal.es6 | 19 -
web/src/shader/atom/prefix.es6 | 18 -
web/src/shader/atom/prefix2.es6 | 22 -
web/src/shader/atom/prelu.es6 | 15 -
web/src/shader/atom/scale.es6 | 11 -
web/src/shader/atom/sigmoid.es6 | 12 -
web/src/shader/atom/softmax.es6 | 14 -
web/src/shader/atom/suffix.es6 | 42 -
web/src/shader/atom/type_ivec56.es6 | 23 -
web/src/shader/batchnorm/conf.es6 | 58 -
web/src/shader/batchnorm/main.es6 | 18 -
web/src/shader/batchnorm/params.es6 | 24 -
web/src/shader/conv2d/conf.es6 | 86 -
web/src/shader/conv2d/main.es6 | 48 -
web/src/shader/conv2d/params.es6 | 45 -
web/src/shader/conv2d_depthwise/conf.es6 | 67 -
web/src/shader/conv2d_depthwise/main.es6 | 42 -
web/src/shader/conv2d_depthwise/params.es6 | 43 -
.../shader/conv2d_elementwise_add/conf.es6 | 77 -
.../shader/conv2d_elementwise_add/main.es6 | 53 - .../shader/conv2d_elementwise_add/params.es6 | 54 - .../conv2d_elementwise_add_winograd/conf.es6 | 72 - .../conv2d_elementwise_add_winograd/main.es6 | 98 - .../params.es6 | 47 - web/src/shader/dynamic/conf.es6 | 35 - web/src/shader/dynamic/main.es6 | 14 - web/src/shader/dynamic/params.es6 | 9 - web/src/shader/elementwise_add/conf.es6 | 57 - web/src/shader/elementwise_add/main.es6 | 17 - web/src/shader/elementwise_add/params.es6 | 20 - web/src/shader/mul/conf.es6 | 57 - web/src/shader/mul/main.es6 | 18 - web/src/shader/mul/params.es6 | 27 - web/src/shader/pool2d/conf.es6 | 48 - web/src/shader/pool2d/main.es6 | 49 - web/src/shader/pool2d/params.es6 | 30 - web/src/shader/pool2d_avg/conf.es6 | 47 - web/src/shader/pool2d_avg/main.es6 | 40 - web/src/shader/pool2d_avg/params.es6 | 30 - web/src/shader/pool2d_max/conf.es6 | 47 - web/src/shader/pool2d_max/main.es6 | 45 - web/src/shader/pool2d_max/params.es6 | 29 - web/src/shader/pool2d_winograd/conf.es6 | 50 - web/src/shader/pool2d_winograd/main.es6 | 63 - web/src/shader/pool2d_winograd/params.es6 | 33 - web/src/shader/softmax/conf.es6 | 29 - web/src/shader/softmax/main.es6 | 55 - web/src/shader/softmax/params.es6 | 15 - web/src/shader/v_shader.es6 | 15 - web/src/shader/v_shader2.es6 | 15 - web/src/test/getMaxUniforms.es6 | 59 - web/src/utils/models.es6 | 46 - web/src/utils/opData.es6 | 407 - web/src/utils/tensor.es6 | 161 - web/src/utils/utils.es6 | 190 - web/tools/logger.es6 | 101 - web/tools/toBinaryFile.py | 111 - web/webpack.config.js | 48 - x2paddle.md | 44 + ...20\350\241\214\345\215\225\346\265\213.md" | 28 + 2757 files changed, 2974 insertions(+), 395109 deletions(-) delete mode 100644 CMakeLists.txt create mode 100644 Home.md delete mode 100644 README.md delete mode 100644 README_cn.md create mode 100644 add_new_operation.md create mode 100644 architecture-intro.md create mode 100644 architecture.md create mode 100644 benchmark.md create mode 100644 benchmark_tools.md create mode 100644 benchmark_tools.md.toc.2019-08-25_233116 create mode 100644 benchmark_tools.md.toc.2019-08-25_233528 delete mode 100644 cmake/FindGflags.cmake delete mode 100644 cmake/FindGlog.cmake delete mode 100644 cmake/FindGperftools.cmake delete mode 100644 cmake/FindJeMalloc.cmake delete mode 100644 cmake/FindNumPy.cmake delete mode 100644 cmake/cblas.cmake delete mode 100644 cmake/ccache.cmake delete mode 100644 cmake/configure.cmake delete mode 100644 cmake/coveralls.cmake delete mode 100644 cmake/coverallsGcovJsons.cmake delete mode 100644 cmake/cross_compiling/android.cmake delete mode 100644 cmake/cross_compiling/armlinux.cmake delete mode 100644 cmake/cross_compiling/findar.cmake delete mode 100644 cmake/cross_compiling/host.cmake delete mode 100644 cmake/cross_compiling/ios.cmake delete mode 100644 cmake/cross_compiling/npu.cmake delete mode 100644 cmake/cross_compiling/postproject.cmake delete mode 100644 cmake/cross_compiling/preproject.cmake delete mode 100644 cmake/cuda.cmake delete mode 100644 cmake/cudnn.cmake delete mode 100644 cmake/cupti.cmake delete mode 100644 cmake/external/eigen.cmake delete mode 100644 cmake/external/gflags.cmake delete mode 100644 cmake/external/glog.cmake delete mode 100644 cmake/external/gtest.cmake delete mode 100644 cmake/external/libxsmm.cmake delete mode 100644 cmake/external/mkldnn.cmake delete mode 100644 cmake/external/mklml.cmake delete mode 100644 cmake/external/openblas.cmake delete mode 100644 cmake/external/opencl-clhpp.cmake delete mode 100644 
cmake/external/opencl-headers.cmake delete mode 100644 cmake/external/protobuf.cmake delete mode 100644 cmake/external/xbyak.cmake delete mode 100644 cmake/external/xxhash.cmake delete mode 100644 cmake/flags.cmake delete mode 100644 cmake/generic.cmake delete mode 100644 cmake/hip.cmake delete mode 100644 cmake/lite.cmake delete mode 100644 cmake/lite_utils.cmake delete mode 100644 cmake/make_resource.py delete mode 100644 cmake/operators.cmake delete mode 100644 cmake/package.cmake delete mode 100644 cmake/simd.cmake delete mode 100644 cmake/system.cmake delete mode 100644 cmake/tensorrt.cmake delete mode 100644 cmake/util.cmake delete mode 100644 cmake/version.cmake create mode 100644 cpp_demo.md create mode 100644 cxx_api.md create mode 100644 debug_tools.md create mode 100644 demos.md.toc.2019-08-26_222115 create mode 100644 demos.md.toc.2019-08-26_222307 create mode 100644 for-developer.md create mode 100644 fpga.md create mode 100644 images/architecture.jpg create mode 100644 images/benchmark_result.png create mode 100644 images/img_mobilenetv1_inference.png create mode 100644 images/lite1.png create mode 100644 images/model_quan_fig.png create mode 100644 images/model_quan_table1.png create mode 100644 images/phone_list.png create mode 100644 images/run_benchmark.png create mode 100644 java_demo.md delete mode 100644 lite/CMakeLists.txt delete mode 100644 lite/api/CMakeLists.txt delete mode 100644 lite/api/_paddle_use_kernels.h delete mode 100644 lite/api/_paddle_use_ops.h delete mode 100644 lite/api/android/.gitignore delete mode 100644 lite/api/android/CMakeLists.txt delete mode 100644 lite/api/android/jni/.gitignore delete mode 100644 lite/api/android/jni/CMakeLists.txt delete mode 100644 lite/api/android/jni/native/CMakeLists.txt delete mode 100644 lite/api/android/jni/native/convert_util_jni.h delete mode 100644 lite/api/android/jni/native/paddle_lite_jni.cc delete mode 100644 lite/api/android/jni/native/paddle_lite_jni.h delete mode 100644 lite/api/android/jni/native/tensor_jni.cc delete mode 100644 lite/api/android/jni/native/tensor_jni.h delete mode 100644 lite/api/android/jni/src/com/baidu/paddle/lite/.gitignore delete mode 100644 lite/api/android/jni/src/com/baidu/paddle/lite/ConfigBase.java delete mode 100644 lite/api/android/jni/src/com/baidu/paddle/lite/CxxConfig.java delete mode 100644 lite/api/android/jni/src/com/baidu/paddle/lite/MobileConfig.java delete mode 100644 lite/api/android/jni/src/com/baidu/paddle/lite/PaddleLiteInitializer.java delete mode 100644 lite/api/android/jni/src/com/baidu/paddle/lite/PaddlePredictor.java delete mode 100644 lite/api/android/jni/src/com/baidu/paddle/lite/Place.java delete mode 100644 lite/api/android/jni/src/com/baidu/paddle/lite/PowerMode.java delete mode 100644 lite/api/android/jni/src/com/baidu/paddle/lite/Tensor.java delete mode 100644 lite/api/android/jni/test/com/baidu/paddle/lite/PaddlePredictorTest.java delete mode 100644 lite/api/apis_test.cc delete mode 100644 lite/api/benchmark.cc delete mode 100644 lite/api/cxx_api.cc delete mode 100644 lite/api/cxx_api.h delete mode 100644 lite/api/cxx_api_bin.cc delete mode 100644 lite/api/cxx_api_impl.cc delete mode 100644 lite/api/cxx_api_test.cc delete mode 100644 lite/api/detection_model_test.cc delete mode 100644 lite/api/efficientnet_b0_test.cc delete mode 100644 lite/api/inceptionv4_test.cc delete mode 100644 lite/api/light_api.cc delete mode 100644 lite/api/light_api.h delete mode 100644 lite/api/light_api_impl.cc delete mode 100644 lite/api/light_api_test.cc delete mode 
100644 lite/api/lite_api_test_helper.cc delete mode 100644 lite/api/lite_api_test_helper.h delete mode 100644 lite/api/mobilenetv1_int8_test.cc delete mode 100644 lite/api/mobilenetv1_ssd_test.cc delete mode 100644 lite/api/mobilenetv1_test.cc delete mode 100644 lite/api/mobilenetv1_yolov3_test.cc delete mode 100644 lite/api/mobilenetv2_test.cc delete mode 100644 lite/api/model_optimize_tool.cc delete mode 100644 lite/api/model_run_test_image.cc delete mode 100644 lite/api/model_test.cc delete mode 100644 lite/api/ocr_attention_test.cc delete mode 100644 lite/api/paddle_api.cc delete mode 100644 lite/api/paddle_api.h delete mode 100644 lite/api/paddle_api_test.cc delete mode 100644 lite/api/paddle_lite_factory_helper.h delete mode 100644 lite/api/paddle_place.cc delete mode 100644 lite/api/paddle_place.h delete mode 100644 lite/api/paddle_use_passes.h delete mode 100644 lite/api/resnet18_test.cc delete mode 100644 lite/api/resnet50_test.cc delete mode 100644 lite/api/resnet50_test_fpga.cc delete mode 100644 lite/api/shufflenetv2_test.cc delete mode 100644 lite/api/test_googlenet_lite.cc delete mode 100644 lite/api/test_helper.h delete mode 100644 lite/api/test_inceptionv4_lite_x86.cc delete mode 100644 lite/api/test_mobilenetv1_lite_x86.cc delete mode 100644 lite/api/test_mobilenetv2_lite_x86.cc delete mode 100644 lite/api/unet_test.cc delete mode 100644 lite/backends/CMakeLists.txt delete mode 100644 lite/backends/arm/CMakeLists.txt delete mode 100644 lite/backends/arm/math/CMakeLists.txt delete mode 100644 lite/backends/arm/math/activation.cc delete mode 100644 lite/backends/arm/math/activation.h delete mode 100644 lite/backends/arm/math/affine_channel.cc delete mode 100644 lite/backends/arm/math/affine_channel.h delete mode 100644 lite/backends/arm/math/anchor_generator.cc delete mode 100644 lite/backends/arm/math/anchor_generator.h delete mode 100644 lite/backends/arm/math/argmax.cc delete mode 100644 lite/backends/arm/math/argmax.h delete mode 100644 lite/backends/arm/math/axpy.cc delete mode 100644 lite/backends/arm/math/axpy.h delete mode 100644 lite/backends/arm/math/beam_search.cc delete mode 100644 lite/backends/arm/math/beam_search.h delete mode 100644 lite/backends/arm/math/box_coder.cc delete mode 100644 lite/backends/arm/math/box_coder.h delete mode 100644 lite/backends/arm/math/col_im_transform.cc delete mode 100644 lite/backends/arm/math/col_im_transform.h delete mode 100644 lite/backends/arm/math/concat.cc delete mode 100644 lite/backends/arm/math/concat.h delete mode 100644 lite/backends/arm/math/conv3x3s1_direct_int8.cc delete mode 100644 lite/backends/arm/math/conv3x3s2_direct_int8.cc delete mode 100644 lite/backends/arm/math/conv_block_utils.h delete mode 100644 lite/backends/arm/math/conv_depthwise.cc delete mode 100644 lite/backends/arm/math/conv_depthwise.h delete mode 100644 lite/backends/arm/math/conv_depthwise_3x3_int8.cc delete mode 100644 lite/backends/arm/math/conv_depthwise_3x3p0.cc delete mode 100644 lite/backends/arm/math/conv_depthwise_3x3p1.cc delete mode 100644 lite/backends/arm/math/conv_depthwise_5x5s1.cc delete mode 100644 lite/backends/arm/math/conv_depthwise_5x5s1_int8.cc delete mode 100644 lite/backends/arm/math/conv_depthwise_5x5s2.cc delete mode 100644 lite/backends/arm/math/conv_direct.cc delete mode 100644 lite/backends/arm/math/conv_direct.h delete mode 100644 lite/backends/arm/math/conv_direct_3x3s1.cc delete mode 100644 lite/backends/arm/math/conv_direct_3x3s2.cc delete mode 100644 lite/backends/arm/math/conv_gemmlike.cc delete mode 100644 
lite/backends/arm/math/conv_gemmlike.h delete mode 100644 lite/backends/arm/math/conv_impl.cc delete mode 100644 lite/backends/arm/math/conv_impl.h delete mode 100644 lite/backends/arm/math/conv_winograd.cc delete mode 100644 lite/backends/arm/math/conv_winograd.h delete mode 100644 lite/backends/arm/math/conv_winograd_3x3.cc delete mode 100644 lite/backends/arm/math/decode_bboxes.cc delete mode 100644 lite/backends/arm/math/decode_bboxes.h delete mode 100644 lite/backends/arm/math/dot_toolchain_support.h delete mode 100644 lite/backends/arm/math/dropout.cc delete mode 100644 lite/backends/arm/math/dropout.h delete mode 100644 lite/backends/arm/math/elementwise.cc delete mode 100644 lite/backends/arm/math/elementwise.h delete mode 100644 lite/backends/arm/math/fill_bias_relu.cc delete mode 100644 lite/backends/arm/math/fill_bias_relu.h delete mode 100644 lite/backends/arm/math/funcs.cc delete mode 100644 lite/backends/arm/math/funcs.h delete mode 100644 lite/backends/arm/math/gemm_prepacked_int8.cc delete mode 100644 lite/backends/arm/math/gemm_prepacked_int8.h delete mode 100644 lite/backends/arm/math/gemv_arm_int8.cc delete mode 100644 lite/backends/arm/math/gemv_arm_int8.h delete mode 100644 lite/backends/arm/math/gru_utils.h delete mode 100644 lite/backends/arm/math/im2sequence.cc delete mode 100644 lite/backends/arm/math/im2sequence.h delete mode 100644 lite/backends/arm/math/increment.cc delete mode 100644 lite/backends/arm/math/increment.h delete mode 100644 lite/backends/arm/math/interpolate.cc delete mode 100644 lite/backends/arm/math/interpolate.h delete mode 100644 lite/backends/arm/math/lrn.cc delete mode 100644 lite/backends/arm/math/lrn.h delete mode 100644 lite/backends/arm/math/negative.cc delete mode 100644 lite/backends/arm/math/negative.h delete mode 100644 lite/backends/arm/math/norm.cc delete mode 100644 lite/backends/arm/math/norm.h delete mode 100644 lite/backends/arm/math/packed_sgemm.cc delete mode 100644 lite/backends/arm/math/packed_sgemm.h delete mode 100644 lite/backends/arm/math/pad2d.cc delete mode 100644 lite/backends/arm/math/pad2d.h delete mode 100644 lite/backends/arm/math/pooling.cc delete mode 100644 lite/backends/arm/math/pooling.h delete mode 100644 lite/backends/arm/math/power.cc delete mode 100644 lite/backends/arm/math/power.h delete mode 100644 lite/backends/arm/math/prior_box.cc delete mode 100644 lite/backends/arm/math/prior_box.h delete mode 100644 lite/backends/arm/math/reduce_max.cc delete mode 100644 lite/backends/arm/math/reduce_max.h delete mode 100644 lite/backends/arm/math/reduce_mean.cc delete mode 100644 lite/backends/arm/math/reduce_mean.h delete mode 100644 lite/backends/arm/math/saturate.h delete mode 100644 lite/backends/arm/math/scale.cc delete mode 100644 lite/backends/arm/math/scale.h delete mode 100644 lite/backends/arm/math/sequence2batch.h delete mode 100644 lite/backends/arm/math/sequence_expand.cc delete mode 100644 lite/backends/arm/math/sequence_expand.h delete mode 100644 lite/backends/arm/math/sequence_pool.cc delete mode 100644 lite/backends/arm/math/sequence_pool.h delete mode 100644 lite/backends/arm/math/sequence_softmax.cc delete mode 100644 lite/backends/arm/math/sequence_softmax.h delete mode 100644 lite/backends/arm/math/sgemm.cc delete mode 100644 lite/backends/arm/math/sgemm.h delete mode 100644 lite/backends/arm/math/sgemv.cc delete mode 100644 lite/backends/arm/math/sgemv.h delete mode 100644 lite/backends/arm/math/shuffle_channel.cc delete mode 100644 lite/backends/arm/math/shuffle_channel.h delete mode 
100644 lite/backends/arm/math/slice.cc delete mode 100644 lite/backends/arm/math/slice.h delete mode 100644 lite/backends/arm/math/softmax.cc delete mode 100644 lite/backends/arm/math/softmax.h delete mode 100644 lite/backends/arm/math/split.cc delete mode 100644 lite/backends/arm/math/split.h delete mode 100644 lite/backends/arm/math/stack.cc delete mode 100644 lite/backends/arm/math/stack.h delete mode 100644 lite/backends/arm/math/topk.cc delete mode 100644 lite/backends/arm/math/topk.h delete mode 100644 lite/backends/arm/math/type_trans.cc delete mode 100644 lite/backends/arm/math/type_trans.h delete mode 100644 lite/backends/arm/math/yolo_box.cc delete mode 100644 lite/backends/arm/math/yolo_box.h delete mode 100644 lite/backends/cuda/CMakeLists.txt delete mode 100644 lite/backends/cuda/blas.cc delete mode 100644 lite/backends/cuda/blas.h delete mode 100644 lite/backends/cuda/cuda_utils.h delete mode 100644 lite/backends/cuda/math/CMakeLists.txt delete mode 100644 lite/backends/cuda/math/activation.cu delete mode 100644 lite/backends/cuda/math/activation.h delete mode 100644 lite/backends/cuda/math/cudnn_conv.cc delete mode 100644 lite/backends/cuda/math/cudnn_conv.h delete mode 100644 lite/backends/cuda/math/cudnn_helper.h delete mode 100644 lite/backends/cuda/math/scale.cu delete mode 100644 lite/backends/cuda/math/scale.h delete mode 100644 lite/backends/cuda/math/transpose.cu delete mode 100644 lite/backends/cuda/math/transpose.h delete mode 100644 lite/backends/cuda/math/type_trans.cu delete mode 100644 lite/backends/cuda/math/type_trans.h delete mode 100644 lite/backends/cuda/math/utils.h delete mode 100644 lite/backends/cuda/target_wrapper.cc delete mode 100644 lite/backends/cuda/target_wrapper.h delete mode 100644 lite/backends/fpga/CMakeLists.txt delete mode 100644 lite/backends/fpga/KD/alignment.h delete mode 100644 lite/backends/fpga/KD/context.hpp delete mode 100644 lite/backends/fpga/KD/dl_engine.cpp delete mode 100644 lite/backends/fpga/KD/dl_engine.hpp delete mode 100755 lite/backends/fpga/KD/float16.hpp delete mode 100644 lite/backends/fpga/KD/fpga_cv.cpp delete mode 100644 lite/backends/fpga/KD/fpga_cv.hpp delete mode 100644 lite/backends/fpga/KD/layout.hpp delete mode 100644 lite/backends/fpga/KD/llapi/bias_scale.cpp delete mode 100644 lite/backends/fpga/KD/llapi/bias_scale.h delete mode 100755 lite/backends/fpga/KD/llapi/config.h delete mode 100644 lite/backends/fpga/KD/llapi/filter.cpp delete mode 100644 lite/backends/fpga/KD/llapi/filter.h delete mode 100644 lite/backends/fpga/KD/llapi/zynqmp_api.cpp delete mode 100644 lite/backends/fpga/KD/llapi/zynqmp_api.h delete mode 100644 lite/backends/fpga/KD/pe.hpp delete mode 100644 lite/backends/fpga/KD/pe_params.hpp delete mode 100644 lite/backends/fpga/KD/pes/batchnorm_pe.hpp delete mode 100644 lite/backends/fpga/KD/pes/concat_pe.hpp delete mode 100644 lite/backends/fpga/KD/pes/conv_pe.hpp delete mode 100644 lite/backends/fpga/KD/pes/conv_process.hpp delete mode 100644 lite/backends/fpga/KD/pes/crop_pe.cpp delete mode 100755 lite/backends/fpga/KD/pes/crop_pe.hpp delete mode 100755 lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp delete mode 100755 lite/backends/fpga/KD/pes/elementwise_add_pe.hpp delete mode 100644 lite/backends/fpga/KD/pes/fully_connected_pe.hpp delete mode 100755 lite/backends/fpga/KD/pes/input_pe.hpp delete mode 100644 lite/backends/fpga/KD/pes/norm_pe.hpp delete mode 100644 lite/backends/fpga/KD/pes/output_pe.hpp delete mode 100644 lite/backends/fpga/KD/pes/pooling_pe.hpp delete mode 100644 
lite/backends/fpga/KD/pes/prior_box_pe.cpp delete mode 100755 lite/backends/fpga/KD/pes/prior_box_pe.hpp delete mode 100755 lite/backends/fpga/KD/pes/relu_pe.hpp delete mode 100644 lite/backends/fpga/KD/pes/resize.hpp delete mode 100755 lite/backends/fpga/KD/pes/scale_pe.hpp delete mode 100755 lite/backends/fpga/KD/pes/softmax_pe.cpp delete mode 100644 lite/backends/fpga/KD/pes/softmax_pe.hpp delete mode 100644 lite/backends/fpga/KD/pes/split_pe.hpp delete mode 100755 lite/backends/fpga/KD/shape.hpp delete mode 100644 lite/backends/fpga/KD/tensor.hpp delete mode 100644 lite/backends/fpga/KD/tensor_util.cpp delete mode 100644 lite/backends/fpga/KD/tensor_util.hpp delete mode 100644 lite/backends/fpga/lite_tensor.cc delete mode 100644 lite/backends/fpga/lite_tensor.h delete mode 100644 lite/backends/fpga/target_wrapper.cc delete mode 100644 lite/backends/host/CMakeLists.txt delete mode 100644 lite/backends/host/target_wrapper.cc delete mode 100644 lite/backends/npu/CMakeLists.txt delete mode 100644 lite/backends/npu/bridge/CMakeLists.txt delete mode 100644 lite/backends/npu/bridge/act_op.cc delete mode 100644 lite/backends/npu/bridge/act_op_test.cc delete mode 100644 lite/backends/npu/bridge/batch_norm_op.cc delete mode 100644 lite/backends/npu/bridge/batch_norm_op_test.cc delete mode 100644 lite/backends/npu/bridge/concat_op.cc delete mode 100644 lite/backends/npu/bridge/concat_op_test.cc delete mode 100644 lite/backends/npu/bridge/conv_op.cc delete mode 100644 lite/backends/npu/bridge/conv_op_test.cc delete mode 100644 lite/backends/npu/bridge/conv_transpose_op.cc delete mode 100644 lite/backends/npu/bridge/conv_transpose_op_test.cc delete mode 100644 lite/backends/npu/bridge/elementwise_ops.cc delete mode 100644 lite/backends/npu/bridge/elementwise_ops_test.cc delete mode 100644 lite/backends/npu/bridge/fc_op.cc delete mode 100644 lite/backends/npu/bridge/fc_op_test.cc delete mode 100644 lite/backends/npu/bridge/interpolate_op.cc delete mode 100644 lite/backends/npu/bridge/interpolate_op_test.cc delete mode 100644 lite/backends/npu/bridge/mul_op.cc delete mode 100644 lite/backends/npu/bridge/mul_op_test.cc delete mode 100644 lite/backends/npu/bridge/pad2d_op.cc delete mode 100644 lite/backends/npu/bridge/pad2d_op_test.cc delete mode 100644 lite/backends/npu/bridge/paddle_use_npu_bridges.h delete mode 100644 lite/backends/npu/bridge/pool_op.cc delete mode 100644 lite/backends/npu/bridge/pool_op_test.cc delete mode 100644 lite/backends/npu/bridge/registry.cc delete mode 100644 lite/backends/npu/bridge/registry.h delete mode 100644 lite/backends/npu/bridge/reshape_op.cc delete mode 100644 lite/backends/npu/bridge/reshape_op_test.cc delete mode 100644 lite/backends/npu/bridge/scale_op.cc delete mode 100644 lite/backends/npu/bridge/scale_op_test.cc delete mode 100644 lite/backends/npu/bridge/shuffle_channel_op.cc delete mode 100644 lite/backends/npu/bridge/shuffle_channel_op_test.cc delete mode 100644 lite/backends/npu/bridge/softmax_op.cc delete mode 100644 lite/backends/npu/bridge/softmax_op_test.cc delete mode 100644 lite/backends/npu/bridge/split_op.cc delete mode 100644 lite/backends/npu/bridge/split_op_test.cc delete mode 100644 lite/backends/npu/bridge/test_helper.cc delete mode 100644 lite/backends/npu/bridge/test_helper.h delete mode 100644 lite/backends/npu/bridge/transpose_op.cc delete mode 100644 lite/backends/npu/bridge/transpose_op_test.cc delete mode 100644 lite/backends/npu/bridge/utils.cc delete mode 100644 lite/backends/npu/bridge/utils.h delete mode 100644 
lite/backends/npu/npu_helper.cc delete mode 100644 lite/backends/npu/npu_helper.h delete mode 100644 lite/backends/opencl/CMakeLists.txt delete mode 100644 lite/backends/opencl/cl_caller.cc delete mode 100644 lite/backends/opencl/cl_caller.h delete mode 100644 lite/backends/opencl/cl_context.cc delete mode 100644 lite/backends/opencl/cl_context.h delete mode 100644 lite/backends/opencl/cl_functions_test.cc delete mode 100644 lite/backends/opencl/cl_im2col_test.cc delete mode 100644 lite/backends/opencl/cl_image.cc delete mode 100644 lite/backends/opencl/cl_image.h delete mode 100644 lite/backends/opencl/cl_image_converter.cc delete mode 100644 lite/backends/opencl/cl_image_converter.h delete mode 100644 lite/backends/opencl/cl_include.h delete mode 100644 lite/backends/opencl/cl_kernel/buffer/depthwise_conv2d_kernel.cl delete mode 100644 lite/backends/opencl/cl_kernel/buffer/elementwise_add_kernel.cl delete mode 100644 lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl delete mode 100644 lite/backends/opencl/cl_kernel/buffer/im2col_kernel.cl delete mode 100644 lite/backends/opencl/cl_kernel/buffer/mat_mul_kernel.cl delete mode 100644 lite/backends/opencl/cl_kernel/buffer/pool_kernel.cl delete mode 100644 lite/backends/opencl/cl_kernel/buffer/relu_kernel.cl delete mode 100644 lite/backends/opencl/cl_kernel/cl_common.h delete mode 100644 lite/backends/opencl/cl_kernel/image/channel_add_kernel.cl delete mode 100644 lite/backends/opencl/cl_kernel/image/elementwise_add_kernel.cl delete mode 100644 lite/backends/opencl/cl_kernel/image/pool_kernel.cl delete mode 100644 lite/backends/opencl/cl_runtime.cc delete mode 100644 lite/backends/opencl/cl_runtime.h delete mode 100644 lite/backends/opencl/cl_utility.cc delete mode 100644 lite/backends/opencl/cl_utility.h delete mode 100644 lite/backends/opencl/cl_wrapper.cc delete mode 100644 lite/backends/opencl/cl_wrapper.h delete mode 100644 lite/backends/opencl/target_wrapper.cc delete mode 100644 lite/backends/opencl/target_wrapper.h delete mode 100644 lite/backends/x86/CMakeLists.txt delete mode 100644 lite/backends/x86/cpu_info.cc delete mode 100644 lite/backends/x86/cpu_info.h delete mode 100644 lite/backends/x86/cupti_lib_path.h.in delete mode 100644 lite/backends/x86/dynamic_loader.cc delete mode 100644 lite/backends/x86/dynamic_loader.h delete mode 100644 lite/backends/x86/jit/CMakeLists.txt delete mode 100644 lite/backends/x86/jit/README.en.md delete mode 100644 lite/backends/x86/jit/README.md delete mode 100644 lite/backends/x86/jit/benchmark.cc delete mode 100644 lite/backends/x86/jit/gen/CMakeLists.txt delete mode 100644 lite/backends/x86/jit/gen/act.cc delete mode 100644 lite/backends/x86/jit/gen/act.h delete mode 100644 lite/backends/x86/jit/gen/blas.cc delete mode 100644 lite/backends/x86/jit/gen/blas.h delete mode 100644 lite/backends/x86/jit/gen/embseqpool.cc delete mode 100644 lite/backends/x86/jit/gen/embseqpool.h delete mode 100644 lite/backends/x86/jit/gen/gru.cc delete mode 100644 lite/backends/x86/jit/gen/gru.h delete mode 100644 lite/backends/x86/jit/gen/hopv.cc delete mode 100644 lite/backends/x86/jit/gen/hopv.h delete mode 100644 lite/backends/x86/jit/gen/jitcode.h delete mode 100644 lite/backends/x86/jit/gen/lstm.cc delete mode 100644 lite/backends/x86/jit/gen/lstm.h delete mode 100644 lite/backends/x86/jit/gen/matmul.cc delete mode 100644 lite/backends/x86/jit/gen/matmul.h delete mode 100644 lite/backends/x86/jit/gen/seqpool.cc delete mode 100644 lite/backends/x86/jit/gen/seqpool.h delete mode 100644 
lite/backends/x86/jit/gen/sgd.cc delete mode 100644 lite/backends/x86/jit/gen/sgd.h delete mode 100644 lite/backends/x86/jit/gen/vbroadcast.cc delete mode 100644 lite/backends/x86/jit/gen/vbroadcast.h delete mode 100644 lite/backends/x86/jit/gen_base.cc delete mode 100644 lite/backends/x86/jit/gen_base.h delete mode 100644 lite/backends/x86/jit/helper.cc delete mode 100644 lite/backends/x86/jit/helper.h delete mode 100644 lite/backends/x86/jit/kernel_base.h delete mode 100644 lite/backends/x86/jit/kernel_key.cc delete mode 100644 lite/backends/x86/jit/kernel_key.h delete mode 100644 lite/backends/x86/jit/kernel_pool.cc delete mode 100644 lite/backends/x86/jit/kernel_pool.h delete mode 100644 lite/backends/x86/jit/macro.h delete mode 100644 lite/backends/x86/jit/more/CMakeLists.txt delete mode 100644 lite/backends/x86/jit/more/intrinsic/CMakeLists.txt delete mode 100644 lite/backends/x86/jit/more/intrinsic/crf_decoding.cc delete mode 100644 lite/backends/x86/jit/more/intrinsic/crf_decoding.h delete mode 100644 lite/backends/x86/jit/more/intrinsic/layer_norm.cc delete mode 100644 lite/backends/x86/jit/more/intrinsic/layer_norm.h delete mode 100644 lite/backends/x86/jit/more/mix/CMakeLists.txt delete mode 100644 lite/backends/x86/jit/more/mix/mix.cc delete mode 100644 lite/backends/x86/jit/more/mix/mix.h delete mode 100644 lite/backends/x86/jit/more/mkl/CMakeLists.txt delete mode 100644 lite/backends/x86/jit/more/mkl/mkl.cc delete mode 100644 lite/backends/x86/jit/more/mkl/mkl.h delete mode 100644 lite/backends/x86/jit/refer/CMakeLists.txt delete mode 100644 lite/backends/x86/jit/refer/refer.cc delete mode 100644 lite/backends/x86/jit/refer/refer.h delete mode 100644 lite/backends/x86/jit/registry.h delete mode 100644 lite/backends/x86/jit/test.cc delete mode 100644 lite/backends/x86/legacy_place.h delete mode 100644 lite/backends/x86/math/CMakeLists.txt delete mode 100644 lite/backends/x86/math/beam_search.cc delete mode 100644 lite/backends/x86/math/beam_search.h delete mode 100644 lite/backends/x86/math/beam_search_test.cc delete mode 100644 lite/backends/x86/math/blas.cc delete mode 100644 lite/backends/x86/math/blas.h delete mode 100644 lite/backends/x86/math/blas_impl.h delete mode 100644 lite/backends/x86/math/concat_and_split.cc delete mode 100644 lite/backends/x86/math/concat_and_split.h delete mode 100644 lite/backends/x86/math/context_project.cc delete mode 100644 lite/backends/x86/math/context_project.h delete mode 100644 lite/backends/x86/math/cos_sim_functor.cc delete mode 100644 lite/backends/x86/math/cos_sim_functor.h delete mode 100644 lite/backends/x86/math/cpu_vec.h delete mode 100644 lite/backends/x86/math/cross_entropy.cc delete mode 100644 lite/backends/x86/math/cross_entropy.h delete mode 100644 lite/backends/x86/math/detail/CMakeLists.txt delete mode 100644 lite/backends/x86/math/detail/activation_functions.h delete mode 100644 lite/backends/x86/math/detail/avx_functions.cc delete mode 100644 lite/backends/x86/math/detail/avx_mathfun.h delete mode 100644 lite/backends/x86/math/detail/gru_cpu_kernel.h delete mode 100644 lite/backends/x86/math/detail/gru_kernel.h delete mode 100644 lite/backends/x86/math/gru_compute.cc delete mode 100644 lite/backends/x86/math/gru_compute.h delete mode 100644 lite/backends/x86/math/im2col.cc delete mode 100644 lite/backends/x86/math/im2col.h delete mode 100644 lite/backends/x86/math/im2col_cfo_cpu.h delete mode 100644 lite/backends/x86/math/im2col_test.cc delete mode 100644 lite/backends/x86/math/math_function.cc delete mode 100644 
lite/backends/x86/math/math_function.h delete mode 100644 lite/backends/x86/math/math_function_impl.h delete mode 100644 lite/backends/x86/math/math_function_test.cc delete mode 100644 lite/backends/x86/math/maxouting.cc delete mode 100644 lite/backends/x86/math/maxouting.h delete mode 100644 lite/backends/x86/math/pooling.cc delete mode 100644 lite/backends/x86/math/pooling.h delete mode 100644 lite/backends/x86/math/prelu.h delete mode 100644 lite/backends/x86/math/sample_prob.cc delete mode 100644 lite/backends/x86/math/sample_prob.h delete mode 100644 lite/backends/x86/math/sampler.cc delete mode 100644 lite/backends/x86/math/sampler.h delete mode 100644 lite/backends/x86/math/sequence2batch.cc delete mode 100644 lite/backends/x86/math/sequence2batch.h delete mode 100644 lite/backends/x86/math/sequence_padding.cc delete mode 100644 lite/backends/x86/math/sequence_padding.h delete mode 100644 lite/backends/x86/math/sequence_pooling.cc delete mode 100644 lite/backends/x86/math/sequence_pooling.h delete mode 100644 lite/backends/x86/math/sequence_pooling_test.cc delete mode 100644 lite/backends/x86/math/sequence_scale.cc delete mode 100644 lite/backends/x86/math/sequence_scale.h delete mode 100644 lite/backends/x86/math/softmax.cc delete mode 100644 lite/backends/x86/math/softmax.h delete mode 100644 lite/backends/x86/math/softmax_impl.h delete mode 100644 lite/backends/x86/math/tree2col.cc delete mode 100644 lite/backends/x86/math/tree2col.h delete mode 100644 lite/backends/x86/math/unpooling.cc delete mode 100644 lite/backends/x86/math/unpooling.h delete mode 100644 lite/backends/x86/math/vol2col.cc delete mode 100644 lite/backends/x86/math/vol2col.h delete mode 100644 lite/backends/x86/mklml.cc delete mode 100644 lite/backends/x86/mklml.h delete mode 100644 lite/backends/x86/port.h delete mode 100644 lite/backends/x86/target_wrapper.cc delete mode 100644 lite/backends/x86/target_wrapper.h delete mode 100644 lite/backends/x86/warpctc_lib_path.h.in delete mode 100644 lite/core/CMakeLists.txt delete mode 100644 lite/core/arena/CMakeLists.txt delete mode 100644 lite/core/arena/framework.cc delete mode 100644 lite/core/arena/framework.h delete mode 100644 lite/core/arena/framework_test.cc delete mode 100644 lite/core/context.cc delete mode 100644 lite/core/context.h delete mode 100644 lite/core/context_test.cc delete mode 100644 lite/core/device_info.cc delete mode 100644 lite/core/device_info.h delete mode 100644 lite/core/framework.proto delete mode 100644 lite/core/kernel.cc delete mode 100644 lite/core/kernel.h delete mode 100644 lite/core/kernel_test.cc delete mode 100644 lite/core/lite.map delete mode 100644 lite/core/lite_gtest_main.cc delete mode 100644 lite/core/lite_tensor_test.cc delete mode 100644 lite/core/memory.cc delete mode 100644 lite/core/memory.h delete mode 100644 lite/core/memory_test.cc delete mode 100644 lite/core/mir/CMakeLists.txt delete mode 100644 lite/core/mir/argument_type_display_pass.cc delete mode 100644 lite/core/mir/demo_pass.cc delete mode 100644 lite/core/mir/dot.h delete mode 100644 lite/core/mir/elimination/CMakeLists.txt delete mode 100644 lite/core/mir/elimination/identity_scale_eliminate_pass.cc delete mode 100644 lite/core/mir/elimination/identity_scale_eliminate_pass_test.cc delete mode 100644 lite/core/mir/fusion/CMakeLists.txt delete mode 100644 lite/core/mir/fusion/conv_activation_fuse_pass.cc delete mode 100644 lite/core/mir/fusion/conv_activation_fuse_pass.h delete mode 100644 lite/core/mir/fusion/conv_activation_fuser.cc delete mode 100644 
lite/core/mir/fusion/conv_activation_fuser.h delete mode 100644 lite/core/mir/fusion/conv_bn_fuse_pass.cc delete mode 100644 lite/core/mir/fusion/conv_bn_fuse_pass.h delete mode 100644 lite/core/mir/fusion/conv_bn_fuse_pass_test.cc delete mode 100644 lite/core/mir/fusion/conv_bn_fuser.cc delete mode 100644 lite/core/mir/fusion/conv_bn_fuser.h delete mode 100644 lite/core/mir/fusion/conv_elementwise_add_activation_fuse_pass_test.cc delete mode 100644 lite/core/mir/fusion/conv_elementwise_fuse_pass.cc delete mode 100644 lite/core/mir/fusion/conv_elementwise_fuse_pass.h delete mode 100644 lite/core/mir/fusion/conv_elementwise_fuser.cc delete mode 100644 lite/core/mir/fusion/conv_elementwise_fuser.h delete mode 100644 lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc delete mode 100644 lite/core/mir/fusion/elementwise_add_activation_fuse_pass.h delete mode 100644 lite/core/mir/fusion/elementwise_add_activation_fuse_pass_test.cc delete mode 100644 lite/core/mir/fusion/elementwise_add_activation_fuser.cc delete mode 100644 lite/core/mir/fusion/elementwise_add_activation_fuser.h delete mode 100644 lite/core/mir/fusion/fc_fuse_pass.cc delete mode 100644 lite/core/mir/fusion/fc_fuse_pass.h delete mode 100644 lite/core/mir/fusion/fc_fuse_pass_test.cc delete mode 100644 lite/core/mir/fusion/fc_fuser.cc delete mode 100644 lite/core/mir/fusion/fc_fuser.h delete mode 100644 lite/core/mir/fusion/interpolate_fuse_pass.cc delete mode 100644 lite/core/mir/fusion/interpolate_fuse_pass.h delete mode 100644 lite/core/mir/fusion/interpolate_fuser.cc delete mode 100644 lite/core/mir/fusion/interpolate_fuser.h delete mode 100644 lite/core/mir/fusion/quant_dequant_fuse_pass.cc delete mode 100644 lite/core/mir/fusion/quant_dequant_fuse_pass.h delete mode 100644 lite/core/mir/fusion/quant_dequant_op_fuser.cc delete mode 100644 lite/core/mir/fusion/quant_dequant_op_fuser.h delete mode 100644 lite/core/mir/fusion/shuffle_channel_fuse_pass.cc delete mode 100644 lite/core/mir/fusion/shuffle_channel_fuse_pass.h delete mode 100644 lite/core/mir/fusion/shuffle_channel_fuser.cc delete mode 100644 lite/core/mir/fusion/shuffle_channel_fuser.h delete mode 100644 lite/core/mir/fusion/transpose_softmax_transpose_fuse_pass.cc delete mode 100644 lite/core/mir/fusion/transpose_softmax_transpose_fuse_pass.h delete mode 100644 lite/core/mir/fusion/transpose_softmax_transpose_fuser.cc delete mode 100644 lite/core/mir/fusion/transpose_softmax_transpose_fuser.h delete mode 100644 lite/core/mir/generate_program_pass.cc delete mode 100644 lite/core/mir/generate_program_pass.h delete mode 100644 lite/core/mir/graph_visualize_pass.cc delete mode 100644 lite/core/mir/graph_visualize_pass.h delete mode 100644 lite/core/mir/io_copy_kernel_pick_pass.cc delete mode 100644 lite/core/mir/node.cc delete mode 100644 lite/core/mir/node.h delete mode 100644 lite/core/mir/pass.cc delete mode 100644 lite/core/mir/pass.h delete mode 100644 lite/core/mir/pass_manager.cc delete mode 100644 lite/core/mir/pass_manager.h delete mode 100644 lite/core/mir/pass_manager_test.cc delete mode 100644 lite/core/mir/pass_registry.cc delete mode 100644 lite/core/mir/pass_registry.h delete mode 100644 lite/core/mir/pattern_matcher.cc delete mode 100644 lite/core/mir/pattern_matcher.h delete mode 100644 lite/core/mir/pattern_matcher_high_api.cc delete mode 100644 lite/core/mir/pattern_matcher_high_api.h delete mode 100644 lite/core/mir/pattern_matcher_high_api_test.cc delete mode 100644 lite/core/mir/pattern_matcher_test.cc delete mode 100644 
lite/core/mir/pattern_matcher_tester.cc delete mode 100644 lite/core/mir/runtime_context_assign_pass.cc delete mode 100644 lite/core/mir/ssa_graph.cc delete mode 100644 lite/core/mir/ssa_graph.h delete mode 100644 lite/core/mir/ssa_graph_test.cc delete mode 100644 lite/core/mir/static_kernel_pick_pass.cc delete mode 100644 lite/core/mir/static_kernel_pick_pass.h delete mode 100644 lite/core/mir/subgraph/CMakeLists.txt delete mode 100644 lite/core/mir/subgraph/generate_npu_program_pass.cc delete mode 100644 lite/core/mir/subgraph/generate_npu_program_pass.h delete mode 100644 lite/core/mir/subgraph/generate_npu_program_pass_test.cc delete mode 100644 lite/core/mir/subgraph/subgraph_program_pass.cc delete mode 100644 lite/core/mir/subgraph/subgraph_program_pass.h delete mode 100644 lite/core/mir/subgraph/subgraph_program_pass_test.cc delete mode 100644 lite/core/mir/type_layout_cast_pass.cc delete mode 100644 lite/core/mir/type_layout_cast_pass.h delete mode 100644 lite/core/mir/type_precision_cast_pass.cc delete mode 100644 lite/core/mir/type_precision_cast_pass.h delete mode 100644 lite/core/mir/type_target_cast_pass.cc delete mode 100644 lite/core/mir/type_target_cast_pass.h delete mode 100644 lite/core/mir/variable_place_inference_pass.cc delete mode 100644 lite/core/mir/variable_place_inference_pass.h delete mode 100644 lite/core/mir/variable_place_inference_pass_test.cc delete mode 100644 lite/core/naive_test_model.py delete mode 100644 lite/core/op_lite.cc delete mode 100644 lite/core/op_lite.h delete mode 100644 lite/core/op_lite_test.cc delete mode 100644 lite/core/op_registry.cc delete mode 100644 lite/core/op_registry.h delete mode 100644 lite/core/optimizer.cc delete mode 100644 lite/core/optimizer.h delete mode 100644 lite/core/optimizer_test.cc delete mode 100644 lite/core/profile/CMakeLists.txt delete mode 100644 lite/core/profile/basic_profiler.cc delete mode 100644 lite/core/profile/basic_profiler.h delete mode 100644 lite/core/profile/basic_profiler_test.cc delete mode 100644 lite/core/profile/precision_profiler.h delete mode 100644 lite/core/program.cc delete mode 100644 lite/core/program.h delete mode 100644 lite/core/program_fake_utils.cc delete mode 100644 lite/core/program_fake_utils.h delete mode 100644 lite/core/scope.cc delete mode 100644 lite/core/scope.h delete mode 100644 lite/core/scope_test.cc delete mode 100644 lite/core/target_wrapper.cc delete mode 100644 lite/core/target_wrapper.h delete mode 100644 lite/core/tensor.cc delete mode 100644 lite/core/tensor.h delete mode 100644 lite/core/type_system.cc delete mode 100644 lite/core/type_system.h delete mode 100644 lite/core/type_system_test.cc delete mode 100644 lite/core/types.cc delete mode 100644 lite/core/types.h delete mode 100644 lite/core/types_test.cc delete mode 100644 lite/core/variable.cc delete mode 100644 lite/core/variable.h delete mode 100644 lite/core/workspace.cc delete mode 100644 lite/core/workspace.h delete mode 100644 lite/demo/cxx/Makefile.def delete mode 100644 lite/demo/cxx/README.md delete mode 100644 lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv7 delete mode 100644 lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv8 delete mode 100644 lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv7 delete mode 100644 lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv8 delete mode 100644 lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc delete mode 100644 lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc delete mode 100644 lite/demo/java/README.md delete mode 
100644 lite/demo/java/android/PaddlePredictor/.gitignore delete mode 100644 lite/demo/java/android/PaddlePredictor/app/.gitignore delete mode 100644 lite/demo/java/android/PaddlePredictor/app/build.gradle delete mode 100644 lite/demo/java/android/PaddlePredictor/app/proguard-rules.pro delete mode 100644 lite/demo/java/android/PaddlePredictor/app/src/androidTest/java/com/baidu/paddle/lite/ExampleInstrumentedTest.java delete mode 100644 lite/demo/java/android/PaddlePredictor/app/src/main/AndroidManifest.xml delete mode 100644 lite/demo/java/android/PaddlePredictor/app/src/main/assets/README.txt delete mode 100644 lite/demo/java/android/PaddlePredictor/app/src/main/java/com/baidu/paddle/lite/MainActivity.java delete mode 100644 lite/demo/java/android/PaddlePredictor/app/src/main/res/drawable-v24/ic_launcher_foreground.xml delete mode 100644 lite/demo/java/android/PaddlePredictor/app/src/main/res/drawable/ic_launcher_background.xml delete mode 100644 lite/demo/java/android/PaddlePredictor/app/src/main/res/layout/activity_main.xml delete mode 100644 lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-anydpi-v26/ic_launcher.xml delete mode 100644 lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-anydpi-v26/ic_launcher_round.xml delete mode 100644 lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-hdpi/ic_launcher.png delete mode 100644 lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-hdpi/ic_launcher_round.png delete mode 100644 lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-mdpi/ic_launcher.png delete mode 100644 lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-mdpi/ic_launcher_round.png delete mode 100644 lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-xhdpi/ic_launcher.png delete mode 100644 lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-xhdpi/ic_launcher_round.png delete mode 100644 lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-xxhdpi/ic_launcher.png delete mode 100644 lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-xxhdpi/ic_launcher_round.png delete mode 100644 lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-xxxhdpi/ic_launcher.png delete mode 100644 lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-xxxhdpi/ic_launcher_round.png delete mode 100644 lite/demo/java/android/PaddlePredictor/app/src/main/res/values/colors.xml delete mode 100644 lite/demo/java/android/PaddlePredictor/app/src/main/res/values/strings.xml delete mode 100644 lite/demo/java/android/PaddlePredictor/app/src/main/res/values/styles.xml delete mode 100644 lite/demo/java/android/PaddlePredictor/app/src/test/java/com/baidu/paddle/lite/ExampleUnitTest.java delete mode 100644 lite/demo/java/android/PaddlePredictor/build.gradle delete mode 100644 lite/demo/java/android/PaddlePredictor/gradle.properties delete mode 100644 lite/demo/java/android/PaddlePredictor/gradle/wrapper/gradle-wrapper.jar delete mode 100644 lite/demo/java/android/PaddlePredictor/gradle/wrapper/gradle-wrapper.properties delete mode 100755 lite/demo/java/android/PaddlePredictor/gradlew delete mode 100644 lite/demo/java/android/PaddlePredictor/gradlew.bat delete mode 100644 lite/demo/java/android/PaddlePredictor/settings.gradle delete mode 100644 lite/demo/java/android/prepare_demo.bash delete mode 100644 lite/fluid/CMakeLists.txt delete mode 100644 lite/fluid/data_type.cc delete mode 100644 lite/fluid/data_type.h delete mode 100644 lite/fluid/data_type_test.cc delete mode 100644 
lite/fluid/eigen.h delete mode 100644 lite/fluid/float16.h delete mode 100644 lite/fluid/lod.h delete mode 100644 lite/fluid/math.h delete mode 100644 lite/gen_code/CMakeLists.txt delete mode 100644 lite/gen_code/gen_code.cc delete mode 100644 lite/gen_code/gen_code.h delete mode 100644 lite/gen_code/gen_code_test.cc delete mode 100644 lite/gen_code/generated_code_test.cc delete mode 100644 lite/gen_code/paddle_code_generator.cc delete mode 100644 lite/gen_code/paddle_infer.cc delete mode 100644 lite/gen_code/paddle_infer.h delete mode 100644 lite/kernels/CMakeLists.txt delete mode 100644 lite/kernels/arm/CMakeLists.txt delete mode 100644 lite/kernels/arm/activation_compute.cc delete mode 100644 lite/kernels/arm/activation_compute.h delete mode 100644 lite/kernels/arm/affine_channel_compute.cc delete mode 100644 lite/kernels/arm/affine_channel_compute.h delete mode 100644 lite/kernels/arm/anchor_generator_compute.cc delete mode 100644 lite/kernels/arm/anchor_generator_compute.h delete mode 100644 lite/kernels/arm/argmax_compute.cc delete mode 100644 lite/kernels/arm/argmax_compute.h delete mode 100644 lite/kernels/arm/argmax_compute_test.cc delete mode 100644 lite/kernels/arm/assign_compute.cc delete mode 100644 lite/kernels/arm/assign_compute.h delete mode 100644 lite/kernels/arm/assign_value_compute.cc delete mode 100644 lite/kernels/arm/assign_value_compute.h delete mode 100644 lite/kernels/arm/axpy_compute.cc delete mode 100644 lite/kernels/arm/axpy_compute.h delete mode 100644 lite/kernels/arm/axpy_compute_test.cc delete mode 100644 lite/kernels/arm/batch_norm_compute.cc delete mode 100644 lite/kernels/arm/batch_norm_compute.h delete mode 100644 lite/kernels/arm/batch_norm_compute_test.cc delete mode 100644 lite/kernels/arm/beam_search_compute.cc delete mode 100644 lite/kernels/arm/beam_search_compute.h delete mode 100644 lite/kernels/arm/beam_search_decode_compute.cc delete mode 100644 lite/kernels/arm/beam_search_decode_compute.h delete mode 100644 lite/kernels/arm/box_clip_compute.cc delete mode 100644 lite/kernels/arm/box_clip_compute.h delete mode 100644 lite/kernels/arm/box_coder_compute.cc delete mode 100644 lite/kernels/arm/box_coder_compute.h delete mode 100644 lite/kernels/arm/calib_compute.cc delete mode 100644 lite/kernels/arm/calib_compute.h delete mode 100644 lite/kernels/arm/calib_compute_test.cc delete mode 100644 lite/kernels/arm/cast_compute.cc delete mode 100644 lite/kernels/arm/cast_compute.h delete mode 100644 lite/kernels/arm/compare_compute.cc delete mode 100644 lite/kernels/arm/compare_compute.h delete mode 100644 lite/kernels/arm/concat_compute.cc delete mode 100644 lite/kernels/arm/concat_compute.h delete mode 100644 lite/kernels/arm/concat_compute_test.cc delete mode 100644 lite/kernels/arm/conv_compute.cc delete mode 100644 lite/kernels/arm/conv_compute.h delete mode 100644 lite/kernels/arm/conv_compute_test.cc delete mode 100644 lite/kernels/arm/conv_transpose_compute.cc delete mode 100644 lite/kernels/arm/conv_transpose_compute.h delete mode 100644 lite/kernels/arm/conv_transpose_compute_test.cc delete mode 100644 lite/kernels/arm/crop_compute.cc delete mode 100644 lite/kernels/arm/crop_compute.h delete mode 100644 lite/kernels/arm/decode_bboxes_compute.cc delete mode 100644 lite/kernels/arm/decode_bboxes_compute.h delete mode 100644 lite/kernels/arm/decode_bboxes_compute_test.cc delete mode 100644 lite/kernels/arm/density_prior_box_compute.cc delete mode 100644 lite/kernels/arm/density_prior_box_compute.h delete mode 100644 
lite/kernels/arm/dropout_compute.cc delete mode 100644 lite/kernels/arm/dropout_compute.h delete mode 100644 lite/kernels/arm/dropout_compute_test.cc delete mode 100644 lite/kernels/arm/elementwise_compute.cc delete mode 100644 lite/kernels/arm/elementwise_compute.h delete mode 100644 lite/kernels/arm/elementwise_compute_test.cc delete mode 100644 lite/kernels/arm/expand_compute.cc delete mode 100644 lite/kernels/arm/expand_compute.h delete mode 100644 lite/kernels/arm/fc_compute.cc delete mode 100644 lite/kernels/arm/fc_compute.h delete mode 100644 lite/kernels/arm/fc_compute_test.cc delete mode 100644 lite/kernels/arm/fill_constant_compute.cc delete mode 100644 lite/kernels/arm/generate_proposals_compute.cc delete mode 100644 lite/kernels/arm/generate_proposals_compute.h delete mode 100644 lite/kernels/arm/gru_compute.cc delete mode 100644 lite/kernels/arm/gru_compute.h delete mode 100644 lite/kernels/arm/gru_unit_compute.cc delete mode 100644 lite/kernels/arm/gru_unit_compute.h delete mode 100644 lite/kernels/arm/im2sequence_compute.cc delete mode 100644 lite/kernels/arm/im2sequence_compute.h delete mode 100644 lite/kernels/arm/increment_compute.cc delete mode 100644 lite/kernels/arm/increment_compute.h delete mode 100644 lite/kernels/arm/interpolate_compute.cc delete mode 100644 lite/kernels/arm/interpolate_compute.h delete mode 100644 lite/kernels/arm/is_empty_compute.cc delete mode 100644 lite/kernels/arm/is_empty_compute.h delete mode 100644 lite/kernels/arm/lod_reset_compute.cc delete mode 100644 lite/kernels/arm/lod_reset_compute.h delete mode 100644 lite/kernels/arm/logical_compute.cc delete mode 100644 lite/kernels/arm/logical_compute.h delete mode 100644 lite/kernels/arm/lookup_table_compute.cc delete mode 100644 lite/kernels/arm/lookup_table_compute.h delete mode 100644 lite/kernels/arm/lrn_compute.cc delete mode 100644 lite/kernels/arm/lrn_compute.h delete mode 100644 lite/kernels/arm/lrn_compute_test.cc delete mode 100644 lite/kernels/arm/matmul_compute.cc delete mode 100644 lite/kernels/arm/matmul_compute.h delete mode 100644 lite/kernels/arm/mul_compute.cc delete mode 100644 lite/kernels/arm/mul_compute.h delete mode 100644 lite/kernels/arm/mul_compute_test.cc delete mode 100644 lite/kernels/arm/negative_compute.cc delete mode 100644 lite/kernels/arm/negative_compute.h delete mode 100644 lite/kernels/arm/norm_compute.cc delete mode 100644 lite/kernels/arm/norm_compute.h delete mode 100644 lite/kernels/arm/pad2d_compute.cc delete mode 100644 lite/kernels/arm/pad2d_compute.h delete mode 100644 lite/kernels/arm/pool_compute.cc delete mode 100644 lite/kernels/arm/pool_compute.h delete mode 100644 lite/kernels/arm/pool_compute_test.cc delete mode 100644 lite/kernels/arm/power_compute.cc delete mode 100644 lite/kernels/arm/power_compute.h delete mode 100644 lite/kernels/arm/prior_box_compute.cc delete mode 100644 lite/kernels/arm/prior_box_compute.h delete mode 100644 lite/kernels/arm/read_from_array_compute.cc delete mode 100644 lite/kernels/arm/read_from_array_compute.h delete mode 100644 lite/kernels/arm/reduce_max_compute.cc delete mode 100644 lite/kernels/arm/reduce_max_compute.h delete mode 100644 lite/kernels/arm/reduce_mean_compute.cc delete mode 100644 lite/kernels/arm/reduce_mean_compute.h delete mode 100644 lite/kernels/arm/roi_align_compute.cc delete mode 100644 lite/kernels/arm/roi_align_compute.h delete mode 100644 lite/kernels/arm/scale_compute.cc delete mode 100644 lite/kernels/arm/scale_compute.h delete mode 100644 lite/kernels/arm/scale_compute_test.cc delete 
mode 100644 lite/kernels/arm/sequence_expand_compute.cc delete mode 100644 lite/kernels/arm/sequence_expand_compute.h delete mode 100644 lite/kernels/arm/sequence_pool_compute.cc delete mode 100644 lite/kernels/arm/sequence_pool_compute.h delete mode 100644 lite/kernels/arm/sequence_softmax_compute.cc delete mode 100644 lite/kernels/arm/sequence_softmax_compute.h delete mode 100644 lite/kernels/arm/shape_compute.cc delete mode 100644 lite/kernels/arm/shape_compute.h delete mode 100644 lite/kernels/arm/shuffle_channel_compute.cc delete mode 100644 lite/kernels/arm/shuffle_channel_compute.h delete mode 100644 lite/kernels/arm/slice_compute.cc delete mode 100644 lite/kernels/arm/slice_compute.h delete mode 100644 lite/kernels/arm/softmax_compute.cc delete mode 100644 lite/kernels/arm/softmax_compute.h delete mode 100644 lite/kernels/arm/softmax_compute_test.cc delete mode 100644 lite/kernels/arm/split_compute.cc delete mode 100644 lite/kernels/arm/split_compute.h delete mode 100644 lite/kernels/arm/split_compute_test.cc delete mode 100644 lite/kernels/arm/squeeze_compute.cc delete mode 100644 lite/kernels/arm/squeeze_compute.h delete mode 100644 lite/kernels/arm/stack_compute.cc delete mode 100644 lite/kernels/arm/stack_compute.h delete mode 100644 lite/kernels/arm/topk_compute.cc delete mode 100644 lite/kernels/arm/topk_compute.h delete mode 100644 lite/kernels/arm/transpose_compute.cc delete mode 100644 lite/kernels/arm/transpose_compute.h delete mode 100644 lite/kernels/arm/transpose_compute_test.cc delete mode 100644 lite/kernels/arm/while_compute.cc delete mode 100644 lite/kernels/arm/while_compute.h delete mode 100644 lite/kernels/arm/write_to_array_compute.cc delete mode 100644 lite/kernels/arm/write_to_array_compute.h delete mode 100644 lite/kernels/arm/yolo_box_compute.cc delete mode 100644 lite/kernels/arm/yolo_box_compute.h delete mode 100644 lite/kernels/cuda/CMakeLists.txt delete mode 100644 lite/kernels/cuda/calib_compute.cu delete mode 100644 lite/kernels/cuda/calib_compute.h delete mode 100644 lite/kernels/cuda/calib_compute_cuda_test.cc delete mode 100644 lite/kernels/cuda/concat_compute.cu delete mode 100644 lite/kernels/cuda/concat_compute.h delete mode 100644 lite/kernels/cuda/concat_compute_test.cc delete mode 100644 lite/kernels/cuda/conv_compute.cc delete mode 100644 lite/kernels/cuda/conv_compute.h delete mode 100644 lite/kernels/cuda/conv_compute_test.cc delete mode 100644 lite/kernels/cuda/elementwise_add_compute.cu delete mode 100644 lite/kernels/cuda/elementwise_add_compute.h delete mode 100644 lite/kernels/cuda/elementwise_add_compute_test.cc delete mode 100644 lite/kernels/cuda/io_copy_compute.cc delete mode 100644 lite/kernels/cuda/leaky_relu_compute.cu delete mode 100644 lite/kernels/cuda/leaky_relu_compute.h delete mode 100644 lite/kernels/cuda/leaky_relu_compute_test.cc delete mode 100644 lite/kernels/cuda/mul_compute.cc delete mode 100644 lite/kernels/cuda/mul_compute.h delete mode 100644 lite/kernels/cuda/nearest_interp_compute.cu delete mode 100644 lite/kernels/cuda/nearest_interp_compute.h delete mode 100644 lite/kernels/cuda/nearest_interp_compute_test.cc delete mode 100644 lite/kernels/cuda/transpose_compute.cu delete mode 100644 lite/kernels/cuda/transpose_compute.h delete mode 100644 lite/kernels/cuda/transpose_compute_test.cc delete mode 100644 lite/kernels/cuda/use_kernels.h delete mode 100644 lite/kernels/cuda/yolo_box_compute.cu delete mode 100644 lite/kernels/cuda/yolo_box_compute.h delete mode 100644 lite/kernels/cuda/yolo_box_compute_test.cc 
 delete mode 100644 lite/kernels/fpga/CMakeLists.txt
 delete mode 100644 lite/kernels/fpga/activation_compute.cc
 delete mode 100644 lite/kernels/fpga/activation_compute.h
 delete mode 100644 lite/kernels/fpga/activation_compute_test.cc
 delete mode 100644 lite/kernels/fpga/calib_compute.cc
 delete mode 100644 lite/kernels/fpga/calib_compute.h
 delete mode 100644 lite/kernels/fpga/conv_compute.cc
 delete mode 100644 lite/kernels/fpga/conv_compute.h
 delete mode 100644 lite/kernels/fpga/conv_compute_test.cc
 delete mode 100644 lite/kernels/fpga/elementwise_compute.cc
 delete mode 100644 lite/kernels/fpga/elementwise_compute.h
 delete mode 100644 lite/kernels/fpga/elementwise_compute_test.cc
 delete mode 100644 lite/kernels/fpga/fc_compute.cc
 delete mode 100644 lite/kernels/fpga/fc_compute.h
 delete mode 100644 lite/kernels/fpga/fc_compute_test.cc
 delete mode 100644 lite/kernels/fpga/feed_compute.cc
 delete mode 100644 lite/kernels/fpga/feed_compute.h
 delete mode 100644 lite/kernels/fpga/fetch_compute.cc
 delete mode 100644 lite/kernels/fpga/fetch_compute.h
 delete mode 100644 lite/kernels/fpga/io_copy_compute.cc
 delete mode 100644 lite/kernels/fpga/layout_compute.cc
 delete mode 100644 lite/kernels/fpga/pooling_compute.cc
 delete mode 100644 lite/kernels/fpga/pooling_compute.h
 delete mode 100644 lite/kernels/fpga/pooling_compute_test.cc
 delete mode 100644 lite/kernels/fpga/scale_compute.cc
 delete mode 100644 lite/kernels/fpga/scale_compute.h
 delete mode 100644 lite/kernels/fpga/softmax_compute.cc
 delete mode 100644 lite/kernels/fpga/softmax_compute.h
 delete mode 100644 lite/kernels/fpga/softmax_compute_test.cc
 delete mode 100644 lite/kernels/host/CMakeLists.txt
 delete mode 100644 lite/kernels/host/feed_compute.cc
 delete mode 100644 lite/kernels/host/fetch_compute.cc
 delete mode 100644 lite/kernels/host/multiclass_nms_compute.cc
 delete mode 100644 lite/kernels/host/multiclass_nms_compute.h
 delete mode 100644 lite/kernels/host/multiclass_nms_compute_test.cc
 delete mode 100644 lite/kernels/host/reshape_compute.cc
 delete mode 100644 lite/kernels/host/reshape_compute.h
 delete mode 100644 lite/kernels/host/reshape_compute_test.cc
 delete mode 100644 lite/kernels/host/use_kernels.h
 delete mode 100644 lite/kernels/npu/CMakeLists.txt
 delete mode 100644 lite/kernels/npu/graph_compute.cc
 delete mode 100644 lite/kernels/npu/graph_compute.h
 delete mode 100644 lite/kernels/opencl/CMakeLists.txt
 delete mode 100644 lite/kernels/opencl/conv_compute.cc
 delete mode 100644 lite/kernels/opencl/conv_compute.h
 delete mode 100644 lite/kernels/opencl/conv_compute_test.cc
 delete mode 100644 lite/kernels/opencl/depthwise_conv2d_compute.cc
 delete mode 100644 lite/kernels/opencl/depthwise_conv2d_compute_test.cc
 delete mode 100644 lite/kernels/opencl/elementwise_add_compute.cc
 delete mode 100644 lite/kernels/opencl/elementwise_add_compute.h
 delete mode 100644 lite/kernels/opencl/elementwise_add_compute_test.cc
 delete mode 100644 lite/kernels/opencl/fc_compute.cc
 delete mode 100644 lite/kernels/opencl/fc_compute_test.cc
 delete mode 100644 lite/kernels/opencl/fusion_elementwise_add_activation_compute.cc
 delete mode 100644 lite/kernels/opencl/io_copy_compute.cc
 delete mode 100644 lite/kernels/opencl/io_copy_compute_test.cc
 delete mode 100644 lite/kernels/opencl/mul_compute.cc
 delete mode 100644 lite/kernels/opencl/mul_compute_test.cc
 delete mode 100644 lite/kernels/opencl/pool_compute.cc
 delete mode 100644 lite/kernels/opencl/pool_compute_test.cc
 delete mode 100644 lite/kernels/opencl/relu_compute.cc
 delete mode 100644 lite/kernels/opencl/relu_compute_test.cc
 delete mode 100644 lite/kernels/x86/CMakeLists.txt
 delete mode 100644 lite/kernels/x86/activation_compute.cc
 delete mode 100644 lite/kernels/x86/batch_norm_compute.cc
 delete mode 100644 lite/kernels/x86/batch_norm_compute.h
 delete mode 100644 lite/kernels/x86/batch_norm_compute_test.cc
 delete mode 100644 lite/kernels/x86/concat_compute.cc
 delete mode 100644 lite/kernels/x86/concat_compute.h
 delete mode 100644 lite/kernels/x86/concat_compute_test.cc
 delete mode 100644 lite/kernels/x86/conv_compute.cc
 delete mode 100644 lite/kernels/x86/conv_compute.h
 delete mode 100644 lite/kernels/x86/conv_compute_test.cc
 delete mode 100644 lite/kernels/x86/dropout_compute.cc
 delete mode 100644 lite/kernels/x86/dropout_compute.h
 delete mode 100644 lite/kernels/x86/dropout_compute_test.cc
 delete mode 100644 lite/kernels/x86/elementwise_compute.cc
 delete mode 100644 lite/kernels/x86/elementwise_compute.h
 delete mode 100644 lite/kernels/x86/elementwise_compute_test.cc
 delete mode 100644 lite/kernels/x86/fc_compute.cc
 delete mode 100644 lite/kernels/x86/fc_compute.h
 delete mode 100644 lite/kernels/x86/fc_compute_test.cc
 delete mode 100644 lite/kernels/x86/fill_constant_compute.cc
 delete mode 100644 lite/kernels/x86/mean_compute.cc
 delete mode 100644 lite/kernels/x86/mul_compute.cc
 delete mode 100644 lite/kernels/x86/mul_compute.h
 delete mode 100644 lite/kernels/x86/mul_compute_test.cc
 delete mode 100644 lite/kernels/x86/pool_compute.cc
 delete mode 100644 lite/kernels/x86/pool_compute.h
 delete mode 100644 lite/kernels/x86/pool_compute_test.cc
 delete mode 100644 lite/kernels/x86/relu_compute.cc
 delete mode 100644 lite/kernels/x86/relu_compute.h
 delete mode 100644 lite/kernels/x86/relu_compute_test.cc
 delete mode 100644 lite/kernels/x86/reshape_compute.cc
 delete mode 100644 lite/kernels/x86/reshape_compute.h
 delete mode 100644 lite/kernels/x86/reshape_compute_test.cc
 delete mode 100644 lite/kernels/x86/scale_compute.cc
 delete mode 100644 lite/kernels/x86/scale_compute.h
 delete mode 100644 lite/kernels/x86/scale_compute_test.cc
 delete mode 100644 lite/kernels/x86/sequence_pool_compute.cc
 delete mode 100644 lite/kernels/x86/sequence_pool_compute.h
 delete mode 100644 lite/kernels/x86/sequence_pool_compute_test.cc
 delete mode 100644 lite/kernels/x86/sgd_compute.cc
 delete mode 100644 lite/kernels/x86/shape_compute.cc
 delete mode 100644 lite/kernels/x86/shape_compute.h
 delete mode 100644 lite/kernels/x86/shape_compute_test.cc
 delete mode 100644 lite/kernels/x86/slice_compute.cc
 delete mode 100644 lite/kernels/x86/slice_compute.h
 delete mode 100644 lite/kernels/x86/slice_compute_test.cc
 delete mode 100644 lite/kernels/x86/softmax_compute.cc
 delete mode 100644 lite/kernels/x86/softmax_compute.h
 delete mode 100644 lite/kernels/x86/softmax_compute_test.cc
 delete mode 100644 lite/kernels/x86/squeeze_compute.cc
 delete mode 100644 lite/kernels/x86/squeeze_compute.h
 delete mode 100644 lite/kernels/x86/squeeze_compute_test.cc
 delete mode 100644 lite/kernels/x86/uniform_random_compute.cc
 delete mode 100644 lite/model_parser/CMakeLists.txt
 delete mode 100644 lite/model_parser/compatible_pb.cc
 delete mode 100644 lite/model_parser/compatible_pb.h
 delete mode 100644 lite/model_parser/compatible_pb_test.cc
 delete mode 100644 lite/model_parser/cpp/CMakeLists.txt
 delete mode 100644 lite/model_parser/cpp/block_desc.cc
 delete mode 100644 lite/model_parser/cpp/block_desc.h
 delete mode 100644 lite/model_parser/cpp/op_desc.cc
 delete mode 100644 lite/model_parser/cpp/op_desc.h
 delete mode 100644 lite/model_parser/cpp/program_desc.cc
 delete mode 100644 lite/model_parser/cpp/program_desc.h
 delete mode 100644 lite/model_parser/cpp/var_desc.cc
 delete mode 100644 lite/model_parser/cpp/var_desc.h
 delete mode 100644 lite/model_parser/desc_apis.h
 delete mode 100644 lite/model_parser/model_parser.cc
 delete mode 100644 lite/model_parser/model_parser.h
 delete mode 100644 lite/model_parser/model_parser_test.cc
 delete mode 100644 lite/model_parser/naive_buffer/CMakeLists.txt
 delete mode 100644 lite/model_parser/naive_buffer/block_desc.cc
 delete mode 100644 lite/model_parser/naive_buffer/block_desc.h
 delete mode 100644 lite/model_parser/naive_buffer/combined_params_desc.cc
 delete mode 100644 lite/model_parser/naive_buffer/combined_params_desc.h
 delete mode 100644 lite/model_parser/naive_buffer/naive_buffer.cc
 delete mode 100644 lite/model_parser/naive_buffer/naive_buffer.h
 delete mode 100644 lite/model_parser/naive_buffer/naive_buffer_test.cc
 delete mode 100644 lite/model_parser/naive_buffer/naive_buffer_wrapper_helper.h
 delete mode 100644 lite/model_parser/naive_buffer/naive_buffer_wrapper_test.cc
 delete mode 100644 lite/model_parser/naive_buffer/op_desc.cc
 delete mode 100644 lite/model_parser/naive_buffer/op_desc.h
 delete mode 100644 lite/model_parser/naive_buffer/param_desc.cc
 delete mode 100644 lite/model_parser/naive_buffer/param_desc.h
 delete mode 100644 lite/model_parser/naive_buffer/program_desc.cc
 delete mode 100644 lite/model_parser/naive_buffer/program_desc.h
 delete mode 100644 lite/model_parser/naive_buffer/proto/CMakeLists.txt
 delete mode 100644 lite/model_parser/naive_buffer/proto/framework.nb.cc
 delete mode 100644 lite/model_parser/naive_buffer/proto/framework.nb.h
 delete mode 100644 lite/model_parser/naive_buffer/var_desc.cc
 delete mode 100644 lite/model_parser/naive_buffer/var_desc.h
 delete mode 100644 lite/model_parser/pb/CMakeLists.txt
 delete mode 100644 lite/model_parser/pb/block_desc.cc
 delete mode 100644 lite/model_parser/pb/block_desc.h
 delete mode 100644 lite/model_parser/pb/op_desc.cc
 delete mode 100644 lite/model_parser/pb/op_desc.h
 delete mode 100644 lite/model_parser/pb/program_desc.cc
 delete mode 100644 lite/model_parser/pb/program_desc.h
 delete mode 100644 lite/model_parser/pb/var_desc.cc
 delete mode 100644 lite/model_parser/pb/var_desc.h
 delete mode 100644 lite/model_parser/runtime.cc
 delete mode 100644 lite/model_parser/runtime.h
 delete mode 100644 lite/operators/CMakeLists.txt
 delete mode 100644 lite/operators/activation_ops.cc
 delete mode 100644 lite/operators/activation_ops.h
 delete mode 100644 lite/operators/affine_channel_op.cc
 delete mode 100644 lite/operators/affine_channel_op.h
 delete mode 100644 lite/operators/anchor_generator_op.cc
 delete mode 100644 lite/operators/anchor_generator_op.h
 delete mode 100644 lite/operators/argmax_op.cc
 delete mode 100644 lite/operators/argmax_op.h
 delete mode 100644 lite/operators/assign_op.cc
 delete mode 100644 lite/operators/assign_op.h
 delete mode 100644 lite/operators/assign_value_op.cc
 delete mode 100644 lite/operators/assign_value_op.h
 delete mode 100644 lite/operators/axpy_op.cc
 delete mode 100644 lite/operators/axpy_op.h
 delete mode 100644 lite/operators/batch_norm_op.cc
 delete mode 100644 lite/operators/batch_norm_op.h
 delete mode 100644 lite/operators/batch_norm_op_test.cc
 delete mode 100644 lite/operators/beam_search_decode_op.cc
 delete mode 100644 lite/operators/beam_search_decode_op.h
 delete mode 100644 lite/operators/beam_search_op.cc
 delete mode 100644 lite/operators/beam_search_op.h
 delete mode 100644 lite/operators/box_clip_op.cc
 delete mode 100644 lite/operators/box_clip_op.h
 delete mode 100644 lite/operators/box_coder_op.cc
 delete mode 100644 lite/operators/box_coder_op.h
 delete mode 100644 lite/operators/calib_once_op.cc
 delete mode 100644 lite/operators/calib_once_op.h
 delete mode 100644 lite/operators/calib_op.cc
 delete mode 100644 lite/operators/calib_op.h
 delete mode 100644 lite/operators/calib_op_test.cc
 delete mode 100644 lite/operators/cast_op.cc
 delete mode 100644 lite/operators/cast_op.h
 delete mode 100644 lite/operators/compare_op.cc
 delete mode 100644 lite/operators/compare_op.h
 delete mode 100644 lite/operators/concat_op.cc
 delete mode 100644 lite/operators/concat_op.h
 delete mode 100644 lite/operators/concat_op_test.cc
 delete mode 100644 lite/operators/conv_op.cc
 delete mode 100644 lite/operators/conv_op.h
 delete mode 100644 lite/operators/conv_transpose_op.cc
 delete mode 100644 lite/operators/conv_transpose_op.h
 delete mode 100644 lite/operators/crop_op.cc
 delete mode 100644 lite/operators/crop_op.h
 delete mode 100644 lite/operators/decode_bboxes_op.cc
 delete mode 100644 lite/operators/decode_bboxes_op.h
 delete mode 100644 lite/operators/density_prior_box_op.cc
 delete mode 100644 lite/operators/density_prior_box_op.h
 delete mode 100644 lite/operators/dropout_op.cc
 delete mode 100644 lite/operators/elementwise_ops.cc
 delete mode 100644 lite/operators/elementwise_ops.h
 delete mode 100644 lite/operators/expand_op.cc
 delete mode 100644 lite/operators/expand_op.h
 delete mode 100644 lite/operators/fake_dequantize_max_abs.cc
 delete mode 100644 lite/operators/fake_dequantize_max_abs.h
 delete mode 100644 lite/operators/fake_quantize_moving_avg_max_abs.cc
 delete mode 100644 lite/operators/fake_quantize_moving_avg_max_abs.h
 delete mode 100644 lite/operators/fake_quantize_range_abs_max.cc
 delete mode 100644 lite/operators/fake_quantize_range_abs_max.h
 delete mode 100644 lite/operators/fc_op.cc
 delete mode 100644 lite/operators/fc_op.h
 delete mode 100644 lite/operators/fc_op_test.cc
 delete mode 100644 lite/operators/feed_op.cc
 delete mode 100644 lite/operators/fetch_op.cc
 delete mode 100644 lite/operators/fill_constant_op.cc
 delete mode 100644 lite/operators/flatten_op.cc
 delete mode 100644 lite/operators/flatten_op.h
 delete mode 100644 lite/operators/fusion_elementwise_activation_ops.cc
 delete mode 100644 lite/operators/fusion_elementwise_activation_ops.h
 delete mode 100644 lite/operators/fusion_elementwise_activation_ops_test.cc
 delete mode 100644 lite/operators/generate_proposals_op.cc
 delete mode 100644 lite/operators/generate_proposals_op.h
 delete mode 100644 lite/operators/graph_op.cc
 delete mode 100644 lite/operators/graph_op.h
 delete mode 100644 lite/operators/gru_op.cc
 delete mode 100644 lite/operators/gru_op.h
 delete mode 100644 lite/operators/gru_unit_op.cc
 delete mode 100644 lite/operators/gru_unit_op.h
 delete mode 100644 lite/operators/im2sequence_op.cc
 delete mode 100644 lite/operators/im2sequence_op.h
 delete mode 100644 lite/operators/increment_op.cc
 delete mode 100644 lite/operators/increment_op.h
 delete mode 100644 lite/operators/interpolate_op.cc
 delete mode 100644 lite/operators/interpolate_op.h
 delete mode 100644 lite/operators/io_copy_once_op.cc
 delete mode 100644 lite/operators/io_copy_once_op.h
 delete mode 100644 lite/operators/io_copy_op.cc
 delete mode 100644 lite/operators/io_copy_op.h
 delete mode 100644 lite/operators/is_empty_op.cc
 delete mode 100644 lite/operators/is_empty_op.h
 delete mode 100644 lite/operators/layout_once_op.cc
 delete mode 100644 lite/operators/layout_once_op.h
 delete mode 100644 lite/operators/layout_op.cc
 delete mode 100644 lite/operators/layout_op.h
 delete mode 100644 lite/operators/lod_reset_op.cc
 delete mode 100644 lite/operators/lod_reset_op.h
 delete mode 100644 lite/operators/logical_op.cc
 delete mode 100644 lite/operators/logical_op.h
 delete mode 100644 lite/operators/lookup_table_op.cc
 delete mode 100644 lite/operators/lookup_table_op.h
 delete mode 100644 lite/operators/lrn_op.cc
 delete mode 100644 lite/operators/lrn_op.h
 delete mode 100644 lite/operators/matmul_op.cc
 delete mode 100644 lite/operators/matmul_op.h
 delete mode 100644 lite/operators/mean_op.cc
 delete mode 100644 lite/operators/mul_op.cc
 delete mode 100644 lite/operators/mul_op.h
 delete mode 100644 lite/operators/multiclass_nms_op.cc
 delete mode 100644 lite/operators/multiclass_nms_op.h
 delete mode 100644 lite/operators/negative_op.cc
 delete mode 100644 lite/operators/negative_op.h
 delete mode 100644 lite/operators/norm_op.cc
 delete mode 100644 lite/operators/norm_op.h
 delete mode 100644 lite/operators/op_params.cc
 delete mode 100644 lite/operators/op_params.h
 delete mode 100644 lite/operators/pad2d_op.cc
 delete mode 100644 lite/operators/pad2d_op.h
 delete mode 100644 lite/operators/pool_op.cc
 delete mode 100644 lite/operators/pool_op.h
 delete mode 100644 lite/operators/pool_op_test.cc
 delete mode 100644 lite/operators/power_op.cc
 delete mode 100644 lite/operators/power_op.h
 delete mode 100644 lite/operators/prior_box_op.cc
 delete mode 100644 lite/operators/prior_box_op.h
 delete mode 100644 lite/operators/read_from_array_op.cc
 delete mode 100644 lite/operators/read_from_array_op.h
 delete mode 100644 lite/operators/reduce_max_op.cc
 delete mode 100644 lite/operators/reduce_max_op.h
 delete mode 100644 lite/operators/reduce_mean_op.cc
 delete mode 100644 lite/operators/reduce_mean_op.h
 delete mode 100644 lite/operators/relu_op.cc
 delete mode 100644 lite/operators/relu_op.h
 delete mode 100644 lite/operators/reshape_op.cc
 delete mode 100644 lite/operators/reshape_op.h
 delete mode 100644 lite/operators/reshape_op_test.cc
 delete mode 100644 lite/operators/roi_align_op.cc
 delete mode 100644 lite/operators/roi_align_op.h
 delete mode 100644 lite/operators/scale_op.cc
 delete mode 100644 lite/operators/scale_op.h
 delete mode 100644 lite/operators/scale_op_test.cc
 delete mode 100644 lite/operators/sequence_expand_op.cc
 delete mode 100644 lite/operators/sequence_expand_op.h
 delete mode 100644 lite/operators/sequence_pool_op.cc
 delete mode 100644 lite/operators/sequence_pool_op.h
 delete mode 100644 lite/operators/sequence_softmax_op.cc
 delete mode 100644 lite/operators/sequence_softmax_op.h
 delete mode 100644 lite/operators/sgd_op.cc
 delete mode 100644 lite/operators/sgd_op.h
 delete mode 100644 lite/operators/shape_op.cc
 delete mode 100644 lite/operators/shape_op.h
 delete mode 100644 lite/operators/shuffle_channel_op.cc
 delete mode 100644 lite/operators/shuffle_channel_op.h
 delete mode 100644 lite/operators/slice_op.cc
 delete mode 100644 lite/operators/slice_op.h
 delete mode 100644 lite/operators/softmax_op.cc
 delete mode 100644 lite/operators/softmax_op.h
 delete mode 100644 lite/operators/softmax_op_test.cc
 delete mode 100644 lite/operators/split_op.cc
 delete mode 100644 lite/operators/split_op.h
 delete mode 100644 lite/operators/squeeze_op.cc
 delete mode 100644 lite/operators/squeeze_op.h
 delete mode 100644 lite/operators/stack_op.cc
 delete mode 100644 lite/operators/stack_op.h
 delete mode 100644 lite/operators/topk_op.cc
 delete mode 100644 lite/operators/topk_op.h
 delete mode 100644 lite/operators/transpose_op.cc
 delete mode 100644 lite/operators/transpose_op.h
 delete mode 100644 lite/operators/transpose_op_test.cc
 delete mode 100644 lite/operators/uniform_random_op.cc
 delete mode 100644 lite/operators/uniform_random_op.h
 delete mode 100644 lite/operators/while_op.cc
 delete mode 100644 lite/operators/while_op.h
 delete mode 100644 lite/operators/write_to_array_op.cc
 delete mode 100644 lite/operators/write_to_array_op.h
 delete mode 100644 lite/operators/yolo_box_op.cc
 delete mode 100644 lite/operators/yolo_box_op.h
 delete mode 100644 lite/tests/CMakeLists.txt
 delete mode 100644 lite/tests/README.md
 delete mode 100644 lite/tests/kernels/CMakeLists.txt
 delete mode 100644 lite/tests/kernels/activation_compute_test.cc
 delete mode 100644 lite/tests/kernels/affine_channel_compute_test.cc
 delete mode 100644 lite/tests/kernels/anchor_generator_compute_test.cc
 delete mode 100644 lite/tests/kernels/argmax_compute_test.cc
 delete mode 100644 lite/tests/kernels/assign_compute_test.cc
 delete mode 100644 lite/tests/kernels/assign_value_compute_test.cc
 delete mode 100644 lite/tests/kernels/axpy_compute_test.cc
 delete mode 100644 lite/tests/kernels/bilinear_interp_compute_test.cc
 delete mode 100644 lite/tests/kernels/box_clip_compute_test.cc
 delete mode 100644 lite/tests/kernels/box_coder_compute_test.cc
 delete mode 100644 lite/tests/kernels/cast_compute_test.cc
 delete mode 100644 lite/tests/kernels/compare_compute_test.cc
 delete mode 100644 lite/tests/kernels/conv2d_transpose_compute_test.cc
 delete mode 100644 lite/tests/kernels/crop_compute_test.cc
 delete mode 100644 lite/tests/kernels/decode_bboxes_compute_test.cc
 delete mode 100644 lite/tests/kernels/elementwise_compute_test.cc
 delete mode 100644 lite/tests/kernels/expand_compute_test.cc
 delete mode 100644 lite/tests/kernels/fc_compute_test.cc
 delete mode 100644 lite/tests/kernels/fill_data.h
 delete mode 100644 lite/tests/kernels/generate_proposals_compute_test.cc
 delete mode 100644 lite/tests/kernels/gru_unit_test.cc
 delete mode 100644 lite/tests/kernels/im2sequence_compute_test.cc
 delete mode 100644 lite/tests/kernels/increment_compute_test.cc
 delete mode 100644 lite/tests/kernels/logical_compute_test.cc
 delete mode 100644 lite/tests/kernels/lrn_compute_test.cc
 delete mode 100644 lite/tests/kernels/matmul_compute_test.cc
 delete mode 100644 lite/tests/kernels/nearest_interp_compute_test.cc
 delete mode 100644 lite/tests/kernels/negative_compute_test.cc
 delete mode 100644 lite/tests/kernels/norm_compute_test.cc
 delete mode 100644 lite/tests/kernels/pad2d_compute_test.cc
 delete mode 100644 lite/tests/kernels/power_compute_test.cc
 delete mode 100644 lite/tests/kernels/prior_box_compute_test.cc
 delete mode 100644 lite/tests/kernels/read_from_array_compute_test.cc
 delete mode 100644 lite/tests/kernels/reduce_max_compute_test.cc
 delete mode 100644 lite/tests/kernels/reduce_mean_compute_test.cc
 delete mode 100644 lite/tests/kernels/roi_align_compute_test.cc
 delete mode 100644 lite/tests/kernels/scale_compute_test.cc
 delete mode 100644 lite/tests/kernels/sequence_expand_compute_test.cc
 delete mode 100644 lite/tests/kernels/sequence_pool_compute_test.cc
 delete mode 100644 lite/tests/kernels/sequence_softmax_compute_test.cc
 delete mode 100644 lite/tests/kernels/shape_compute_test.cc
 delete mode 100644 lite/tests/kernels/shuffle_channel_compute_test.cc
 delete mode 100644 lite/tests/kernels/slice_compute_test.cc
 delete mode 100644 lite/tests/kernels/squeeze_compute_test.cc
 delete mode 100644 lite/tests/kernels/stack_compute_test.cc
 delete mode 100644 lite/tests/kernels/test_funcs.h
 delete mode 100644 lite/tests/kernels/test_sgemm.cc
 delete mode 100644 lite/tests/kernels/topk_compute_test.cc
 delete mode 100644 lite/tests/kernels/write_to_array_compute_test.cc
 delete mode 100644 lite/tests/kernels/yolo_box_compute_test.cc
 delete mode 100644 lite/tools/CMakeLists.txt
 delete mode 100644 lite/tools/Dockerfile.mobile
 delete mode 100644 lite/tools/benchmark.sh
 delete mode 100755 lite/tools/build.sh
 delete mode 100755 lite/tools/build_fpga.sh
 delete mode 100755 lite/tools/build_npu.sh
 delete mode 100755 lite/tools/ci_build.sh
 delete mode 100644 lite/tools/cmake_tools/ast.py
 delete mode 100644 lite/tools/cmake_tools/create_fake_kernel_registry.py
 delete mode 100644 lite/tools/cmake_tools/parse_kernel_registry.py
 delete mode 100644 lite/tools/cmake_tools/parse_op_registry.py
 delete mode 100644 lite/tools/cmake_tools/utils.py
 delete mode 100644 lite/tools/debug/CMakeLists.txt
 delete mode 100644 lite/tools/debug/analysis_tool.py
 delete mode 100755 lite/tools/debug/check_model.sh
 delete mode 100644 lite/tools/debug/debug_utils.cc
 delete mode 100644 lite/tools/debug/debug_utils.h
 delete mode 100644 lite/tools/debug/model_debug_tool.cc
 delete mode 100755 lite/tools/gitlab_review.sh
 delete mode 100644 lite/tools/mobile_readme.md
 delete mode 100755 lite/tools/prepare_benchmark.sh
 delete mode 100644 lite/tools/python/lite_test.py
 delete mode 100644 lite/tools/search_support_ops.py
 delete mode 100644 lite/utils/CMakeLists.txt
 delete mode 100644 lite/utils/all.h
 delete mode 100644 lite/utils/any.cc
 delete mode 100644 lite/utils/any.h
 delete mode 100644 lite/utils/check.h
 delete mode 100644 lite/utils/container.h
 delete mode 100644 lite/utils/cp_logging.cc
 delete mode 100644 lite/utils/cp_logging.h
 delete mode 100644 lite/utils/factory.h
 delete mode 100644 lite/utils/hash.h
 delete mode 100644 lite/utils/io.h
 delete mode 100644 lite/utils/logging.cc
 delete mode 100644 lite/utils/logging.h
 delete mode 100644 lite/utils/logging_test.cc
 delete mode 100644 lite/utils/macros.h
 delete mode 100644 lite/utils/paddle_enforce.h
 delete mode 100644 lite/utils/replace_stl/stream.cc
 delete mode 100644 lite/utils/replace_stl/stream.h
 delete mode 100644 lite/utils/string.cc
 delete mode 100644 lite/utils/string.h
 delete mode 100644 lite/utils/varient.h
 delete mode 100644 lite/utils/varient_test.cc
 delete mode 100644 metal/MobileNetDemo/MobileNetDemo.xcodeproj/project.pbxproj
 delete mode 100644 metal/MobileNetDemo/MobileNetDemo.xcodeproj/project.xcworkspace/contents.xcworkspacedata
 delete mode 100644 metal/MobileNetDemo/MobileNetDemo.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist
 delete mode 100644 metal/MobileNetDemo/MobileNetDemo/AppDelegate.swift
 delete mode 100644 metal/MobileNetDemo/MobileNetDemo/Assets.xcassets/AppIcon.appiconset/Contents.json
 delete mode 100644 metal/MobileNetDemo/MobileNetDemo/Assets.xcassets/Contents.json
 delete mode 100644 metal/MobileNetDemo/MobileNetDemo/Base.lproj/LaunchScreen.storyboard
 delete mode 100644 metal/MobileNetDemo/MobileNetDemo/Base.lproj/Main.storyboard
 delete mode 100644 metal/MobileNetDemo/MobileNetDemo/Info.plist
 delete mode 100644 metal/MobileNetDemo/MobileNetDemo/MobileNet.swift
 delete mode 100644 metal/MobileNetDemo/MobileNetDemo/MobilenetPreProcess.metal
 delete mode 100644 metal/MobileNetDemo/MobileNetDemo/ViewController.swift
 delete mode 100644 metal/PaddleMobileTest/PaddleMobileTest.xcodeproj/project.pbxproj
 delete mode 100644 metal/PaddleMobileTest/PaddleMobileTest.xcodeproj/project.xcworkspace/contents.xcworkspacedata
 delete mode 100644 metal/PaddleMobileTest/PaddleMobileTest.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist
 delete mode 100644 metal/PaddleMobileTest/PaddleMobileTest.xcodeproj/xcshareddata/xcschemes/PaddleMobileTest.xcscheme
 delete mode 100644 metal/PaddleMobileTest/PaddleMobileTest/AppDelegate.swift
 delete mode 100644 metal/PaddleMobileTest/PaddleMobileTest/Assets.xcassets/AppIcon.appiconset/Contents.json
 delete mode 100644 metal/PaddleMobileTest/PaddleMobileTest/Assets.xcassets/Contents.json
 delete mode 100644 metal/PaddleMobileTest/PaddleMobileTest/Base.lproj/LaunchScreen.storyboard
 delete mode 100644 metal/PaddleMobileTest/PaddleMobileTest/Base.lproj/Main.storyboard
 delete mode 100644 metal/PaddleMobileTest/PaddleMobileTest/Info.plist
 delete mode 100644 metal/PaddleMobileTest/PaddleMobileTest/TestViewController.swift
 delete mode 100644 metal/PaddleMobileTest/PaddleMobileTest/ViewController.swift
 delete mode 100644 metal/Podfile
 delete mode 100644 metal/README.md
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/project.pbxproj
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/project.xcworkspace/contents.xcworkspacedata
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/project.xcworkspace/xcuserdata/liuruilong.xcuserdatad/UserInterfaceState.xcuserstate
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/xcshareddata/xcschemes/paddle-mobile-demo.xcscheme
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/AppDelegate.swift
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/Assets.xcassets/AppIcon.appiconset/Contents.json
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/Assets.xcassets/Contents.json
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/Assets.xcassets/paddle-mobile.imageset/Contents.json
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/Assets.xcassets/paddle-mobile.imageset/paddle-mobile.png
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/Base.lproj/LaunchScreen.storyboard
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/Base.lproj/Main.storyboard
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/Info.plist
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/MetalHelper.swift
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/MultiPredictViewController.swift
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/Net/BufferToTexture.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/Net/CPUCompute.h
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/Net/CPUCompute.mm
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/Net/Genet.swift
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNet.swift
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNetCombined.swift
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNetSSD.swift
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobilenetSSD_AR.swift
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/Net/PreProcessKernel.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/Net/YoloNet.swift
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/OC/ImageTool.h
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/OC/ImageTool.m
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/OCDemo/LoadPointerViewController.h
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/OCDemo/LoadPointerViewController.m
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/OCDemo/OCDemoViewController.h
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/OCDemo/OCDemoViewController.m
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/PaddleMobileGPU.h
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/PaddleMobileGPU.m
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/SuperResolutionNet.swift
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/FPSCounter.swift
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/VideoCapture.swift
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/BatchNormKernel.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/BatchNormRelu.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/BilinearInterp.inc.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/BilinearInterp.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/BoxCoder.inc.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/BoxCoder.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/Common.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/ConcatKernel.inc.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/ConcatKernel.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/ConvAddBNReluKernel.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/ConvAddMetal.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/ConvAddPrelu.inc.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/ConvAddPreluKernel.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/ConvBNReluKernel.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/ConvKernel.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/ConvTransposeKernel.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/Elementwise.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/ElementwiseAddPreluKernel.inc.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/ElementwiseAddPreluKernel.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/FetchKernel.inc.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/FetchKernel.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/Kernels.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/Macro.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/NMSFetchResultKernel.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/PoolKernel.inc.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/PoolKernel.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/PreluKernel.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/PriorBoxKernel.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/ReluKernel.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/ReshapeKernel.inc.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/ReshapeKernel.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/ResizeBilinear.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/Shape.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/Softmax.inc.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/Softmax.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/Split.inc.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/Split.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/TransposeKernel.inc.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/metal/TransposeKernel.metal
 delete mode 100644 metal/paddle-mobile-demo/paddle-mobile-demo/paddle-mobile-demo-Bridging-Header.h
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib.xcodeproj/project.pbxproj
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib.xcodeproj/project.xcworkspace/contents.xcworkspacedata
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib.xcodeproj/xcshareddata/xcschemes/paddle-mobile-metallib.xcscheme
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/ActivationKernel.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/BatchNormKernel.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/BatchNormRelu.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/BilinearInterp.inc.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/BilinearInterp.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/BoxCoder.inc.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/BoxCoder.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/BufferToTexture.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/Common.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.inc.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddBNReluKernel.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddPrelu.inc.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddPreluKernel.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddReluMetal.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvBNReluKernel.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvTransposeKernel.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/Elementwise.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/ElementwiseAddPreluKernel.inc.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/ElementwiseAddPreluKernel.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/FetchKernel.inc.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/FetchKernel.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/Kernels.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/Macro.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/NMSFetchResultKernel.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/NearestInterpKernel.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/PoolKernel.inc.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/PoolKernel.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/PreluKernel.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/PriorBoxKernel.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/ReluKernel.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/ReshapeKernel.inc.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/ReshapeKernel.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/ResizeBilinear.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/Scale.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/ScaleKernel.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/Shape.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/SliceKernel.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/Softmax.inc.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/Softmax.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/Split.inc.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/Split.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/TransposeKernel.inc.metal
 delete mode 100644 metal/paddle-mobile-metallib/paddle-mobile-metallib/TransposeKernel.metal
 delete mode 100644 metal/paddle-mobile-unit-test/paddle-mobile-unit-test.xcodeproj/project.pbxproj
 delete mode 100644 metal/paddle-mobile-unit-test/paddle-mobile-unit-test.xcodeproj/project.xcworkspace/contents.xcworkspacedata
 delete mode 100644 metal/paddle-mobile-unit-test/paddle-mobile-unit-test.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist
 delete mode 100644 metal/paddle-mobile-unit-test/paddle-mobile-unit-test.xcodeproj/project.xcworkspace/xcuserdata/liuruilong.xcuserdatad/UserInterfaceState.xcuserstate
 delete mode 100644 metal/paddle-mobile-unit-test/paddle-mobile-unit-test/AppDelegate.swift
 delete mode 100644 metal/paddle-mobile-unit-test/paddle-mobile-unit-test/Assets.xcassets/AppIcon.appiconset/Contents.json
 delete mode 100644 metal/paddle-mobile-unit-test/paddle-mobile-unit-test/Assets.xcassets/Contents.json
 delete mode 100644 metal/paddle-mobile-unit-test/paddle-mobile-unit-test/Base.lproj/LaunchScreen.storyboard
 delete mode 100644 metal/paddle-mobile-unit-test/paddle-mobile-unit-test/Base.lproj/Main.storyboard
 delete mode 100644 metal/paddle-mobile-unit-test/paddle-mobile-unit-test/Info.plist
 delete mode 100644 metal/paddle-mobile-unit-test/paddle-mobile-unit-test/ViewController.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj
 delete mode 100644 metal/paddle-mobile/paddle-mobile.xcodeproj/project.xcworkspace/contents.xcworkspacedata
 delete mode 100644 metal/paddle-mobile/paddle-mobile.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist
 delete mode 100644 metal/paddle-mobile/paddle-mobile.xcodeproj/project.xcworkspace/xcuserdata/liuruilong.xcuserdatad/UserInterfaceState.xcuserstate
 delete mode 100644 metal/paddle-mobile/paddle-mobile.xcodeproj/xcshareddata/xcschemes/paddle-mobile.xcscheme
 delete mode 100644 metal/paddle-mobile/paddle-mobile/API/GlobalConfig.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/API/Net.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/API/Runner.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Info.plist
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Common/Errors.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Common/Extensions.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Common/MetalExtension.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Common/PaddleMobileUnitTest.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Common/Tools.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Common/Types.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Framework/Dim.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Framework/Executor.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Framework/Loader.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Framework/Tensor.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Framework/Texture.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Framework/Utils.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Base/OpCreator.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Base/OpParam.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Base/Operator.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/BatchNormOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/BilinearInterpOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/BoxcoderOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/CNNMPSConvOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/ConcatOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddAddPreluOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddBatchNormReluOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddPreluOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddReluOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/ConvBNReluOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/ConvOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/ConvReluOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/ConvTransposeOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/DepthwiseConvOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/DwConvBNReluOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/ElementwiseAddOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/ElementwiseAddPreluOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/ExpOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/FeedOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/FetchOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/FlattenOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Base/Kernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BatchNormKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BatchNormReluKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BilinearInterpKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BoxcoderKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/CNNConvKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Concat.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConcatKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddAddPreluKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddBatchNormReluKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddPreluKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddReluKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvBNReluKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvReluKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvTransposeKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ElementwiseAddKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ElementwiseAddPreluKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ExpKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/FetchKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/FlattenKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/LeakyReluKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/MulticlassNMSKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/NearestInterpKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PoolKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PreluKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PriorBoxKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Relu6Kernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ReluKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ReshapeKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ResizeBilinearKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Scale.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ScaleOpKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ShapeKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/SigmoidKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/SliceKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/SoftmaxKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/SplitKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Texture2DTo2DArrayKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/TransposeKernel.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/BatchNormRelu.metal
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ResizeBilinear.metal
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/LeakyReluOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/MulticlassNMSOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/NearestInterpOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/PoolOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/PreluOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/PriorBoxOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/Relu6Op.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/ReluOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/ReshapeOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/ResizeBilinearOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/ScaleOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/ShapeOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/SigmoidOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/SliceOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/SoftmaxOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/SplitOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Operators/TransposeOp.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Program/Attribute.swift
 delete mode 100755 metal/paddle-mobile/paddle-mobile/Src/Program/Framework.pbobjc.h
 delete mode 100755 metal/paddle-mobile/paddle-mobile/Src/Program/Framework.pbobjc.m
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Program/MemoryOptimze.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Program/PMBlockDesc.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Program/PMOpDesc.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Program/PMProgramDesc.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Program/PMVarDesc.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Program/Program.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Program/ProgramOptimize.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Program/Scope.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Program/TensorDesc.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/Src/Program/framework.pb.swift
 delete mode 100644 metal/paddle-mobile/paddle-mobile/paddle_mobile.h
 create mode 100644 mobile.md
 delete mode 100644 mobile/.clang-format
 delete mode 100644 mobile/.clang-tidy
 delete mode 100644 mobile/.gitignore
 delete mode 100644 mobile/.pre-commit-config.yaml
 delete mode 100644 mobile/.travis.yml
 delete mode 100755 mobile/.travis/pre-commit-job.sh
 delete mode 100644 mobile/CMakeLists.txt
 delete mode 100644 mobile/CONTRIBUTING.md
 delete mode 100644 mobile/Dockerfile
 delete mode 100644 mobile/LICENSE
 delete mode 100644 mobile/README.md
 delete mode 100644 mobile/benchmark/arm_benchmark.md
 delete mode 100644 mobile/benchmark/metal_benchmark.md
 delete mode 100644 mobile/demo/ReadMe.md
 delete mode 100644 mobile/demo/getDemo.sh
 delete mode 100644 mobile/doc/build.md
 delete mode 100644 mobile/doc/design_doc.md
 delete mode 100644 mobile/doc/development_android.md
 delete mode 100644 mobile/doc/development_android_GPU.md
 delete mode 100644 mobile/doc/development_arm_linux.md
 delete mode 100644 mobile/doc/development_fpga.md
 delete mode 100644 mobile/doc/development_ios.md
 delete mode 100644 mobile/doc/quantification.md
 delete mode 100644 mobile/src/common/common.h
 delete mode 100644 mobile/src/common/enforce.h
 delete mode 100644 mobile/src/common/log.h
 delete mode 100644 mobile/src/common/threadpool.h
 delete mode 100644 mobile/src/common/type_define.h
 delete mode 100755 mobile/src/common/types.cpp
 delete mode 100644 mobile/src/common/types.h
 delete mode 100644 mobile/src/common/util.cpp
 delete mode 100644 mobile/src/common/util.h
 delete mode 100644 mobile/src/common/variant.h
 delete mode 100644 mobile/src/fpga/KD/alignment.h
 delete mode 100644 mobile/src/fpga/KD/context.hpp
 delete mode 100644 mobile/src/fpga/KD/dl_engine.cpp
 delete mode 100644 mobile/src/fpga/KD/dl_engine.hpp
 delete mode 100644 mobile/src/fpga/KD/float16.hpp
 delete mode 100644 mobile/src/fpga/KD/layout.hpp
 delete mode 100644 mobile/src/fpga/KD/llapi/bias_scale.cpp
 delete mode 100644 mobile/src/fpga/KD/llapi/bias_scale.h
 delete mode 100755 mobile/src/fpga/KD/llapi/config.h
 delete mode 100644 mobile/src/fpga/KD/llapi/filter.cpp
 delete mode 100644 mobile/src/fpga/KD/llapi/filter.h
 delete mode 100644 mobile/src/fpga/KD/llapi/image.cpp
 delete mode 100644 mobile/src/fpga/KD/llapi/image.h
 delete mode 100644 mobile/src/fpga/KD/llapi/zynqmp_api.cpp
 delete mode 100644 mobile/src/fpga/KD/llapi/zynqmp_api.h
 delete mode 100644 mobile/src/fpga/KD/pe.hpp
 delete mode 100644 mobile/src/fpga/KD/pe_params.hpp
 delete mode 100644 mobile/src/fpga/KD/pes/concat_pe.hpp
 delete mode 100644 mobile/src/fpga/KD/pes/conv_pe.hpp
 delete mode 100644 mobile/src/fpga/KD/pes/conv_process.hpp
 delete mode 100644 mobile/src/fpga/KD/pes/depthwise_conv_pe.hpp
 delete mode 100644 mobile/src/fpga/KD/pes/elementwise_add_pe.hpp
 delete mode 100644 mobile/src/fpga/KD/pes/fully_connected_pe.hpp
 delete mode 100644 mobile/src/fpga/KD/pes/input_pe.hpp
 delete mode 100755 mobile/src/fpga/KD/pes/math_func_neon.h
 delete mode 100644 mobile/src/fpga/KD/pes/output_pe.hpp
 delete mode 100644 mobile/src/fpga/KD/pes/pooling_pe.hpp
 delete mode 100644 mobile/src/fpga/KD/pes/softmax_pe.cpp
 delete mode 100644 mobile/src/fpga/KD/pes/softmax_pe.hpp
 delete mode 100644 mobile/src/fpga/KD/shape.hpp
 delete mode 100644 mobile/src/fpga/KD/tensor.hpp
 delete mode 100644 mobile/src/fpga/KD/tensor_util.cpp
 delete mode 100644 mobile/src/fpga/KD/tensor_util.hpp
 delete mode 100644 mobile/src/fpga/V1/api.cpp
 delete mode 100644 mobile/src/fpga/V1/api.h
 delete mode 100644 mobile/src/fpga/V1/bias_scale.cpp
 delete mode 100755 mobile/src/fpga/V1/bias_scale.h
 delete mode 100644 mobile/src/fpga/V1/deconv_bias_scale.cpp
 delete mode 100644 mobile/src/fpga/V1/deconv_bias_scale.h
 delete mode 100644 mobile/src/fpga/V1/deconv_filter.cpp
 delete mode 100644 mobile/src/fpga/V1/deconv_filter.h
 delete mode 100644 mobile/src/fpga/V1/filter.cpp
 delete mode 100755 mobile/src/fpga/V1/filter.h
 delete mode 100644 mobile/src/fpga/V1/image.cpp
 delete mode 100644 mobile/src/fpga/V1/image.h
 delete mode 100644 mobile/src/fpga/V1/pe.cpp
 delete mode 100644 mobile/src/fpga/V2/api.cpp
 delete mode 100644 mobile/src/fpga/V2/api.h
 delete mode 100644 mobile/src/fpga/V2/bias_scale.cpp
 delete mode 100644 mobile/src/fpga/V2/bias_scale.h
 delete mode 100644 mobile/src/fpga/V2/deconv_bias_scale.cpp
 delete mode 100644 mobile/src/fpga/V2/deconv_bias_scale.h
 delete mode 100644 mobile/src/fpga/V2/deconv_filter.cpp
 delete mode 100644 mobile/src/fpga/V2/deconv_filter.h
 delete mode 100644 mobile/src/fpga/V2/filter.cpp
 delete mode 100644 mobile/src/fpga/V2/filter.h
 delete mode 100644 mobile/src/fpga/V2/image.cpp
 delete mode 100644 mobile/src/fpga/V2/image.h
 delete mode 100644 mobile/src/fpga/V2/pe.cpp
 delete mode 100644 mobile/src/fpga/common/config.h
 delete mode 100644 mobile/src/fpga/common/driver.cpp
 delete mode 100644 mobile/src/fpga/common/driver.h
 delete mode 100644 mobile/src/fpga/common/fpga_common.cpp
 delete mode 100644 mobile/src/fpga/common/fpga_common.h
 delete mode 100644 mobile/src/fpga/common/pe.h
 delete mode 100644 mobile/src/framework/CMakeLists.txt
 delete mode 100644 mobile/src/framework/attribute.cpp
 delete mode 100644 mobile/src/framework/attribute.h
 delete mode 100644 mobile/src/framework/cl/cl_deleter.h
 delete mode 100644 mobile/src/framework/cl/cl_engine.cpp
 delete mode 100644 mobile/src/framework/cl/cl_engine.h
 delete mode 100644 mobile/src/framework/cl/cl_half.cpp
 delete mode 100644 mobile/src/framework/cl/cl_half.h
 delete mode 100644 mobile/src/framework/cl/cl_helper.h
 delete mode 100644 mobile/src/framework/cl/cl_image.cpp
 delete mode 100644 mobile/src/framework/cl/cl_image.h
 delete mode 100644 mobile/src/framework/cl/cl_image_converter.cpp
 delete mode 100644 mobile/src/framework/cl/cl_image_converter.h
 delete mode 100644 mobile/src/framework/cl/cl_scope.h
 delete mode 100644 mobile/src/framework/cl/cl_tensor.h
 delete mode 100644 mobile/src/framework/cl/cl_tool.cpp
 delete mode 100644 mobile/src/framework/cl/cl_tool.h
 delete mode 100644 mobile/src/framework/context.cpp
 delete mode 100644 mobile/src/framework/context.h
 delete mode 100644 mobile/src/framework/data_layout.h
 delete mode 100644 mobile/src/framework/data_type.cpp
 delete mode 100644 mobile/src/framework/data_type.h
 delete mode 100644 mobile/src/framework/ddim.cpp
 delete mode 100644 mobile/src/framework/ddim.h
 delete mode 100644 mobile/src/framework/dim.h
 delete mode 100644 mobile/src/framework/executor.cpp
 delete mode 100644 mobile/src/framework/executor.h
 delete mode 100644 mobile/src/framework/framework.pb-c.cpp
 delete mode 100644 mobile/src/framework/framework.pb-c.h
 delete mode 100644 mobile/src/framework/framework.proto
 delete mode 100755 mobile/src/framework/load_ops.h
 delete mode 100644 mobile/src/framework/loader.cpp
 delete mode 100644 mobile/src/framework/loader.h
 delete mode 100644 mobile/src/framework/lod_tensor.cpp
 delete mode 100644 mobile/src/framework/lod_tensor.h
 delete mode 100644 mobile/src/framework/mixed_vector.h
 delete mode 100644 mobile/src/framework/op_info.h
 delete mode 100644 mobile/src/framework/op_kernel_type.h
 delete mode 100644 mobile/src/framework/op_proto_maker.h
 delete mode 100644 mobile/src/framework/op_registry.h
 delete mode 100644 mobile/src/framework/operator.cpp
 delete mode 100644 mobile/src/framework/operator.h
 delete mode 100644 mobile/src/framework/program/block_desc.cpp
 delete mode 100644 mobile/src/framework/program/block_desc.h
 delete mode 100644 mobile/src/framework/program/op_desc.cpp
 delete mode 100644 mobile/src/framework/program/op_desc.h
 delete mode 100644 mobile/src/framework/program/program-optimize/fusion_op_register.h
 delete mode 100644 mobile/src/framework/program/program-optimize/node.cpp
 delete mode 100644 mobile/src/framework/program/program-optimize/node.h
 delete mode 100644 mobile/src/framework/program/program-optimize/program_optimize.cpp
 delete mode 100644 mobile/src/framework/program/program-optimize/program_optimize.h
 delete mode 100644 mobile/src/framework/program/program.h
 delete mode 100644 mobile/src/framework/program/program_desc.cpp
 delete mode 100644 mobile/src/framework/program/program_desc.h
 delete mode 100644 mobile/src/framework/program/tensor_desc.h
 delete mode 100644 mobile/src/framework/program/var_desc.h
 delete mode 100644 mobile/src/framework/scope.cpp
 delete mode 100644 mobile/src/framework/scope.h
 delete mode 100644 mobile/src/framework/selected_rows.cpp
 delete mode 100644 mobile/src/framework/selected_rows.h
 delete mode 100644 mobile/src/framework/tensor.h
 delete mode 100644 mobile/src/framework/tensor_base.h
 delete mode 100644 mobile/src/framework/tensor_util.cpp
 delete mode 100644 mobile/src/framework/tensor_util.h
 delete mode 100644 mobile/src/framework/type_trait.h
 delete mode 100644 mobile/src/framework/variable.h
 delete mode 100644 mobile/src/framework/zynqmp/ztensor.hpp
 delete mode 100644 mobile/src/io/api.cc
 delete mode 100644 mobile/src/io/api_paddle_mobile.cc
 delete mode 100644 mobile/src/io/api_paddle_mobile.h
 delete mode 100644 mobile/src/io/ios_io/PaddleMobileCPU.h
 delete mode 100644 mobile/src/io/ios_io/PaddleMobileCPU.mm
 delete mode 100644 mobile/src/io/jni/PML.java
 delete mode 100644 mobile/src/io/jni/paddle_mobile_jni.cpp
 delete mode 100644 mobile/src/io/jni/paddle_mobile_jni.h
 delete mode 100644 mobile/src/io/loader.h
 delete mode 100644 mobile/src/io/opencl_interface.cpp
 delete mode 100644 mobile/src/io/opencl_interface.h
 delete mode 100644 mobile/src/io/paddle_inference_api.h
 delete mode 100644 mobile/src/io/paddle_mobile.cpp
 delete mode 100644 mobile/src/io/paddle_mobile.h
 delete mode 100644 mobile/src/io/paddle_mobile_wrap.cpp
 delete mode 100644 mobile/src/io/paddle_mobile_wrap.h
 delete mode 100644 mobile/src/io/paddle_test_inference_api.cpp
 delete mode 100644 mobile/src/io/paddle_test_inference_api.h
 delete mode 100755 mobile/src/memory/t_malloc.cpp
 delete mode 100644 mobile/src/memory/t_malloc.h
 delete mode 100755 mobile/src/operators/activation_op.cpp
 delete mode 100644 mobile/src/operators/activation_op.h
 delete mode 100644 mobile/src/operators/assign_op.cpp
 delete mode 100644 mobile/src/operators/assign_op.h
 delete mode 100644 mobile/src/operators/assign_value_op.cpp
 delete mode 100644 mobile/src/operators/assign_value_op.h
 delete mode 100644 mobile/src/operators/batchnorm_op.cpp
 delete mode 100644 mobile/src/operators/batchnorm_op.h
 delete mode 100644 mobile/src/operators/beam_search_decode_op.cpp
 delete mode 100644 mobile/src/operators/beam_search_decode_op.h
 delete mode 100644 mobile/src/operators/beam_search_op.cpp
 delete mode 100644 mobile/src/operators/beam_search_op.h
 delete mode 100644 mobile/src/operators/bilinear_interp_op.cpp
 delete mode 100644 mobile/src/operators/bilinear_interp_op.h
 delete mode 100644 mobile/src/operators/box_coder_op.cpp
 delete mode 100644 mobile/src/operators/box_coder_op.h
 delete mode 100644 mobile/src/operators/cast_op.cpp
 delete mode 100644 mobile/src/operators/cast_op.h
 delete mode 100644 mobile/src/operators/compare_op.cpp
 delete mode 100644 mobile/src/operators/compare_op.h
 delete mode 100644 mobile/src/operators/concat_op.cpp
 delete mode 100644 mobile/src/operators/concat_op.h
 delete mode 100644 mobile/src/operators/conditional_block_op.cpp
 delete mode 100644 mobile/src/operators/conditional_block_op.h
 delete mode 100644 mobile/src/operators/controlflow/tensor_array_read_write_op.cpp
 delete mode 100644 mobile/src/operators/controlflow/tensor_array_read_write_op.h
 delete mode 100644 mobile/src/operators/controlflow/while_op.cpp
 delete mode 100644 mobile/src/operators/controlflow/while_op.h
 delete mode 100644 mobile/src/operators/conv_op.cpp
 delete mode 100644 mobile/src/operators/conv_op.h
 delete mode 100755 mobile/src/operators/conv_transpose_op.cpp
 delete mode 100755 mobile/src/operators/conv_transpose_op.h
 delete mode 100644 mobile/src/operators/crf_op.cpp
 delete mode 100644 mobile/src/operators/crf_op.h
 delete mode 100644 mobile/src/operators/depthwise_conv_op.cpp
 delete mode 100644 mobile/src/operators/depthwise_conv_op.h
 delete mode 100644 mobile/src/operators/dequantize_op.cpp
 delete mode 100644 mobile/src/operators/dequantize_op.h
 delete mode 100644 mobile/src/operators/detection_ops.cpp
 delete mode 100644 mobile/src/operators/detection_ops.h
 delete mode 100644 mobile/src/operators/dropout_op.cpp
 delete mode 100644 mobile/src/operators/dropout_op.h
 delete mode 100644 mobile/src/operators/elementwise_add_op.cpp
 delete mode 100644 mobile/src/operators/elementwise_add_op.h
 delete mode 100644 mobile/src/operators/elementwise_mul_op.cpp
 delete mode 100644 mobile/src/operators/elementwise_mul_op.h
 delete mode 100644 mobile/src/operators/elementwise_sub_op.cpp
 delete mode 100644 mobile/src/operators/elementwise_sub_op.h
 delete mode 100644 mobile/src/operators/exp_op.cpp
 delete mode 100644 mobile/src/operators/exp_op.h
 delete mode 100644 mobile/src/operators/feed_op.cpp
 delete mode 100644 mobile/src/operators/feed_op.h
 delete mode 100644 mobile/src/operators/fetch_op.cpp
 delete mode 100644 mobile/src/operators/fetch_op.h
 delete mode 100644 mobile/src/operators/fill_constant_batch_size_like_op.cpp
 delete mode 100644 mobile/src/operators/fill_constant_batch_size_like_op.h
 delete mode 100644 mobile/src/operators/fill_constant_op.cpp
 delete mode 100644 mobile/src/operators/fill_constant_op.h
 delete mode 100644 mobile/src/operators/flatten2_op.cpp
 delete mode 100644 mobile/src/operators/flatten2_op.h
 delete mode 100644 mobile/src/operators/flatten_op.cpp
 delete mode 100644 mobile/src/operators/flatten_op.h
 delete mode 100644 mobile/src/operators/fusion_conv_add_bn_op.cpp
 delete mode 100644 mobile/src/operators/fusion_conv_add_bn_op.h
 delete mode 100644 mobile/src/operators/fusion_conv_add_bn_relu_op.cpp
 delete mode 100644 mobile/src/operators/fusion_conv_add_bn_relu_op.h
 delete mode 100644 mobile/src/operators/fusion_conv_add_op.cpp
 delete mode 100644 mobile/src/operators/fusion_conv_add_op.h
 delete mode 100644 mobile/src/operators/fusion_conv_add_relu_op.cpp
 delete mode 100644 mobile/src/operators/fusion_conv_add_relu_op.h
 delete mode 100644 mobile/src/operators/fusion_conv_bn_add_relu_op.cpp
 delete mode 100644 mobile/src/operators/fusion_conv_bn_add_relu_op.h
 delete mode 100644 mobile/src/operators/fusion_conv_bn_op.cpp
 delete mode 100644 mobile/src/operators/fusion_conv_bn_op.h
 delete mode 100644 mobile/src/operators/fusion_conv_bn_relu_op.cpp
 delete mode 100644 mobile/src/operators/fusion_conv_bn_relu_op.h
 delete mode 100644 mobile/src/operators/fusion_conv_relu_op.cpp
 delete mode 100644 mobile/src/operators/fusion_conv_relu_op.h
 delete mode 100644 mobile/src/operators/fusion_deconv_add_bn_op.cpp
 delete mode 100644 mobile/src/operators/fusion_deconv_add_bn_op.h
 delete mode 100755 mobile/src/operators/fusion_deconv_add_bn_relu_op.cpp
 delete mode 100644 mobile/src/operators/fusion_deconv_add_bn_relu_op.h
 delete mode 100644 mobile/src/operators/fusion_deconv_add_op.cpp
 delete mode 100644 mobile/src/operators/fusion_deconv_add_op.h
 delete mode 100644 mobile/src/operators/fusion_deconv_add_relu_op.cpp
 delete mode 100644 mobile/src/operators/fusion_deconv_add_relu_op.h
 delete mode 100644 mobile/src/operators/fusion_deconv_bn_relu_op.cpp
 delete mode 100644 mobile/src/operators/fusion_deconv_bn_relu_op.h
 delete mode 100644 mobile/src/operators/fusion_deconv_relu_op.cpp
 delete mode 100644 mobile/src/operators/fusion_deconv_relu_op.h
 delete mode 100644 mobile/src/operators/fusion_dequant_add_bn_op.cpp
 delete mode 100644 mobile/src/operators/fusion_dequant_add_bn_op.h
 delete mode 100644 mobile/src/operators/fusion_dequant_add_bn_relu_op.cpp
 delete mode 100644 mobile/src/operators/fusion_dequant_add_bn_relu_op.h
 delete mode 100644 mobile/src/operators/fusion_dequant_add_bn_relu_quant_op.cpp
 delete mode 100644 mobile/src/operators/fusion_dequant_add_bn_relu_quant_op.h
 delete mode 100644 mobile/src/operators/fusion_dequant_bn_op.cpp
 delete mode 100644 mobile/src/operators/fusion_dequant_bn_op.h
 delete mode 100644 mobile/src/operators/fusion_dequant_bn_relu_op.h
 delete mode 100644 mobile/src/operators/fusion_dwconv_bn_relu_op.cpp
 delete mode 100644 mobile/src/operators/fusion_dwconv_bn_relu_op.h
 delete mode 100644 mobile/src/operators/fusion_elementwise_add_relu_op.cpp
 delete mode 100644 mobile/src/operators/fusion_elementwise_add_relu_op.h
 delete mode 100644 mobile/src/operators/fusion_fc_op.cpp
 delete mode 100644 mobile/src/operators/fusion_fc_op.h
 delete mode 100644 mobile/src/operators/fusion_fc_relu_op.cpp
 delete mode 100644 mobile/src/operators/fusion_fc_relu_op.h
 delete mode 100644 mobile/src/operators/fusion_instancenorm_relu_op.cpp
 delete mode 100644 mobile/src/operators/fusion_instancenorm_relu_op.h
 delete mode 100644 mobile/src/operators/gru_op.cpp
 delete mode 100644 mobile/src/operators/gru_op.h
 delete mode 100644 mobile/src/operators/gru_unit_op.cpp
 delete mode 100644 mobile/src/operators/gru_unit_op.h
 delete mode 100644 mobile/src/operators/im2sequence_op.cpp
 delete mode 100644 mobile/src/operators/im2sequence_op.h
 delete mode 100644 mobile/src/operators/increment_op.cpp
 delete mode 100644 mobile/src/operators/increment_op.h
 delete mode 100644 mobile/src/operators/instancenorm_op.cpp
 delete mode 100644 mobile/src/operators/instancenorm_op.h
 delete mode 100644 mobile/src/operators/is_empty_op.cpp
 delete mode 100644 mobile/src/operators/is_empty_op.h
 delete mode 100644 mobile/src/operators/kernel/activation_kernel.h
 delete mode 100644 mobile/src/operators/kernel/arm/activation_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/anchor_generator_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/assign_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/assign_value_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/batchnorm_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/beam_search_decode_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/beam_search_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/bilinear_interp_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/box_coder_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/cast_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/compare_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/concat_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/conditional_block_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/convolution/conv_add_bn_relu_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/convolution/conv_add_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/convolution/conv_add_relu_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/convolution/conv_bn_add_relu_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/convolution/conv_bn_relu_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/convolution/conv_common.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/convolution/conv_common.h
 delete mode 100644 mobile/src/operators/kernel/arm/convolution/conv_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/convolution/conv_relu_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/convolution/conv_transpose_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/convolution/dwconv_bn_relu_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/crf_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/density_prior_box_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/dequantize_bn_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/dequantize_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/dropout_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/elementwise_add_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/elementwise_mul_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/elementwise_sub_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/exp_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/feed_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/fetch_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/flatten_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/fusion_fc_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/gru_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/gru_unit_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/im2sequence_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/increment_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/is_empty_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/lod_reset_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/logical_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/lookup_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/lrn_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/mul_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/multiclass_nms_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/nearest_interp_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/norm_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/one_hot_kernel.cpp
 delete mode 100755 mobile/src/operators/kernel/arm/pad2d_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/polygon_box_transform_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/pool_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/prelu_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/prior_box_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/proposal_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/psroi_pool_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/quantize_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/reshape2_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/reshape_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/resize_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/roi_perspective_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/scale_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/sequence_expand_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/sequence_pool_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/sequence_softmax_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/shape_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/slice_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/softmax_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/split_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/sum_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/tensor_array_read_write_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/top_k_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/transpose2_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/transpose_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/arm/while_kernel.cpp
 delete mode 100644 mobile/src/operators/kernel/assign_kernel.h
 delete mode 100644 mobile/src/operators/kernel/assign_value_kernel.h
 delete mode 100644 mobile/src/operators/kernel/batchnorm_kernel.h
 delete mode 100644 mobile/src/operators/kernel/beam_search_decode_kernel.h
 delete mode 100644 mobile/src/operators/kernel/beam_search_kernel.h
 delete mode 100644 mobile/src/operators/kernel/bilinear_interp_kernel.h
 delete mode 100644 mobile/src/operators/kernel/box_coder_kernel.h
 delete mode 100644 mobile/src/operators/kernel/central-arm-func/activation_arm_func.h
 delete mode 100644 mobile/src/operators/kernel/central-arm-func/batchnorm_arm_func.h
 delete mode 100644 mobile/src/operators/kernel/central-arm-func/bilinear_interp_arm_func.h
 delete mode 100644 mobile/src/operators/kernel/central-arm-func/box_coder_arm_func.h
 delete mode 100644 mobile/src/operators/kernel/central-arm-func/concat_arm_func.h
 delete mode 100644 mobile/src/operators/kernel/central-arm-func/conv_add_arm_func.h
 delete mode 100644 mobile/src/operators/kernel/central-arm-func/conv_add_bn_relu_arm_func.h
 delete mode 100644 mobile/src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h
 delete mode 100644 mobile/src/operators/kernel/central-arm-func/conv_arm_func.cpp
 delete mode 100644 mobile/src/operators/kernel/central-arm-func/conv_arm_func.h
 delete mode 100644 mobile/src/operators/kernel/central-arm-func/conv_bn_add_relu_arm_func.h
 delete mode 100644 mobile/src/operators/kernel/central-arm-func/conv_bn_relu_arm_func.h
 delete mode 100644 mobile/src/operators/kernel/central-arm-func/conv_transpose_arm_func.h
 delete mode 100644 mobile/src/operators/kernel/central-arm-func/crf_arm_func.h
 delete mode 100644
mobile/src/operators/kernel/central-arm-func/density_prior_box_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/dwconv_bn_relu_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/elementwise_add_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/elementwise_mul_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/elementwise_sub_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/flatten_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/fusion_fc_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/gru_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/gru_unit_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/increment_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/lookup_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/lrn_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/mul_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/multiclass_nms_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/norm_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/polygon_box_transform_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/pool_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/prior_box_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/reshape2_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/reshape_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/shape_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/softmax_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/split_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/sum_arm_func.h delete mode 100644 mobile/src/operators/kernel/central-arm-func/transpose_arm_func.h delete mode 100644 mobile/src/operators/kernel/cl/batchnorm_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/box_coder_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp delete mode 100644 mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.h delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/batchnorm_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/box_coder_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/channel_add_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/cl_common.h delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/concat_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.cl delete mode 100755 mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/conv_transpose_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/density_prior_box_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/depthwise_conv_add_bn_relu_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/depthwise_conv_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/dropout_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/elementwise_add_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/exp_kernel.cl delete mode 
100644 mobile/src/operators/kernel/cl/cl_kernel/feed_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/fetch_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/flatten2_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/instancenorm_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/leakyrelu_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/lrn_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/nearest_interp_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/pad2d_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/pool_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/prior_box_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/relu.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/relu6.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/reshape.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/scale_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/sigmoid.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/slice_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/softmax.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/tanh_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/cl_kernel/transpose_kernel.cl delete mode 100644 mobile/src/operators/kernel/cl/concat_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/conv_add_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/conv_add_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/conv_bn_add_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/conv_bn_relu_kernel.cpp delete mode 100755 mobile/src/operators/kernel/cl/conv_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/conv_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/conv_transpose_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/density_prior_box_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/depthwise_conv_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/dropout_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/dwconv_bn_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/elementwise_add_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/exp_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/feed_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/fetch_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/flatten2_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/fusion_fc_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/gen_code.py delete mode 100644 mobile/src/operators/kernel/cl/instancenorm_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/instancenorm_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/leakyrelu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/lrn_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/mul_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/multiclass_nms_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/nearest_interp_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/pad2d_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/pool_kernel.cpp delete mode 100644 
mobile/src/operators/kernel/cl/prior_box_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/relu6_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/reshape2_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/reshape_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/scale_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/sigmoid_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/slice_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/softmax_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/split_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/tanh_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/transpose2_kernel.cpp delete mode 100644 mobile/src/operators/kernel/cl/transpose_kernel.cpp delete mode 100644 mobile/src/operators/kernel/compare_kernel.h delete mode 100644 mobile/src/operators/kernel/concat_kernel.h delete mode 100644 mobile/src/operators/kernel/conditional_block_kernel.h delete mode 100644 mobile/src/operators/kernel/conv_add_bn_kernel.h delete mode 100644 mobile/src/operators/kernel/conv_add_bn_relu_kernel.h delete mode 100644 mobile/src/operators/kernel/conv_add_kernel.h delete mode 100644 mobile/src/operators/kernel/conv_add_relu_kernel.h delete mode 100644 mobile/src/operators/kernel/conv_bn_add_relu_kernel.h delete mode 100644 mobile/src/operators/kernel/conv_bn_kernel.h delete mode 100644 mobile/src/operators/kernel/conv_bn_relu_kernel.h delete mode 100644 mobile/src/operators/kernel/conv_kernel.h delete mode 100644 mobile/src/operators/kernel/conv_relu_kernel.h delete mode 100644 mobile/src/operators/kernel/conv_transpose_kernel.h delete mode 100644 mobile/src/operators/kernel/crf_kernel.h delete mode 100755 mobile/src/operators/kernel/deconv_add_bn_kernel.h delete mode 100755 mobile/src/operators/kernel/deconv_add_bn_relu_kernel.h delete mode 100644 mobile/src/operators/kernel/deconv_add_kernel.h delete mode 100644 mobile/src/operators/kernel/deconv_add_relu_kernel.h delete mode 100755 mobile/src/operators/kernel/deconv_bn_relu_kernel.h delete mode 100644 mobile/src/operators/kernel/deconv_relu_kernel.h delete mode 100644 mobile/src/operators/kernel/dequant_bn_kernel.h delete mode 100644 mobile/src/operators/kernel/dequantize_kernel.h delete mode 100644 mobile/src/operators/kernel/detection_kernel.h delete mode 100644 mobile/src/operators/kernel/dropout_kernel.h delete mode 100644 mobile/src/operators/kernel/dwconv_bn_relu_kernel.h delete mode 100644 mobile/src/operators/kernel/elementwise_add_kernel.h delete mode 100644 mobile/src/operators/kernel/elementwise_add_relu_kernel.h delete mode 100644 mobile/src/operators/kernel/elementwise_mul_kernel.h delete mode 100644 mobile/src/operators/kernel/elementwise_sub_kernel.h delete mode 100644 mobile/src/operators/kernel/exp_kernel.h delete mode 100644 mobile/src/operators/kernel/fc_relu_kernel.h delete mode 100644 mobile/src/operators/kernel/feed_kernel.h delete mode 100644 mobile/src/operators/kernel/fetch_kernel.h delete mode 100644 mobile/src/operators/kernel/flatten2_kernel.h delete mode 100644 mobile/src/operators/kernel/flatten_kernel.h delete mode 100644 mobile/src/operators/kernel/fpga/KD/conv_add_bn_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/KD/conv_add_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/KD/conv_add_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/KD/conv_bn_kernel.cpp delete 
mode 100644 mobile/src/operators/kernel/fpga/KD/conv_bn_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/KD/elementwise_add_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/KD/feed_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/KD/fetch_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/KD/fusion_fc_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/KD/pool_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/KD/softmax_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/anchor_generator_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/concat_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/conv_add_bn_kernel.cpp delete mode 100755 mobile/src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/conv_add_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/conv_add_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/conv_bn_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/conv_bn_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/conv_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/conv_transpose_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/deconv_add_bn_kernel.cpp delete mode 100755 mobile/src/operators/kernel/fpga/V1/deconv_add_bn_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/deconv_add_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/deconv_bn_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/dropout_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/elementwise_add_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/elementwise_mul_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/feed_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/fetch_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/fusion_fc_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/fusion_fc_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/pad2d_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/pool_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/proposal_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/reshape2_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/reshape_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/roialign_pool_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/sigmoid_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/slice_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/softmax_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/split_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/tanh_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V1/transpose2_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/anchor_generator_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/concat_kernel.cpp delete mode 100644 
mobile/src/operators/kernel/fpga/V2/conv_add_bn_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/conv_add_bn_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/conv_add_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/conv_add_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/conv_bn_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/conv_bn_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/conv_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/conv_transpose_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/deconv_add_bn_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/deconv_add_bn_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/deconv_add_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/deconv_add_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/deconv_bn_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/dropout_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/elementwise_mul_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/feed_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/fetch_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/fusion_fc_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/fusion_fc_relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/pool_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/proposal_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/psroi_pool_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/relu_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/reshape2_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/reshape_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/roialign_pool_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/sigmoid_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/slice_kernel.cpp delete mode 100755 mobile/src/operators/kernel/fpga/V2/softmax_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/split_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/tanh_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fpga/V2/transpose2_kernel.cpp delete mode 100644 mobile/src/operators/kernel/fusion_fc_kernel.h delete mode 100644 mobile/src/operators/kernel/gru_kernel.h delete mode 100644 mobile/src/operators/kernel/gru_unit_kernel.h delete mode 100644 mobile/src/operators/kernel/im2sequence_kernel.h delete mode 100644 mobile/src/operators/kernel/increment_kernel.h delete mode 100644 mobile/src/operators/kernel/instancenorm_kernel.h delete mode 100644 mobile/src/operators/kernel/instancenorm_relu_kernel.h delete mode 100644 mobile/src/operators/kernel/is_empty_kernel.h delete mode 100644 mobile/src/operators/kernel/kernels.h delete mode 100644 mobile/src/operators/kernel/logical_kernel.h delete mode 100644 mobile/src/operators/kernel/lookup_kernel.h delete mode 100644 mobile/src/operators/kernel/lrn_kernel.h delete mode 100644 mobile/src/operators/kernel/mul_kernel.h delete mode 100644 mobile/src/operators/kernel/multiclass_nms_kernel.h delete mode 100644 
mobile/src/operators/kernel/nearest_interp_kernel.h delete mode 100644 mobile/src/operators/kernel/norm_kernel.h delete mode 100644 mobile/src/operators/kernel/one_hot_kernel.h delete mode 100644 mobile/src/operators/kernel/pad2d_kernel.h delete mode 100644 mobile/src/operators/kernel/polygon_box_transform_kernel.h delete mode 100644 mobile/src/operators/kernel/pool_kernel.h delete mode 100644 mobile/src/operators/kernel/prelu_kernel.h delete mode 100644 mobile/src/operators/kernel/prior_box_kernel.h delete mode 100644 mobile/src/operators/kernel/quantize_kernel.h delete mode 100644 mobile/src/operators/kernel/range_kernel.cpp delete mode 100644 mobile/src/operators/kernel/range_kernel.h delete mode 100644 mobile/src/operators/kernel/reduce_prod_kernel.cpp delete mode 100644 mobile/src/operators/kernel/reduce_prod_kernel.h delete mode 100644 mobile/src/operators/kernel/reshape2_kernel.h delete mode 100644 mobile/src/operators/kernel/reshape_kernel.h delete mode 100644 mobile/src/operators/kernel/resize_kernel.h delete mode 100644 mobile/src/operators/kernel/scale_kernel.h delete mode 100644 mobile/src/operators/kernel/sequence_kernels.h delete mode 100644 mobile/src/operators/kernel/shape_kernel.h delete mode 100644 mobile/src/operators/kernel/slice_kernel.h delete mode 100644 mobile/src/operators/kernel/softmax_kernel.h delete mode 100644 mobile/src/operators/kernel/split_kernel.h delete mode 100644 mobile/src/operators/kernel/sum_kernel.h delete mode 100644 mobile/src/operators/kernel/tanh_kernel.h delete mode 100644 mobile/src/operators/kernel/tensor_array_read_write_kernel.h delete mode 100644 mobile/src/operators/kernel/transpose2_kernel.h delete mode 100644 mobile/src/operators/kernel/transpose_kernel.h delete mode 100644 mobile/src/operators/kernel/while_kernel.h delete mode 100644 mobile/src/operators/lod_reset_op.cpp delete mode 100644 mobile/src/operators/lod_reset_op.h delete mode 100644 mobile/src/operators/logical_op.cpp delete mode 100644 mobile/src/operators/logical_op.h delete mode 100644 mobile/src/operators/lookup_op.cpp delete mode 100644 mobile/src/operators/lookup_op.h delete mode 100644 mobile/src/operators/lrn_op.cpp delete mode 100644 mobile/src/operators/lrn_op.h delete mode 100644 mobile/src/operators/math/activation.h delete mode 100644 mobile/src/operators/math/depthwise/faster_depthwise_conv3x3.h delete mode 100644 mobile/src/operators/math/depthwise/faster_depthwise_conv3x3p1.cpp delete mode 100644 mobile/src/operators/math/depthwise_conv3x3.cpp delete mode 100644 mobile/src/operators/math/depthwise_conv3x3.h delete mode 100644 mobile/src/operators/math/depthwise_conv3x3_int8.cpp delete mode 100644 mobile/src/operators/math/depthwise_conv5x5.cpp delete mode 100644 mobile/src/operators/math/depthwise_conv5x5.h delete mode 100644 mobile/src/operators/math/depthwise_conv5x5_int8.cpp delete mode 100644 mobile/src/operators/math/element_wise.h delete mode 100644 mobile/src/operators/math/elementwise_op_function.h delete mode 100644 mobile/src/operators/math/gemm.cpp delete mode 100644 mobile/src/operators/math/gemm.h delete mode 100644 mobile/src/operators/math/gemm/cblas.cc delete mode 100644 mobile/src/operators/math/gemm/cblas.h delete mode 100644 mobile/src/operators/math/gemm/executor.h delete mode 100644 mobile/src/operators/math/gemm/gemm1x1s1.cpp delete mode 100644 mobile/src/operators/math/gemm/gemm1x1s1.h delete mode 100644 mobile/src/operators/math/gemm/gemm_kernel.h delete mode 100644 mobile/src/operators/math/gemm/pack_kernel.h delete mode 100644 
mobile/src/operators/math/gemm/strategy.h delete mode 100644 mobile/src/operators/math/gemm_int8.cpp delete mode 100644 mobile/src/operators/math/gemm_omp_int8.cpp delete mode 100644 mobile/src/operators/math/gpc.cpp delete mode 100644 mobile/src/operators/math/gpc.h delete mode 100644 mobile/src/operators/math/gru_compute.cpp delete mode 100644 mobile/src/operators/math/gru_compute.h delete mode 100644 mobile/src/operators/math/gru_cpu_kernel.h delete mode 100644 mobile/src/operators/math/im2col.cpp delete mode 100644 mobile/src/operators/math/im2col.h delete mode 100644 mobile/src/operators/math/math.h delete mode 100644 mobile/src/operators/math/math_function.cpp delete mode 100644 mobile/src/operators/math/math_function.h delete mode 100644 mobile/src/operators/math/math_function_int8.cpp delete mode 100644 mobile/src/operators/math/pad.cpp delete mode 100644 mobile/src/operators/math/pad.h delete mode 100644 mobile/src/operators/math/poly_util.cpp delete mode 100644 mobile/src/operators/math/poly_util.h delete mode 100644 mobile/src/operators/math/pooling.cpp delete mode 100644 mobile/src/operators/math/pooling.h delete mode 100644 mobile/src/operators/math/pooling2x2.cpp delete mode 100644 mobile/src/operators/math/pooling3x3.cpp delete mode 100644 mobile/src/operators/math/quantize.h delete mode 100644 mobile/src/operators/math/selected_rows_functor.h delete mode 100644 mobile/src/operators/math/sequence2batch.cpp delete mode 100644 mobile/src/operators/math/sequence2batch.h delete mode 100644 mobile/src/operators/math/slidingwindow_conv3x3.cpp delete mode 100644 mobile/src/operators/math/slidingwindow_conv3x3.h delete mode 100644 mobile/src/operators/math/slidingwindow_utils.cpp delete mode 100644 mobile/src/operators/math/slidingwindow_utils.h delete mode 100644 mobile/src/operators/math/softmax.cpp delete mode 100644 mobile/src/operators/math/softmax.h delete mode 100644 mobile/src/operators/math/transform.h delete mode 100644 mobile/src/operators/math/vol2col.cpp delete mode 100644 mobile/src/operators/math/vol2col.h delete mode 100644 mobile/src/operators/math/winograd/winograd_transform.h delete mode 100644 mobile/src/operators/math/winograd/winograd_transform_f6k3.cpp delete mode 100644 mobile/src/operators/mul_op.cpp delete mode 100644 mobile/src/operators/mul_op.h delete mode 100644 mobile/src/operators/multiclass_nms_op.cpp delete mode 100644 mobile/src/operators/multiclass_nms_op.h delete mode 100644 mobile/src/operators/nearest_interp_op.cpp delete mode 100644 mobile/src/operators/nearest_interp_op.h delete mode 100644 mobile/src/operators/norm_op.cpp delete mode 100644 mobile/src/operators/norm_op.h delete mode 100644 mobile/src/operators/one_hot_op.cpp delete mode 100644 mobile/src/operators/one_hot_op.h delete mode 100644 mobile/src/operators/op_param.cpp delete mode 100644 mobile/src/operators/op_param.h delete mode 100755 mobile/src/operators/pad2d_op.cpp delete mode 100644 mobile/src/operators/pad2d_op.h delete mode 100644 mobile/src/operators/polygon_box_transform_op.cpp delete mode 100644 mobile/src/operators/polygon_box_transform_op.h delete mode 100644 mobile/src/operators/pool_op.cpp delete mode 100644 mobile/src/operators/pool_op.h delete mode 100644 mobile/src/operators/prelu_op.cpp delete mode 100644 mobile/src/operators/prelu_op.h delete mode 100644 mobile/src/operators/prior_box_op.cpp delete mode 100644 mobile/src/operators/prior_box_op.h delete mode 100644 mobile/src/operators/quantize_op.cpp delete mode 100644 mobile/src/operators/quantize_op.h delete 
mode 100644 mobile/src/operators/range_op.cpp delete mode 100644 mobile/src/operators/range_op.h delete mode 100644 mobile/src/operators/reduce_prod_op.cpp delete mode 100644 mobile/src/operators/reduce_prod_op.h delete mode 100644 mobile/src/operators/reshape2_op.cpp delete mode 100644 mobile/src/operators/reshape2_op.h delete mode 100644 mobile/src/operators/reshape_op.cpp delete mode 100644 mobile/src/operators/reshape_op.h delete mode 100644 mobile/src/operators/resize_op.cpp delete mode 100644 mobile/src/operators/resize_op.h delete mode 100644 mobile/src/operators/scale_op.cpp delete mode 100644 mobile/src/operators/scale_op.h delete mode 100644 mobile/src/operators/sequence_ops/sequence_expand_op.cpp delete mode 100644 mobile/src/operators/sequence_ops/sequence_expand_op.h delete mode 100644 mobile/src/operators/sequence_ops/sequence_pool_op.cpp delete mode 100644 mobile/src/operators/sequence_ops/sequence_pool_op.h delete mode 100644 mobile/src/operators/sequence_ops/sequence_softmax_op.cpp delete mode 100644 mobile/src/operators/sequence_ops/sequence_softmax_op.h delete mode 100644 mobile/src/operators/shape_op.cpp delete mode 100644 mobile/src/operators/shape_op.h delete mode 100644 mobile/src/operators/slice_op.cpp delete mode 100644 mobile/src/operators/slice_op.h delete mode 100644 mobile/src/operators/softmax_op.cpp delete mode 100644 mobile/src/operators/softmax_op.h delete mode 100644 mobile/src/operators/split_op.cpp delete mode 100644 mobile/src/operators/split_op.h delete mode 100644 mobile/src/operators/sum_op.cpp delete mode 100644 mobile/src/operators/sum_op.h delete mode 100644 mobile/src/operators/top_k_op.cpp delete mode 100644 mobile/src/operators/top_k_op.h delete mode 100644 mobile/src/operators/transpose2_op.cpp delete mode 100644 mobile/src/operators/transpose2_op.h delete mode 100644 mobile/src/operators/transpose_op.cpp delete mode 100644 mobile/src/operators/transpose_op.h delete mode 100644 mobile/src/pass/memory_optimize.cpp delete mode 100644 mobile/src/pass/memory_optimize.h delete mode 100644 mobile/src/pass/memory_optimize_super.cpp delete mode 100644 mobile/src/pass/memory_optimize_super.h delete mode 100644 mobile/src/pass/model_obfuscate.cpp delete mode 100644 mobile/src/pass/model_obfuscate.h delete mode 100644 mobile/src/pass/pass_base.h delete mode 100644 mobile/src/protobuf-c/protobuf-c.cpp delete mode 100644 mobile/src/protobuf-c/protobuf-c.h delete mode 100644 mobile/test/CMakeLists.txt delete mode 100644 mobile/test/common/test_enforce.cpp delete mode 100644 mobile/test/common/test_gemm_accuracy.cpp delete mode 100644 mobile/test/common/test_gemm_int8_accuracy.cpp delete mode 100644 mobile/test/common/test_gemm_perf.cpp delete mode 100644 mobile/test/common/test_lib_size.cpp delete mode 100644 mobile/test/common/test_lib_size.h delete mode 100644 mobile/test/common/test_log.cpp delete mode 100644 mobile/test/common/test_openmp.cpp delete mode 100644 mobile/test/executor_for_test.h delete mode 100644 mobile/test/fpga/test_concat_op.cpp delete mode 100644 mobile/test/fpga/test_densebox_combine.cpp delete mode 100644 mobile/test/fpga/test_format_data.cpp delete mode 100644 mobile/test/fpga/test_marker.cpp delete mode 100644 mobile/test/fpga/test_marker2.cpp delete mode 100644 mobile/test/fpga/test_marker_api.cpp delete mode 100644 mobile/test/fpga/test_mobilenet_api.cpp delete mode 100644 mobile/test/fpga/test_pe.cpp delete mode 100644 mobile/test/fpga/test_resnet50.cpp delete mode 100644 mobile/test/fpga/test_rfcn.cpp delete mode 100644 
mobile/test/fpga/test_rfcn_api.cpp delete mode 100644 mobile/test/fpga/test_ssd.cpp delete mode 100644 mobile/test/fpga/test_tensor_quant.cpp delete mode 100644 mobile/test/fpga/test_yolo_api.cpp delete mode 100644 mobile/test/framework/test_inference_api.cpp delete mode 100644 mobile/test/framework/test_load.cpp delete mode 100644 mobile/test/framework/test_load_memory.cpp delete mode 100644 mobile/test/framework/test_load_memory_inference_api.cpp delete mode 100644 mobile/test/framework/test_optimize.cpp delete mode 100644 mobile/test/net/test_alexnet.cpp delete mode 100644 mobile/test/net/test_benchmark.cpp delete mode 100644 mobile/test/net/test_eng.cpp delete mode 100644 mobile/test/net/test_genet_combine.cpp delete mode 100644 mobile/test/net/test_gesture.cpp delete mode 100644 mobile/test/net/test_googlenet.cpp delete mode 100644 mobile/test/net/test_googlenet_quali.cpp delete mode 100644 mobile/test/net/test_googlenetv1_combine.cpp delete mode 100644 mobile/test/net/test_inceptionv4.cpp delete mode 100644 mobile/test/net/test_mobilenet+ssd.cpp delete mode 100644 mobile/test/net/test_mobilenet.cpp delete mode 100644 mobile/test/net/test_mobilenet_025_fssd.cpp delete mode 100644 mobile/test/net/test_mobilenet_GPU.cpp delete mode 100644 mobile/test/net/test_mobilenet_combine.cpp delete mode 100644 mobile/test/net/test_multi_inference_predict.cpp delete mode 100644 mobile/test/net/test_net.cpp delete mode 100644 mobile/test/net/test_net_benchmark.cpp delete mode 100644 mobile/test/net/test_nlp.cpp delete mode 100644 mobile/test/net/test_ocr.cpp delete mode 100644 mobile/test/net/test_op_in_net.cpp delete mode 100644 mobile/test/net/test_resnet.cpp delete mode 100644 mobile/test/net/test_squeezenet.cpp delete mode 100644 mobile/test/net/test_super.cpp delete mode 100644 mobile/test/net/test_vgg16ssd.cpp delete mode 100644 mobile/test/net/test_wrap.cpp delete mode 100644 mobile/test/net/test_yolo.cpp delete mode 100644 mobile/test/net/test_yolo_combined.cpp delete mode 100644 mobile/test/net/test_yologpu.cpp delete mode 100644 mobile/test/operators/test_batchnorm_op.cpp delete mode 100644 mobile/test/operators/test_box_coder_op.cpp delete mode 100644 mobile/test/operators/test_cast_op.cpp delete mode 100644 mobile/test/operators/test_concat_op.cpp delete mode 100644 mobile/test/operators/test_conv_add_relu_op.cpp delete mode 100644 mobile/test/operators/test_conv_bn_relu_op.cpp delete mode 100644 mobile/test/operators/test_conv_gpu.cpp delete mode 100644 mobile/test/operators/test_conv_op.cpp delete mode 100644 mobile/test/operators/test_depthwise_conv_op.cpp delete mode 100644 mobile/test/operators/test_dequantize_op.cpp delete mode 100644 mobile/test/operators/test_dwconv_bn_relu_op.cpp delete mode 100644 mobile/test/operators/test_elementwise_add_op.cpp delete mode 100644 mobile/test/operators/test_elementwise_sub_op.cpp delete mode 100644 mobile/test/operators/test_fill_constant_op.cpp delete mode 100644 mobile/test/operators/test_fusion_conv_add_bn_relu_op.cpp delete mode 100644 mobile/test/operators/test_fusion_fc_op.cpp delete mode 100644 mobile/test/operators/test_gru_op.cpp delete mode 100644 mobile/test/operators/test_im2sequence_op.cpp delete mode 100644 mobile/test/operators/test_increment_op.cpp delete mode 100644 mobile/test/operators/test_is_empty_op.cpp delete mode 100644 mobile/test/operators/test_leaky_relu_op.cpp delete mode 100644 mobile/test/operators/test_less_than_op.cpp delete mode 100644 mobile/test/operators/test_log_op.cpp delete mode 100644 
mobile/test/operators/test_logical_and_op.cpp delete mode 100644 mobile/test/operators/test_logical_not_op.cpp delete mode 100644 mobile/test/operators/test_logical_or_op.cpp delete mode 100644 mobile/test/operators/test_logical_xor_op.cpp delete mode 100644 mobile/test/operators/test_lrn_op.cpp delete mode 100644 mobile/test/operators/test_mul_op.cpp delete mode 100644 mobile/test/operators/test_multiclass_nms_op.cpp delete mode 100644 mobile/test/operators/test_polygon_box_transform_op.cpp delete mode 100644 mobile/test/operators/test_pool_op.cpp delete mode 100644 mobile/test/operators/test_prelu_op.cpp delete mode 100644 mobile/test/operators/test_prior_box_op.cpp delete mode 100644 mobile/test/operators/test_quantize_op.cpp delete mode 100644 mobile/test/operators/test_relu6_op.cpp delete mode 100644 mobile/test/operators/test_relu_op.cpp delete mode 100644 mobile/test/operators/test_reshape2_op.cpp delete mode 100644 mobile/test/operators/test_reshape_op.cpp delete mode 100644 mobile/test/operators/test_resize_op.cpp delete mode 100644 mobile/test/operators/test_scale_op.cpp delete mode 100644 mobile/test/operators/test_sequence_expand_op.cpp delete mode 100644 mobile/test/operators/test_sequence_pool_op.cpp delete mode 100644 mobile/test/operators/test_sequence_softmax_op.cpp delete mode 100644 mobile/test/operators/test_sigmoid_op.cpp delete mode 100644 mobile/test/operators/test_slice_op.cpp delete mode 100644 mobile/test/operators/test_softmax_op.cpp delete mode 100644 mobile/test/operators/test_sum_op.cpp delete mode 100644 mobile/test/operators/test_tanh_op.cpp delete mode 100644 mobile/test/operators/test_topk_op.cpp delete mode 100644 mobile/test/operators/test_transpose2_op.cpp delete mode 100644 mobile/test/operators/test_transpose_op.cpp delete mode 100644 mobile/test/test_helper.h delete mode 100644 mobile/test/test_include.h delete mode 100644 mobile/third_party/opencl/OpenCL-Headers/CL/cl.h delete mode 100644 mobile/third_party/opencl/OpenCL-Headers/CL/cl_d3d10.h delete mode 100644 mobile/third_party/opencl/OpenCL-Headers/CL/cl_d3d11.h delete mode 100644 mobile/third_party/opencl/OpenCL-Headers/CL/cl_dx9_media_sharing.h delete mode 100644 mobile/third_party/opencl/OpenCL-Headers/CL/cl_dx9_media_sharing_intel.h delete mode 100644 mobile/third_party/opencl/OpenCL-Headers/CL/cl_egl.h delete mode 100644 mobile/third_party/opencl/OpenCL-Headers/CL/cl_ext.h delete mode 100644 mobile/third_party/opencl/OpenCL-Headers/CL/cl_ext_intel.h delete mode 100644 mobile/third_party/opencl/OpenCL-Headers/CL/cl_gl.h delete mode 100644 mobile/third_party/opencl/OpenCL-Headers/CL/cl_gl_ext.h delete mode 100644 mobile/third_party/opencl/OpenCL-Headers/CL/cl_platform.h delete mode 100644 mobile/third_party/opencl/OpenCL-Headers/CL/cl_va_api_media_sharing_intel.h delete mode 100644 mobile/third_party/opencl/OpenCL-Headers/CL/cl_version.h delete mode 100644 mobile/third_party/opencl/OpenCL-Headers/CL/opencl.h delete mode 100644 mobile/third_party/opencl/OpenCL-Headers/LICENSE delete mode 100644 mobile/third_party/opencl/OpenCL-Headers/README.md delete mode 100644 mobile/tools/android-cmake/android.toolchain.cmake delete mode 100644 mobile/tools/android-debug-script/push2android.sh delete mode 100644 mobile/tools/android-debug-script/run_on_android.sh delete mode 100644 mobile/tools/arm-platform.cmake delete mode 100755 mobile/tools/build.sh delete mode 100755 mobile/tools/ci_build.sh delete mode 100644 mobile/tools/ci_run_test.sh delete mode 100644 mobile/tools/docker_build_fpga.sh delete mode 
100644 mobile/tools/ios-cmake/ios.toolchain.cmake delete mode 100644 mobile/tools/net-detail.awk delete mode 100644 mobile/tools/net.awk delete mode 100755 mobile/tools/op.cmake delete mode 100644 mobile/tools/pre-commit.hooks/clang-format.hook delete mode 100755 mobile/tools/pre-commit.hooks/clang-tidy.hook delete mode 100644 mobile/tools/pre-commit.hooks/copyright.hook delete mode 100644 mobile/tools/pre-commit.hooks/cpplint.hook delete mode 100755 mobile/tools/prepare_images_and_models.sh delete mode 100644 mobile/tools/profile_show.sh delete mode 100644 mobile/tools/python/caffetools/run.py delete mode 100644 mobile/tools/python/fluidtools/.gitignore delete mode 100644 mobile/tools/python/fluidtools/run.py delete mode 100644 mobile/tools/python/fluidtools/test_wrap.py delete mode 100644 mobile/tools/python/imagetools/README.md delete mode 100644 mobile/tools/python/imagetools/imagetools.py delete mode 100644 mobile/tools/python/imagetools/img2nchw.py delete mode 100644 mobile/tools/python/imagetools/img2nhwc.py delete mode 100644 mobile/tools/python/imagetools/numpy2binary.py delete mode 100644 mobile/tools/python/misc/.gitignore delete mode 100644 mobile/tools/python/misc/fluidtools.py delete mode 100644 mobile/tools/python/misc/ios-test-server.py delete mode 100644 mobile/tools/python/misc/restore-git.py delete mode 100644 mobile/tools/python/misc/test-fluid-op-feature.py delete mode 100644 mobile/tools/python/modeltools/.gitignore delete mode 100644 mobile/tools/python/modeltools/core/__init__.py delete mode 100644 mobile/tools/python/modeltools/core/framework.proto delete mode 100644 mobile/tools/python/modeltools/core/framework_pb2.py delete mode 100644 mobile/tools/python/modeltools/core/op_types.py delete mode 100644 mobile/tools/python/modeltools/mobilenet/__init__.py delete mode 100644 mobile/tools/python/modeltools/mobilenet/converter_mobilenet.py delete mode 100644 mobile/tools/python/modeltools/mobilenet/swicher.py delete mode 100644 mobile/tools/python/modeltools/tools/__init__.py delete mode 100644 mobile/tools/python/modeltools/tools/float2halffloat.py delete mode 100644 mobile/tools/python/modeltools/tools/loader.py delete mode 100644 mobile/tools/python/modeltools/tools/model_combine.py delete mode 100644 mobile/tools/python/modeltools/tools/model_reader.py delete mode 100644 mobile/tools/python/modeltools/yolo/__init__.py delete mode 100644 mobile/tools/python/modeltools/yolo/mdl2fluid.py delete mode 100644 mobile/tools/python/modeltools/yolo/swicher.py delete mode 100644 mobile/tools/quantification/CMakeLists.txt delete mode 100644 mobile/tools/quantification/README.md delete mode 100644 mobile/tools/quantification/convert.cpp delete mode 100644 mobile/tools/quantification/src/block_desc_local.cpp delete mode 100644 mobile/tools/quantification/src/block_desc_local.h delete mode 100644 mobile/tools/quantification/src/enforce.h delete mode 100644 mobile/tools/quantification/src/framework.pb-c.c delete mode 100644 mobile/tools/quantification/src/framework.pb-c.h delete mode 100644 mobile/tools/quantification/src/program_desc.cpp delete mode 100644 mobile/tools/quantification/src/program_desc.h delete mode 100644 mobile/tools/quantification/src/protobuf-c.c delete mode 100644 mobile/tools/quantification/src/protobuf-c.h delete mode 100644 mobile/tools/quantification/src/tensor_desc.h delete mode 100644 mobile/tools/quantification/src/var_desc.h delete mode 100755 mobile/tools/shell/change_mobile_namespace.sh delete mode 100644 mobile/tools/shell/check-bitcode.sh delete 
mode 100644 mobile/tools/shell/check-filename.sh delete mode 100644 mobile/tools/shell/generate-include/.gitignore delete mode 100644 mobile/tools/shell/generate-include/check_include_diff.sh delete mode 100644 mobile/tools/shell/generate-include/main.cpp delete mode 100644 mobile/tools/shell/generate-include/parse.py delete mode 100755 mobile/tools/shell/generate-include/run.sh delete mode 100644 mobile/tools/shell/merge.sh delete mode 100644 mobile/tools/shell/prune_static_library.sh delete mode 100644 mobile/tools/shell/restore-private-repo.sh delete mode 100644 mobile/tools/toolchains/arm-android-neon.cmake delete mode 100644 mobile/tools/toolchains/arm-linux-gnueabi.cmake delete mode 100644 mobile/tools/toolchains/arm-linux-gnueabihf.cmake create mode 100644 model_optimize_tool.md create mode 100644 model_quantization.md create mode 100644 npu.md create mode 100644 opencl.md create mode 100644 paddle-mobile.md create mode 100644 roadmap.md create mode 100644 source_compile.md create mode 100644 source_compile.md.toc.2019-08-29_160045 create mode 100644 support_operation_list.md create mode 100644 tech_highlights.md delete mode 160000 third-party/gflags delete mode 160000 third-party/googletest delete mode 160000 third-party/protobuf-host delete mode 160000 third-party/protobuf-mobile delete mode 100644 tools/codestyle/.gitignore delete mode 100755 tools/codestyle/clang_format.hook delete mode 100644 tools/codestyle/copyright.hook delete mode 100755 tools/codestyle/cpplint_pre_commit.hook delete mode 100644 tools/codestyle/docstring_checker.py delete mode 100755 tools/codestyle/pylint_pre_commit.hook delete mode 100644 tools/codestyle/test_docstring_checker.py delete mode 100755 tools/document_preview.sh create mode 100644 tutorial.md delete mode 100644 web/.editorconfig delete mode 100644 web/.gitignore delete mode 100644 web/.npmrc delete mode 100644 web/README.md delete mode 100644 web/README_cn.md delete mode 100644 web/demo/index.es6 delete mode 100644 web/demo/index.html delete mode 100644 web/demo/videoDemo.es6 delete mode 100644 web/demo/videoDemo.html delete mode 100644 web/package.json delete mode 100644 web/scripts/build.sh delete mode 100644 web/src/banana.jpeg delete mode 100644 web/src/executor/camera.es6 delete mode 100644 web/src/executor/executor.es6 delete mode 100644 web/src/executor/loader.es6 delete mode 100644 web/src/executor/postProcess.es6 delete mode 100644 web/src/executor/runner.es6 delete mode 100644 web/src/factory/fshader/factory.es6 delete mode 100644 web/src/factory/fshader/ops.es6 delete mode 100644 web/src/feed/ImageFeed.es6 delete mode 100644 web/src/feed/dataFeed.es6 delete mode 100644 web/src/feed/io.es6 delete mode 100644 web/src/gpu/gpu.es6 delete mode 100644 web/src/index.es6 delete mode 100644 web/src/index.html delete mode 100644 web/src/runtime/runtime.es6 delete mode 100644 web/src/shader/atom/common_func.es6 delete mode 100644 web/src/shader/atom/common_params.es6 delete mode 100644 web/src/shader/atom/getArrayIndexFromTensorPos.es6 delete mode 100644 web/src/shader/atom/getArrayIndexFromTexturePos.es6 delete mode 100644 web/src/shader/atom/getOutputTensorPos.es6 delete mode 100644 web/src/shader/atom/getPixelsFromTexturePos.es6 delete mode 100644 web/src/shader/atom/getRangePowSumFromArrayIndex.es6 delete mode 100644 web/src/shader/atom/getRangeSumFromArrayIndex.es6 delete mode 100644 web/src/shader/atom/getTensorPosFromArrayIndex.es6 delete mode 100644 web/src/shader/atom/getTexturePosFromArrayIndex.es6 delete mode 100644 
web/src/shader/atom/getValueFromTensorPos.es6 delete mode 100644 web/src/shader/atom/getValueFromTensorPosPacked.es6 delete mode 100644 web/src/shader/atom/getValueFromTexturePos.es6 delete mode 100644 web/src/shader/atom/moveTexture2PosToReal.es6 delete mode 100644 web/src/shader/atom/prefix.es6 delete mode 100644 web/src/shader/atom/prefix2.es6 delete mode 100644 web/src/shader/atom/prelu.es6 delete mode 100644 web/src/shader/atom/scale.es6 delete mode 100644 web/src/shader/atom/sigmoid.es6 delete mode 100644 web/src/shader/atom/softmax.es6 delete mode 100644 web/src/shader/atom/suffix.es6 delete mode 100644 web/src/shader/atom/type_ivec56.es6 delete mode 100644 web/src/shader/batchnorm/conf.es6 delete mode 100644 web/src/shader/batchnorm/main.es6 delete mode 100644 web/src/shader/batchnorm/params.es6 delete mode 100644 web/src/shader/conv2d/conf.es6 delete mode 100644 web/src/shader/conv2d/main.es6 delete mode 100644 web/src/shader/conv2d/params.es6 delete mode 100644 web/src/shader/conv2d_depthwise/conf.es6 delete mode 100644 web/src/shader/conv2d_depthwise/main.es6 delete mode 100644 web/src/shader/conv2d_depthwise/params.es6 delete mode 100644 web/src/shader/conv2d_elementwise_add/conf.es6 delete mode 100644 web/src/shader/conv2d_elementwise_add/main.es6 delete mode 100644 web/src/shader/conv2d_elementwise_add/params.es6 delete mode 100644 web/src/shader/conv2d_elementwise_add_winograd/conf.es6 delete mode 100644 web/src/shader/conv2d_elementwise_add_winograd/main.es6 delete mode 100644 web/src/shader/conv2d_elementwise_add_winograd/params.es6 delete mode 100644 web/src/shader/dynamic/conf.es6 delete mode 100644 web/src/shader/dynamic/main.es6 delete mode 100644 web/src/shader/dynamic/params.es6 delete mode 100644 web/src/shader/elementwise_add/conf.es6 delete mode 100644 web/src/shader/elementwise_add/main.es6 delete mode 100644 web/src/shader/elementwise_add/params.es6 delete mode 100644 web/src/shader/mul/conf.es6 delete mode 100644 web/src/shader/mul/main.es6 delete mode 100644 web/src/shader/mul/params.es6 delete mode 100644 web/src/shader/pool2d/conf.es6 delete mode 100644 web/src/shader/pool2d/main.es6 delete mode 100644 web/src/shader/pool2d/params.es6 delete mode 100644 web/src/shader/pool2d_avg/conf.es6 delete mode 100644 web/src/shader/pool2d_avg/main.es6 delete mode 100644 web/src/shader/pool2d_avg/params.es6 delete mode 100644 web/src/shader/pool2d_max/conf.es6 delete mode 100644 web/src/shader/pool2d_max/main.es6 delete mode 100644 web/src/shader/pool2d_max/params.es6 delete mode 100644 web/src/shader/pool2d_winograd/conf.es6 delete mode 100644 web/src/shader/pool2d_winograd/main.es6 delete mode 100644 web/src/shader/pool2d_winograd/params.es6 delete mode 100644 web/src/shader/softmax/conf.es6 delete mode 100644 web/src/shader/softmax/main.es6 delete mode 100644 web/src/shader/softmax/params.es6 delete mode 100644 web/src/shader/v_shader.es6 delete mode 100644 web/src/shader/v_shader2.es6 delete mode 100644 web/src/test/getMaxUniforms.es6 delete mode 100644 web/src/utils/models.es6 delete mode 100644 web/src/utils/opData.es6 delete mode 100644 web/src/utils/tensor.es6 delete mode 100644 web/src/utils/utils.es6 delete mode 100644 web/tools/logger.es6 delete mode 100644 web/tools/toBinaryFile.py delete mode 100644 web/webpack.config.js create mode 100644 x2paddle.md create mode 100644 "\345\246\202\344\275\225\345\234\250Android\346\211\213\346\234\272\344\270\212\350\277\220\350\241\214\345\215\225\346\265\213.md" diff --git a/.gitmodules b/.gitmodules index 
107036c702..e69de29bb2 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,12 +0,0 @@
-[submodule "third-party/gflags"]
-	path = third-party/gflags
-	url = https://github.com/gflags/gflags.git
-[submodule "third-party/googletest"]
-	path = third-party/googletest
-	url = https://github.com/google/googletest.git
-[submodule "third-party/protobuf-mobile"]
-	path = third-party/protobuf-mobile
-	url = https://github.com/tensor-tang/protobuf.git
-[submodule "third-party/protobuf-host"]
-	path = third-party/protobuf-host
-	url = https://github.com/protocolbuffers/protobuf.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
deleted file mode 100644
index 3643379acb..0000000000
--- a/CMakeLists.txt
+++ /dev/null
@@ -1,183 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License
-
-cmake_minimum_required(VERSION 3.0)
-set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
-include(lite_utils)
-
-lite_option(WITH_PADDLE_MOBILE "Use the paddle-mobile legacy build" OFF)
-if (WITH_PADDLE_MOBILE)
-  add_subdirectory(mobile)
-  return()
-endif(WITH_PADDLE_MOBILE)
-
-set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
-set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
-set(CMAKE_CXX_STANDARD 11)
-
-include(system)
-include(cross_compiling/preproject)
-
-project(paddle CXX C)
-message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: "
-        "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}")
-message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
-        "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
-message(STATUS "AR tools: ${CMAKE_AR}")
-
-if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
-  find_package(CUDA QUIET)
-endif()
-find_package(Git REQUIRED)
-find_package(Threads REQUIRED)
-
-include(simd)
-
-################################ Exposed Configurations #######################################
-lite_option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON)
-lite_option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ON IF ${AVX_FOUND})
-lite_option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON)
-lite_option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF)
-lite_option(WITH_MKL "Compile PaddlePaddle with MKL support." ON IF ${AVX_FOUND})
-lite_option(WITH_ARM_DOTPROD "Compile PaddlePaddle with the ARM dot product instruction" ON)
-lite_option(WITH_SYSTEM_BLAS "Use system blas library" OFF)
-# TODO(Superjomn) Remove the WITH_ANAKIN option if it is not needed later.
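# Aside (not from the original CMakeLists.txt): lite_option() comes from the
# cmake/ module directory pulled in by include(lite_utils) above, which this
# patch does not show. Judging from calls such as
# lite_option(WITH_AVX "..." ON IF ${AVX_FOUND}), it behaves like CMake's
# built-in option() plus an optional "IF <condition>" guard that downgrades
# the default to OFF when the condition is false. A minimal, hypothetical
# sketch of such a helper (demo_lite_option is an illustrative name):
macro(demo_lite_option variable description default)
  set(__value ${default})
  if(NOT "${ARGN}" STREQUAL "")   # trailing args, if any: IF <evaluated cond>
    set(__cond ${ARGN})
    list(REMOVE_AT __cond 0)      # drop the leading "IF" keyword
    if(NOT __cond)                # condition false or empty: default to OFF
      set(__value OFF)
    endif()
  endif()
  option(${variable} "${description}" ${__value})
endmacro()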
-if(ANDROID OR IOS OR ARMLINUX)
-  set(WITH_GPU OFF CACHE STRING
-      "Disable GPU when cross-compiling for Android and iOS" FORCE)
-  set(WITH_DSO OFF CACHE STRING
-      "Disable DSO when cross-compiling for Android and iOS" FORCE)
-  set(WITH_AVX OFF CACHE STRING
-      "Disable AVX when cross-compiling for Android and iOS" FORCE)
-  set(WITH_PYTHON OFF CACHE STRING
-      "Disable PYTHON when cross-compiling for Android and iOS" FORCE)
-  set(WITH_RDMA OFF CACHE STRING
-      "Disable RDMA when cross-compiling for Android and iOS" FORCE)
-  set(WITH_MKL OFF CACHE STRING
-      "Disable MKL when cross-compiling for Android and iOS" FORCE)
-endif()
-
-# for lite, both server and mobile framework.
-lite_option(LITE_WITH_JAVA "Enable Java JNI lib in lite mode" OFF)
-lite_option(LITE_WITH_CUDA "Enable CUDA in lite mode" OFF)
-lite_option(LITE_WITH_X86 "Enable X86 in lite mode" ON)
-lite_option(LITE_WITH_ARM "Enable ARM in lite mode" OFF)
-lite_option(LITE_WITH_NPU "Enable NPU in lite mode" OFF)
-lite_option(LITE_WITH_OPENMP "Enable OpenMP in lite framework" ON)
-lite_option(LITE_WITH_OPENCL "Enable OpenCL support in lite" OFF)
-lite_option(LITE_WITH_FPGA "Enable FPGA support in lite" OFF)
-lite_option(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK "Enable light-weight framework" OFF)
-lite_option(LITE_WITH_PROFILE "Enable profile mode in lite framework" OFF)
-lite_option(LITE_WITH_PRECISION_PROFILE "Enable precision profile in profile mode ON in lite" OFF IF LITE_WITH_PROFILE)
-lite_option(LITE_SHUTDOWN_LOG "Shutdown log system or not." OFF)
-lite_option(LITE_ON_TINY_PUBLISH "Publish tiny predictor lib." OFF)
-lite_option(LITE_ON_MODEL_OPTIMIZE_TOOL "Build the model optimize tool" OFF)
-# publish options
-lite_option(LITE_BUILD_EXTRA "Enable extra algorithm support in Lite, both kernels and operators" OFF)
-
-set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
-    "A path setting third party libraries download & build directories.")
-
-# CMAKE_BUILD_TYPE
-if(NOT CMAKE_BUILD_TYPE)
-  set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING
-      "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
-      FORCE)
-endif()
-
-# check options
-if (LITE_ON_TINY_PUBLISH)
-  if (NOT (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND NOT WITH_TESTING)) #LITE_WITH_JAVA AND
-    message(FATAL_ERROR "LITE_ON_TINY_PUBLISH=ON must be used with WITH_LITE=ON LITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON LITE_WITH_JAVA=ON WITH_TESTING=OFF")
-    return()
-  endif()
-endif()
-
-include_directories("${PADDLE_SOURCE_DIR}")
-# the generated header files.
-set(LITE_GENERATED_INCLUDE_DIR "${CMAKE_BINARY_DIR}")
-include_directories("${LITE_GENERATED_INCLUDE_DIR}")
-
-# for mobile
-if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
-  message(STATUS "Building the mobile framework")
-  include(cross_compiling/postproject)
-  include(cross_compiling/npu)  # check and prepare NPU DDK
-
-  # We compile the mobile deployment library when LITE_ON_TINY_PUBLISH=ON
-  # So the following third party dependencies are not needed.
- if (NOT LITE_ON_TINY_PUBLISH) - # include the necessary thirdparty dependencies - include(external/gflags) # download, build, install gflags - # LITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON will disable glog - # TODO(sangoly): refine WITH_LITE and LITE_WITH_LIGHT_WEIGHT_FRAMEWORK - include(external/gtest) # download, build, install gtest - include(ccache) # set ccache for compilation - include(external/protobuf) # download, build, install protobuf - endif() - - # for opencl - if (LITE_WITH_OPENCL) - include(external/opencl-headers) - include(external/opencl-clhpp) - endif() - - include(generic) # simplify cmake module - include(configure) # add paddle env configuration - - add_subdirectory(lite) - return() - endif() -################################# End of mobile compile ############################## - -set(WITH_MKLML ${WITH_MKL}) -if (NOT DEFINED WITH_MKLDNN) - if (WITH_MKL AND AVX2_FOUND) - set(WITH_MKLDNN ON) - else() - message(STATUS "Do not have AVX2 intrinsics and disabled MKL-DNN") - set(WITH_MKLDNN OFF) - endif() -endif() - -######################################################################################## - -include(external/mklml) # download mklml package -include(external/xbyak) # download xbyak package -include(external/libxsmm) # download, build, install libxsmm -include(external/gflags) # download, build, install gflags -include(external/glog) # download, build, install glog -include(external/gtest) # download, build, install gtest -include(external/protobuf) # download, build, install protobuf -include(external/openblas) # download, build, install openblas -include(external/mkldnn) # download, build, install mkldnn -include(external/eigen) # download eigen3 -include(external/xxhash) # download install xxhash needed for x86 jit - -include(cudnn) -include(configure) # add paddle env configuration - -if(LITE_WITH_CUDA) - include(cuda) -endif() - -include(generic) # simplify cmake module -include(ccache) # set ccache for compilation -include(util) # set unittest and link libs -include(version) # set PADDLE_VERSION - -set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") -set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") - -add_subdirectory(lite)

diff --git a/Home.md b/Home.md
new file mode 100644
index 0000000000..f521dbf171
--- /dev/null
+++ b/Home.md
@@ -0,0 +1,54 @@

# Paddle Lite Documentation

## Overview

Paddle-Lite is the next-generation architecture of PaddleMobile, focused on on-device inference. Its defining traits are **high performance, multi-hardware support, and a lightweight footprint**. It can deploy models from PaddleFluid/TensorFlow/Caffe/ONNX, already supports ARM CPU, Mali GPU, Adreno GPU, and Huawei NPU, and is gradually adding X86 CPU, Nvidia GPU, and other targets, with industry-leading performance on the supported hardware.

## Introduction

- [Key Features](./tech_highlights)
- [Architecture Design](./architecture)
- [Road Map](./roadmap)

## Benchmark

- [Latest Performance](./benchmark)
- [Testing Method](./benchmark_tools)

## Installation

- [Build from Source](./source_compile)

## Usage

- [Workflow](./tutorial)
- [C++ Demo](./cpp_demo)
- [Java Demo](./java_demo)
- [Android/iOS APP demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo)
- [Model Conversion](./model_optimize_tool)

## Advanced

- [Caffe and TensorFlow models via X2Paddle](x2paddle)
- [Model Quantization](./model_quantization)
- [Supported Op List](./support_operation_list)
- [Adding a New Op](./add_new_operation)
- [Testing Tools](./debug_tools)
- [Debugging](./debug_tools)
- [Using the Huawei NPU](./npu)
- [Using Android GPU](./opencl)
- [Using FPGA](./fpga)

## Developer Documentation

- [Developer Basics](./for-developer)
- [Architecture in Depth](./architecture-intro)

## FAQ

- Questions and suggestions are welcome as [GitHub Issues](https://github.com/PaddlePaddle/Paddle-Lite/issues); to speed up resolution, please first search for similar issues. We answer promptly!
- You are also welcome to join the official Baidu Paddle-Lite QQ group: 696965088

## paddle-mobile

- [Building paddle-mobile](./mobile)

diff --git a/README.md b/README.md
deleted file mode 100644
index e32840a21d..0000000000
--- a/README.md
+++ /dev/null
@@ -1,74 +0,0 @@

-[中文版](./README_cn.md) - -# Paddle Lite - - -[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](https://github.com/PaddlePaddle/Paddle-Lite/wiki) -[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE) - - - -Paddle Lite is an updated version of Paddle-Mobile, an open-open source deep learning framework designed to make it easy to perform inference on mobile, embeded, and IoT devices. It is compatible with PaddlePaddle and pre-trained models from other sources. - -For tutorials, please see [PaddleLite Wiki](https://github.com/PaddlePaddle/Paddle-Lite/wiki). - -## Key Features - -### Light Weight - -On mobile devices, execution module can be deployed without third-party libraries, because our excecution module and analysis module are decoupled. - -On ARM V7, only 800KB are taken up, while on ARM V8, 1.3MB are taken up with the 80 operators and 85 kernels in the dynamic libraries provided by Paddle Lite. - -Paddle Lite enables immediate inference without extra optimization. - -### High Performance - -Paddle Lite enables device-optimized kernels, maximizing ARM CPU performance. - -It also supports INT8 quantizations with [PaddleSlim model compression tools](https://github.com/PaddlePaddle/models/tree/v1.5/PaddleSlim), reducing the size of models and increasing the performance of models. - -On Huawei NPU and FPGA, the performance is also boosted. - -The latest benchmark is located at [benchmark](https://github.com/PaddlePaddle/Paddle-Lite/wiki/benchmark) - -### High Compatibility - -Hardware compatibility: Paddle Lite supports a diversity of hardwares — ARM CPU, Mali GPU, Adreno GPU, Huawei NPU and FPGA. In the near future, we will also support AI microchips from Cambricon and Bitmain. - -Model compatibility: The Op of Paddle Lite is fully compatible to that of PaddlePaddle. The accuracy and performance of 18 models (mostly CV models and OCR models) and 85 operators have been validated. In the future, we will also support other models. - -Framework compatibility: In addition to models trained on PaddlePaddle, those trained on Caffe and TensorFlow can also be converted to be used on Paddle Lite, via [X2Paddle](https://github.com/PaddlePaddle/X2Paddle). In the future to come, we will also support models of ONNX format. - -## Architecture - -Paddle Lite is designed to support a wide range of hardwares and devices, and it enables mixed execution of a single model on multiple devices, optimization on various phases, and leight-weighted applications on devices. - -![img](https://github.com/Superjomn/_tmp_images/raw/master/images/paddle-lite-architecture.png) - -As is shown in the figure above, analysis phase includes Machine IR module, and it enables optimizations like Op fusion and redundant computation pruning. Besides, excecution phase only involves Kernal exevution, so it can be deployed on its own to ensure maximized light-weighted deployment. - -## Key Info about the Update - -The earlier Paddle-Mobile was designed to be compatible with PaddlePaddle and multiple hardwares, including ARM CPU, Mali GPU, Adreno GPU, FPGA, ARM-Linux and Apple's GPU Metal. Within Baidu, inc, many product lines have been using Paddle-Mobile.
For more details, please see: [mobile/README](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/mobile/README.md). - -As an update of Paddle-Mobile, Paddle Lite has incorporated many older capabilities into the [new architecture](https://github.com/PaddlePaddle/Paddle-Lite/tree/develop/lite). For the time being, the code of Paddle-mobile will be kept under the directory `mobile/`, before complete transfer to Paddle Lite. - -For demands of Apple's GPU Metal and web front end inference, please see `./metal` and `./web` . These two modules will be further developed and maintained. - -## Special Thanks - -Paddle Lite has referenced the following open-source projects: - -- [ARM compute library](http://agroup.baidu.com/paddle-infer/md/article/%28https://github.com/ARM-software/ComputeLibrary%29) -- [Anakin](https://github.com/PaddlePaddle/Anakin). The optimizations under Anakin has been incorporated into Paddle Lite, and so there will not be any future updates of Anakin. As another high-performance inference project under PaddlePaddle, Anakin has been forward-looking and helpful to the making of Paddle Lite. - - -## Feedback and Community Support - -- Questions, reports, and suggestions are welcome through Github Issues! -- Forum: Opinions and questions are welcome at our [PaddlePaddle Forum](https://ai.baidu.com/forum/topic/list/168)! -- WeChat Official Account: PaddlePaddle -- QQ Group Chat: 696965088 -
- [image placeholders removed: WeChat Official Account QR code, QQ Group Chat QR code]
diff --git a/README_cn.md b/README_cn.md deleted file mode 100644 index d2111786b1..0000000000 --- a/README_cn.md +++ /dev/null @@ -1,62 +0,0 @@ -# Paddle Lite - - -[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](https://github.com/PaddlePaddle/Paddle-Lite/wiki) -[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE) - - -Paddle Lite为Paddle-Mobile的升级版,定位支持包括手机移动端在内更多场景的轻量化高效预测,支持更广泛的硬件和平台,是一个高性能、轻量级的深度学习预测引擎。在保持和PaddlePaddle无缝对接外,也兼容支持其他训练框架产出的模型。 - -完整使用文档位于 [PaddleLite Wiki](https://github.com/PaddlePaddle/Paddle-Lite/wiki) 。 - -## 特性 - -### 轻量级 -执行阶段和计算优化阶段实现良好解耦拆分,移动端可以直接部署执行阶段,无任何第三方依赖。 -包含完整的80个 Op+85个 Kernel 的动态库,对于ARMV7只有800K,ARMV8下为1.3M,并可以裁剪到更低。 -在应用部署时,载入模型即可直接预测,无需额外分析优化。 - -### 高性能 -极致的 ARM CPU 性能优化,针对不同微架构特点实现kernel的定制,最大发挥计算性能,在主流模型上展现出领先的速度优势。 -支持INT8量化计算,结合 [PaddleSlim 模型压缩工具](https://github.com/PaddlePaddle/models/tree/v1.5/PaddleSlim) 中 INT8量化训练功能,可以提供高精度高性能的预测能力。 -在Huawei NPU, FPGA上也具有有很好的性能表现。 - -最新 Benchmark 位于 [benchmark](https://github.com/PaddlePaddle/Paddle-Lite/wiki/benchmark)。 - -### 通用性 -硬件方面,Paddle Lite 的架构设计为多硬件兼容支持做了良好设计。除了支持ARM CPU、Mali GPU、Adreno GPU,还特别支持了华为 NPU,以及 FPGA 等边缘设备广泛使用的硬件。即将支持支持包括寒武纪、比特大陆等AI芯片,未来会增加对更多硬件的支持。 - -模型支持方面,Paddle Lite和PaddlePaddle训练框架的Op对齐,提供更广泛的模型支持能力。目前已严格验证18个模型85个OP的精度和性能,对视觉类模型做到了较为充分的支持,覆盖分类、检测和定位,包含了特色的OCR模型的支持。未来会持续增加更多模型的支持验证。 - -框架兼容方面:除了PaddlePaddle外,对其他训练框架也提供兼容支持。当前,支持Caffe 和 TensorFlow 训练出来的模型,通过X2Paddle (https://github.com/PaddlePaddle/X2Paddle) 转换工具实现。接下来将会对ONNX等格式模型提供兼容支持。 - -## 架构 - -PaddleLite 的架构设计着重考虑了对多硬件和平台的支持,并且强化了多个硬件在一个模型中混合执行的能力,多个层面的性能优化处理,以及对端侧应用的轻量化设计。 - -![](https://github.com/Superjomn/_tmp_images/raw/master/images/paddle-lite-architecture.png) - -其中,Analysis Phase 包括了 MIR(Machine IR) 相关模块,能够对原有的模型的计算图针对具体的硬件列表进行算子融合、计算裁剪 在内的多种优化。Execution Phase 只涉及到Kernel 的执行,且可以单独部署,以支持极致的轻量级部署。 - - -## Paddle-Mobile升级为Paddle Lite的说明 -原Paddle-Mobile作为一个致力于嵌入式平台的PaddlePaddle预测引擎,已支持多种硬件平台,包括ARM CPU、 Mali GPU、Adreno GPU,以及支持苹果设备的GPU Metal实现、ZU5、ZU9等FPGA开发板、树莓派等arm-linux开发板。在百度内已经过广泛业务场景应用验证。对应设计文档可参考: [mobile/README](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/mobile/README.md) - -Paddle-Mobile 整体升级重构并更名为Paddle Lite后,原paddle-mobile 的底层能力大部分已集成到[新架构 ](https://github.com/PaddlePaddle/Paddle-Lite/tree/develop/lite)下。作为过渡,暂时保留原Paddle-mobile代码。 主体代码位于 `mobile/` 目录中,后续一段时间会继续维护,并完成全部迁移。新功能会统一到[新架构 ](https://github.com/PaddlePaddle/Paddle-Lite/tree/develop/lite)下开发。 - -metal, web的模块相对独立,会继续在 `./metal` 和 `./web` 目录下开发和维护。对苹果设备的GPU Metal实现的需求及web前端预测需求,可以直接进入这两个目录。 - -## 致谢: -Paddle Lite 借鉴了以下开源项目: -- [ARM compute library]((https://github.com/ARM-software/ComputeLibrary)) -- [Anakin](https://github.com/PaddlePaddle/Anakin) ,Anakin对应底层的一些优化实现已被集成到Paddle Lite。Anakin作为PaddlePaddle组织下的一个高性能预测项目,极具前瞻性,对Paddle Lite有重要贡献。Anakin已和本项目实现整合。之后,Anakin不再升级。 - -## 交流与反馈 -* 欢迎您通过Github Issues来提交问题、报告与建议 -* 微信公众号:飞桨PaddlePaddle -* QQ群: 696965088 - -
- [image placeholders removed: 微信公众号 QR code, 官方技术交流QQ群 QR code]
- -* 论坛: 欢迎大家在[PaddlePaddle论坛](https://ai.baidu.com/forum/topic/list/168)分享在使用PaddlePaddle中遇到的问题和经验, 营造良好的论坛氛围

diff --git a/add_new_operation.md b/add_new_operation.md
new file mode 100644
index 0000000000..a077a20696
--- /dev/null
+++ b/add_new_operation.md
@@ -0,0 +1,189 @@

# How to add a new op

The steps below walk through adding a new op, using argmax as a worked example.

## 1. Add an OpParam struct to carry the op's inputs and outputs

- Here it is named `ArgmaxParam`.

- Add the `ArgmaxParam` struct to `paddlelite/lite/operators/op_params.h`:
  ```c++
  struct ArgmaxParam {
    lite::Tensor* X{};
    lite::Tensor* Out{};
    int Axis{0};
  };
  ```
## 2. Add the Argmax op and register it

- Create an argmax_op.h file under `paddlelite/lite/operators/`; the key code:
  ```c++
  class ArgmaxOpLite : public OpLite {
   public:
    ArgmaxOpLite() {}
    explicit ArgmaxOpLite(const std::string &op_type) : OpLite(op_type) {}
    bool CheckShape() const override;
    bool InferShape() const override;
    bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
    void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
    std::string DebugString() const override { return "argmax"; }
   private:
    mutable ArgmaxParam param_;
  };
  ```
  `ArgmaxOpLite` inherits from `OpLite` and holds an `ArgmaxParam` member. The interfaces to implement are `CheckShape()`, `InferShape()`, `AttachImpl()`, `AttachKernel()`, and `DebugString()`. `AttachKernel()` and `DebugString()` are simple enough to implement inline here.

- Create an argmax_op.cc file under `paddlelite/lite/operators/` and implement `CheckShape()`, `InferShape()`, and `AttachImpl()`. `CheckShape()` validates that the inputs are well formed, `InferShape()` derives the output dimensions from the inputs, and `AttachImpl()` binds the op's inputs and outputs. Then register argmax in argmax_op.cc; the core code:
  ```c++
  bool ArgmaxOpLite::CheckShape() const {
    CHECK_OR_FALSE(param_.X);
    CHECK_OR_FALSE(param_.Out);
    CHECK_OR_FALSE(param_.Axis < (param_.X)->dims().size());
    return true;
  }

  bool ArgmaxOpLite::InferShape() const {
    auto x_dims = param_.X->dims();
    int x_rank = x_dims.size();
    int axis = param_.Axis;
    if (axis < 0) axis += x_rank;

    std::vector<int64_t> out_dims;
    for (int64_t i = 0; i < axis; i++) {
      out_dims.push_back(x_dims[i]);
    }
    for (int64_t i = axis + 1; i < x_rank; i++) {
      out_dims.push_back(x_dims[i]);
    }

    // Set output dims
    param_.Out->Resize(lite::DDim(out_dims));
    return true;
  }

  bool ArgmaxOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) {
    auto x = op_desc.Input("X").front();
    auto out = op_desc.Output("Out").front();

    param_.X = scope->FindVar(x)->GetMutable<lite::Tensor>();
    param_.Out = scope->FindVar(out)->GetMutable<lite::Tensor>();
    param_.Axis = op_desc.GetAttr<int>("Axis");

    return true;
  }
  REGISTER_LITE_OP(argmax, paddle::lite::operators::ArgmaxOpLite);
  ```
- In `paddlelite/lite/operators/CMakeLists.txt`, add ```lite_cc_library(argmax_op SRCS argmax_op.cc DEPS ${op_DEPS})```, and append `argmax_op` to the ops list (`set(ops ...)`);
- In `paddlelite/lite/api/paddle_use_ops.h`, add ```USE_LITE_OP(argmax)```.
## 3. Add the Argmax kernel and bind it

The ARM implementation of argmax illustrates the process.

- Create an argmax_compute.h file under `paddlelite/lite/kernels/arm/`, declaring an `ArgmaxCompute` class that inherits from `KernelLite`; the key code:
  ```c++
  class ArgmaxCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
   public:
    using param_t = operators::ArgmaxParam;
    void Run() override;
    virtual ~ArgmaxCompute() = default;
  };
  ```
- Create an argmax_compute.cc file under `paddlelite/lite/kernels/arm/`, implementing mainly the `Run()` function. `Run()` calls `argmax_func()` from paddlelite/lite/arm/math/argmax.h to compute the output from the input. Finally, in argmax_compute.cc, bind argmax's inputs and outputs (every tensor-typed input parameter must be bound):
  ```c++
  void ArgmaxCompute::Run() {
    auto& param = Param<operators::ArgmaxParam>();
    lite::Tensor* input = param.X;
    lite::Tensor* output = param.Out;
    int axis = param.Axis;
    lite::arm::math::argmax_func(input, axis, output);
    return;
  }

  REGISTER_LITE_KERNEL(
      argmax, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::ArgmaxCompute, def)
      .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
      .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
      .Finalize();
  ```

- In `paddlelite/lite/kernels/arm/CMakeLists.txt`, add
  ```cmake
  lite_cc_library(argmax_compute_arm SRCS argmax_compute.cc DEPS ${lite_kernel_deps} math_arm)
  ```
  and append `argmax_compute_arm` to the `arm_kernels` list (`set(arm_kernels ...)`);
- In `paddlelite/lite/api/paddle_use_kernels.h`, add ```USE_LITE_KERNEL(argmax, kARM, kFloat, kNCHW, def)```.

## 4. Add the Argmax implementation
- Create an argmax.h file under `paddlelite/lite/arm/math/`, declaring `argmax_func()`:
  ```c++
  void argmax_func(const lite::Tensor* input, const int axis, lite::Tensor* output);
  ```
- Create an argmax.cc file under `paddlelite/lite/arm/math/`, implementing `argmax_func()`:
  ```c++
  void argmax_func(const lite::Tensor *input,
                   const int axis,
                   lite::Tensor *output) {
    auto input_ddim = input->dims();
    auto output_ddim = output->dims();

    const int size = input_ddim[axis];
    const int in_channel = input_ddim.count(axis, input_ddim.size());
    const int out_channel = output_ddim.count(axis, output_ddim.size());
    const int in_stride = input_ddim.count(axis + 1, input_ddim.size());
    const int out_stride = input_ddim.count(0, axis);

    for (int n = 0; n < out_stride; n++) {
      for (int k = 0; k < in_stride; k++) {
        const float *in_ptr = input->data<float>() + n * in_channel + k;
        std::vector<std::pair<float, int>> vec;
        vec.resize(size);
        for (int i = 0; i < size; i++) {
          vec[i] = std::make_pair(in_ptr[i * in_stride], i);
        }
        // sort
        std::partial_sort(vec.begin(),
                          vec.begin() + 1,
                          vec.end(),
                          std::greater<std::pair<float, int>>());

        // out
        float *out_ptr = output->mutable_data<float>() + n * out_channel + k;
        *out_ptr = vec[0].second;
      }
    }
  }
  ```
- In `paddlelite/lite/arm/math/CMakeLists.txt`, add argmax.cc to the `math_arm` library, and add ```#include "lite/arm/math/argmax.h"``` to paddlelite/lite/arm/math/funcs.h.

## 5. Add an Argmax unit test
- Create an argmax_compute_test.cc file under paddlelite/lite/tests/kernels, declaring and implementing an `ArgmaxComputeTester` class;
- `ArgmaxComputeTester` consists mainly of the `PrepareOpDesc`, `PrepareData`, and `RunBaseline` functions. `PrepareOpDesc` sets the op type and input/output parameters for the test, `PrepareData` initializes the input tensors, and `RunBaseline` computes a reference output from the inputs to compare against the framework's output (a sketch follows at the end of this section);
- Add the test with gtest:
  ```c++
  TEST(Argmax, precision) {
  #ifdef LITE_WITH_ARM
    LOG(INFO) << "test argmax arm";
    Place place(TARGET(kARM));

    for (int axis : {0, 1, 2, 3}) {
      for (int n : {1, 3}) {
        for (int c : {3, 6}) {
          for (int h : {9, 18}) {
            for (int w : {9, 18}) {
              std::unique_ptr<arena::TestCase> tester(
                  new ArgmaxComputeTester(place, "def", axis, n, c, h, w));
              arena::Arena arena(std::move(tester), place, 2e-5);
              arena.TestPrecision();
            }
          }
        }
      }
    }
  #endif
  }
  ```
- In paddlelite/lite/tests/kernels/CMakeLists.txt, add
  ```cmake
  lite_cc_test(test_kernel_argmax_compute SRCS argmax_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
  ```
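To make the role of `RunBaseline` concrete, here is a minimal sketch of the kind of reference computation it could perform. The test-suite types are not shown in this patch, so the helper name `argmax_baseline` and the plain `std::vector` buffers are illustrative assumptions, not the actual test code; the intent is only to show what output the framework's kernel is compared against.

```c++
#include <cstdint>
#include <vector>

// Hypothetical RunBaseline-style reference: for every position outside
// `axis`, write the index of the maximum element along `axis`. `dims` are
// the input dimensions, `in` is the contiguous input buffer, `out` receives
// one index (stored as float, like argmax_func above) per reduced position.
void argmax_baseline(const std::vector<float>& in,
                     const std::vector<int64_t>& dims,
                     int axis,
                     std::vector<float>& out) {
  int64_t outer = 1, inner = 1;
  for (int i = 0; i < axis; ++i) outer *= dims[i];
  for (size_t i = axis + 1; i < dims.size(); ++i) inner *= dims[i];
  const int64_t size = dims[axis];

  out.resize(outer * inner);
  for (int64_t n = 0; n < outer; ++n) {
    for (int64_t k = 0; k < inner; ++k) {
      // Element (n, i, k) lives at n * size * inner + i * inner + k.
      const float* ptr = in.data() + n * size * inner + k;
      int64_t max_idx = 0;
      float max_val = ptr[0];
      for (int64_t i = 1; i < size; ++i) {
        if (ptr[i * inner] > max_val) {
          max_val = ptr[i * inner];
          max_idx = i;
        }
      }
      out[n * inner + k] = static_cast<float>(max_idx);
    }
  }
}
```

The framework then checks the kernel's output against this reference within the tolerance passed to `arena::Arena` (2e-5 in the test above).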
## 6. Build and run
- From the paddlelite directory, run ```./lite/tools/ci_build.sh build_test_arm```. The script sets up a phone emulator and builds and runs every unit test (this takes a long time). If everything passes, argmax has been added successfully.

diff --git a/architecture-intro.md b/architecture-intro.md
new file mode 100644
index 0000000000..e7a705677c
--- /dev/null
+++ b/architecture-intro.md
@@ -0,0 +1,247 @@

# Paddle-Lite Developer Documentation

This document introduces, from a developer's perspective, the background needed to develop Paddle-Lite.

## Design and rationale

In recent years, deep learning inference hardware has proliferated. From phone apps to in-car devices to smart speakers, all of them need to deploy deep learning inference, and they share these requirements:

1. high performance
2. easy hardware support and extension
3. lightweight deployment

Paddle-Lite's architecture is designed directly against these requirements. Concretely:

- for high performance:
  - MIR (Machine IR) enables fine-grained analysis and optimization of complex computation graphs
  - execution-time kernels are deliberately simple, with almost no extra scheduling overhead
  - an appropriate hardware abstraction layer lets each backend implement its own scheduling within the framework
- for lightweight deployment:
  - the analysis and execution phases are split; the execution phase is a lightweight implementation that can be deployed on its own
  - lightweight Op and Kernel design
- for hardware support and extension:
  - MIR supports macro-level analysis and optimization that carries hardware and execution information
  - TypeSystem abstracts the representation of different hardware compute modes, enabling strongly typed inference over the whole computation graph and static analysis of the execution state machine

Paddle-Lite models multi-hardware, multi-mode computation (different quantization precisions, different data layouts, and so on) through strongly typed inference, so that heterogeneous hardware and compute modes can be mixed at the macro level.

The framework has already been hardened on heterogeneous hardware such as FPGA, GPU, and NPU, and its capabilities continue to mature.

## Key modules

### OpLite

[OpLite](https://github.com/PaddlePaddle/Paddle-Lite/blob/v2.0.0-beta1-prerel/lite/core/op_lite.h#L52) is the Operator in Paddle-Lite. When extending a single hardware backend, Ops and Kernels are what you extend most often.

Key methods:

```c++
class OpLite : public Registry {
 public:
  // Check the shape.
  virtual bool CheckShape() const { return true; }
  // Inference the outputs' shape.
  virtual bool InferShape() const { return true; }
  // Link the external execution environ to internal context.
  bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope);
};
```

At analysis time the framework calls

- `AttachImpl`

and at execution time

- `CheckShape`
- `InferShape`

Notes for extension:

1. `CheckShape` runs only on the first batch, so its cost is not critical

2. `InferShape` runs on every batch and must be strictly cheap

   1. part of the result can be cached in member variables, for example (pseudocode; `ComputeOutputShape` is an illustrative helper standing in for the full shape inference):

   ```c++
   class XXOp : public OpLite {
     void InferShape() {
       int batch_size = param().input.shape[0];
       if (shape_cache_.empty()) {
         // Full shape inference, performed once on the first batch.
         shape_cache_ = ComputeOutputShape(param());
       }
       // Only the batch dimension changes on subsequent batches.
       shape_cache_[0] = batch_size;
       param().output->Resize(shape_cache_);
     }

    private:
     shape_t shape_cache_;
   };
   ```
### OpParam

[OpParam](https://github.com/PaddlePaddle/Paddle-Lite/blob/v2.0.0-beta1-prerel/lite/operators/op_params.h) stores the parameters a Kernel needs at execution time. Every field can be stored directly (for example a pointer or an `int`), so that no parameter lookup delays execution.

Since there has been no need for one, OpParam currently has no base class.

A real example:

```c++
// For Softmax op
struct SoftmaxParam {
  lite::Tensor* x{};
  lite::Tensor* output{};
  int axis{-1};
};
```

OpLite's `AttachImpl` method builds the `OpParam`, which is copied and handed to the `Kernel` for execution.

OpParam is a critical execution-time module, so performance must be guarded strictly. Extension requirements:

1. field access must be low-latency: use pointers directly, or copy values
2. keep execution-irrelevant information out, including debug information
3. naming must match the corresponding Paddle OpDesc exactly, to make alignment and comprehension easy

### Kernel

```c++
template <TargetType Target,
          PrecisionType Precision,
          DataLayoutType DataLayout = DataLayoutType::kNCHW>
class KernelLite : public KernelBase {
 public:
  // Run the kernel.
  virtual void Run() { CHECK(false) << "Not Implemented"; }

  TargetType target() const override { return Target; }
  PrecisionType precision() const override { return Precision; }
  DataLayoutType layout() const override { return DataLayout; }
  Place place() const override { return Place{Target, Precision, DataLayout}; }
  std::string name() const override;
};
```

Being a core execution-time concept, Kernel is designed to be extremely simple and efficient.

`Run` is its only significant interface; it contains the actual computation.

The template parameters mainly support multi-hardware compilation and serve as self-description:

- Target: the hardware it executes on
- Precision: its main computation precision
- DataLayout: its main data layout

This information helps select kernels; the exact values are not strict.

Registering a Kernel relies on the TypeSystem: the registration describes not only the kernel itself but also each of its inputs and outputs in full.

For example, the FullyConnected registration:

```c++
REGISTER_LITE_KERNEL(
    fc, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::FcCompute, def)
    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat), LAYOUT(kNCHW))})
    .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindInput("W", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
    .Finalize();
```

The kernel itself is declared as `kARM`, that is, an ARM kernel, with main computation precision `kFloat` and main data layout `kNCHW`.

Then every input and output is described precisely. For example, the `Input` slot is declared as `LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat), LAYOUT(kNCHW))`: its Target is `kARM`, its precision is `kFloat`, and its data layout is `kNCHW`.

The design mirrors C++ function overloading: the same kernel (name), overloaded with different input/output types, becomes a different kernel.

#### Notes for extension

1. pick the dominant compute mode for the template parameters
   1. for example, a scale kernel that accepts both `float` and `int` input but is not a quantized kernel should use `Precision=float`, marking it for use at regular precision
2. input/output declarations must be precise: declare exactly the types they are. The framework builds its state machine dynamically from these declarations; imprecise ones make the analysis-time and execution-time state machines disagree, causing undefined behavior

### MIR

MIR is similar to LLVM IR, but carries hardware and execution-time information that participates in analysis and optimization.

A Pass is MIR's unit of modular strategy; its input and output are both an SSA Graph.

The framework automatically builds an SSA Graph from the model's Program, then invokes a series of Passes in the order defined in [Optimizer](https://github.com/PaddlePaddle/Paddle-Lite/blob/v2.0.0-beta1-prerel/lite/core/optimizer.h).

#### Op Fusion

The [PatternMatcher](https://github.com/PaddlePaddle/Paddle-Lite/blob/v2.0.0-beta1-prerel/lite/core/mir/pattern_matcher.h) in MIR implements a simple, effective graph-based pattern-matching algorithm; op-fusion graph rewrites can be built on top of it.

For a real example see [fc_fuse_pass.h](https://github.com/PaddlePaddle/Paddle-Lite/blob/v2.0.0-beta1-prerel/lite/core/mir/fusion/fc_fuse_pass.h); a minimal sketch of such a pass follows below.
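For orientation, the following sketches the overall shape of a fusion pass built on the pattern matcher. It is modeled loosely on fc_fuse_pass but abbreviated; the base-class name `ProgramPass`, the `REGISTER_MIR_PASS` macro, and the include paths are assumptions to be checked against the linked sources, and the pass body is comments rather than real matcher calls.

```c++
#include <memory>
#include "lite/core/mir/pass.h"           // ProgramPass (assumed location)
#include "lite/core/mir/pass_registry.h"  // REGISTER_MIR_PASS (assumed location)

namespace paddle {
namespace lite {
namespace mir {

// Sketch: rewrite every (mul -> elementwise_add) pair in the graph into a
// single fc op, in the spirit of fc_fuse_pass.
class MyFcFusePass : public ProgramPass {
 public:
  void Apply(const std::unique_ptr<SSAGraph>& graph) override {
    // 1. Describe the subgraph to look for using the PatternMatcher API.
    // 2. For every match, create the fused op and rewire its inputs/outputs.
    // 3. Erase the now-dead nodes from the graph.
  }
};

}  // namespace mir
}  // namespace lite
}  // namespace paddle

// Registration makes the pass visible to the Optimizer's pass list.
REGISTER_MIR_PASS(my_fc_fuse_pass, paddle::lite::mir::MyFcFusePass);
```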
### TypeSystem

TypeSystem is the foundation on which Paddle-Lite builds complex computation graphs. The core idea is to help the SSA Graph form a state machine, where Types represent the different states.

A Type currently bundles four pieces of information, extensible as needed:

- TargetType
- Precision
- DataLayout
- device id, identifying the card/device index

The state machine looks like:

```python
Tensor0(kARM, kFloat, kNCHW) --pass--> Tensor1(kOpenCL, kFloat, kNCHW)
```

MIR detects that Tensor0 and Tensor1 live on different hardware, so the corresponding pass inserts a cast op to perform the type cast, for example:

```
Tensor0(kARM, kFloat, kNCHW) --pass-> IoCopyOp(kARM, kOpenCL) --pass-> Tensor1(kOpenCL, kFloat, kNCHW)
```

### KernelContext

KernelContext is the core encapsulation for hardware support; it provides Kernels with their execution-time hardware context.

KernelContext is designed like OpParam: neither has a base class. For KernelContext, the assumption is that interfaces and logic may differ completely between backends, for example kARM versus kCUDA, so no base class is imposed and no uniform interface is required to wrap the different hardware behaviors.

Each hardware's KernelContext talks directly to the Kernels of that hardware.

A KernelContext's behavior can be determined and scheduled by MIR during analysis.

Notes:

1. as an execution-time concept, KernelContext must also stay fast and lightweight
2. mobile deployments ship only the execution phase, so MIR and KernelContext are split apart; the relevant KernelContext settings must therefore be serializable into the ProgramDesc so the execution phase can load and run them

## Extending hardware backends

### Extending an existing backend

This is mostly a matter of adding Ops and Kernels; if fusion is needed, see the MIR section and add a corresponding fuse pass. Concretely, refer to

- [fc_op](https://github.com/PaddlePaddle/Paddle-Lite/blob/release/v2.0.0-beta1/lite/operators/fc_op.h) to implement a similar Op
- [fc_compute](https://github.com/PaddlePaddle/Paddle-Lite/blob/release/v2.0.0-beta1/lite/kernels/arm/fc_compute.h) to implement a similar Kernel
- [fc_fuse_pass](https://github.com/PaddlePaddle/Paddle-Lite/blob/release/v2.0.0-beta1/lite/core/mir/fusion/fc_fuse_pass.h) to implement the fuse logic, registered into [optimizer](https://github.com/PaddlePaddle/Paddle-Lite/blob/release/v2.0.0-beta1/lite/core/optimizer.h)

### Adding a brand-new backend

The following modules must additionally be extended so the framework can drive the hardware:

- TypeSystem: extend the relevant types
  - the relevant [enum](https://github.com/PaddlePaddle/Paddle-Lite/blob/release/v2.0.0-beta1/lite/api/paddle_place.h#L44)
- MIR: extend the type-cast passes
  - the [TargetType cast pass](https://github.com/PaddlePaddle/Paddle-Lite/blob/release/v2.0.0-beta1/lite/core/mir/type_target_cast_pass.cc) copies tensors across hardware
  - the [Data layout cast pass](https://github.com/PaddlePaddle/Paddle-Lite/blob/release/v2.0.0-beta1/lite/core/mir/type_target_cast_pass.h) converts between data layouts
  - the [Precision cast pass](https://github.com/PaddlePaddle/Paddle-Lite/blob/release/v2.0.0-beta1/lite/core/mir/type_precision_cast_pass.h) converts tensors between quantization precisions
- KernelContext; for reference see
  - the [ARM context](https://github.com/PaddlePaddle/Paddle-Lite/blob/release/v2.0.0-beta1/lite/core/context.h#L91)
  - note that a hardware context's interface serves only that hardware's kernels
  - a context spans both the analysis phase and the execution phase; if the analysis phase performs no special optimization, it can be ignored. Otherwise, the analysis-time information must be gathered and serialized into the offline model so the execution phase can load it directly.
\ No newline at end of file

diff --git a/architecture.md b/architecture.md
new file mode 100644
index 0000000000..fbcd9b05eb
--- /dev/null
+++ b/architecture.md
@@ -0,0 +1,94 @@

# Architecture Design

In upgrading Mobile to the Lite architecture, the emphasis is on multi-hardware support and high performance. The main design ideas:

- introduce a Type system to strengthen mixed scheduling across hardware, quantization schemes, and data layouts
- isolate hardware details, so any supported hardware can be freely plugged in or out via build switches
- introduce the concept of MIR (Machine IR) to strengthen optimizations that are aware of the execution environment
- strictly separate the optimization phase from the execution phase, keeping inference lightweight and efficient

The architecture diagram:

![Paddle Inference Refactor1.0](./images/architecture.jpg)

## Strict separation of compile time and execution time

- once compile-time optimization finishes, the optimization results can be stored into the model; execution time loads and runs it
- two API sets with matching inference libs cover different scenarios
  - `CxxPredictor` packages both `Compile Time` and `Execution Time`, so analysis and optimization can happen at runtime on the concrete hardware for the best result
  - `MobilePredictor` packages only `Execution Time`, keeping deployment and execution lightweight

## Lightweight `Execution Time` design and implementation

- each batch executes only two steps (see the sketch below)
  - `Op.InferShape`
  - `Kernel.Run`; all kernel parameters are resolved to pointers ahead of time, so no lookup or argument-passing cost remains
  - design goal: at execution time, only the kernel computation itself costs anything
- lightweight `Op` and `Kernel` design avoids framework overhead
  - an `Op` has only two significant duties: `CreateKernels` and `InferShape`
  - a `Kernel` has only `Run`
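As a rough illustration of the two-step contract above, a per-batch run loop reduces to something like the following. The `Instruction` struct and member names are illustrative stand-ins, not the engine's actual types; the point is that nothing but shape inference and kernel compute remains on the hot path.

```c++
#include <vector>

// Illustrative stand-ins for the engine's op/kernel interfaces.
struct OpLite     { virtual void InferShape() = 0; virtual ~OpLite() = default; };
struct KernelLite { virtual void Run() = 0;        virtual ~KernelLite() = default; };

// One (op, kernel) pair selected and bound at optimization time.
struct Instruction {
  OpLite* op;          // responsible for shape inference only
  KernelLite* kernel;  // owns the computation; params already bound as pointers
};

// Per-batch execution: no scheduling, lookup, or argument passing remains,
// so the cost is essentially the kernels' compute time.
void RunOneBatch(std::vector<Instruction>& program) {
  for (auto& inst : program) {
    inst.op->InferShape();  // cheap: should reuse cached shapes where possible
    inst.kernel->Run();     // pure computation
  }
}
```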
## Multi-hardware backend support

- hardware-generic behavior is adapted through the `TargetWrapper` module, which presents a uniform interface to the upper layers of the framework
- upper-level framework strategies stay hardware-agnostic, such as memory optimization and computation pruning, so any newly added hardware reuses them directly
- the framework standardizes generic hardware behavior and imposes few constraints on hardware-specific details; each backend implements those itself and plugs into the framework
- two mainstream compute models are currently supported: non-heterogeneous devices such as X86 and ARM CPUs, and heterogeneous devices such as GPUs and FPGAs (with stream/event asynchronous execution and cross-device copies)

---
## Mixed scheduling across hardware and algorithms

`TensorTy` describes a Tensor's type:

```c++
struct TensorTy {
  TargetType target;
  PrecisionType precision;
  DataLayout layout;
  int deviceid;
};
```

```c++
enum class TargetType { kARM, kX86, kCUDA, kOpenCL };
enum class PrecisionType { kFP32, kFP16, kInt8, kInt16 };
enum class DataLayout { kNCHW, kNHWC };
```
---

Registering a Kernel declares the input/output characteristics of that particular kernel:

```c++
REGISTER_LITE_KERNEL(
    mul, kARM, kFloat, kNCHW, arm::MulCompute, def)
    .BindInput("X", {LiteType::GetTensorTy(kARM, kFloat, kNCHW)})
    .BindInput("Y", {LiteType::GetTensorTy(kARM, kFloat, kNCHW)})
    .BindOutput("Out", {LiteType::GetTensorTy(kARM, kFloat, kNCHW)})
    .Finalize();
```

---

Different kernels of the same Op behave like C++ function overloads.

This supports arbitrary mixed scheduling:

1. tag the Type of every tensor in the model
2. tag each Kernel's hardware, execution precision, data layout, and so on

Type inference then runs globally; whenever a type conflict appears along a tensor's path, a type cast is applied by inserting a special-purpose Op so that values propagate correctly.

![lite-7](images/lite1.png)



---

## MIR for graph analysis and optimization

SSA built on the Type System; IR Passes analyze and optimize the computation graph:

- whole-graph type inference; detected type conflicts are resolved by inserting type-cast ops, enabling generic mixed scheduling
- computation pruning (Compute prune), for example removing scale(1) and assign ops
- memory optimization (Memory optimize)
- operator fusion (Operator fuse; six fusion strategies supported so far, including fc, conv_bn, and ele_add+act)
- quantization support (Int8 inference already supported)
\ No newline at end of file

diff --git a/benchmark.md b/benchmark.md
new file mode 100644
index 0000000000..8125e02218
--- /dev/null
+++ b/benchmark.md
@@ -0,0 +1,162 @@

# Benchmark

See [benchmark_tools](https://github.com/PaddlePaddle/Paddle-Lite/wiki/benchmark_tools); the **one-click benchmark** is recommended.

## Test environment

* Test models
  * fp32 models
    * mobilenet_v1
    * mobilenet_v2
    * squeezenet_v1.1
    * mnasnet
    * shufflenet_v2

  * int8 models
    * mobilenet_v1
    * mobilenet_v2
    * resnet50

* Test devices (Android NDK ndk-r17c)
  * Snapdragon 855
    * Xiaomi Mi 9, Snapdragon 855
    * 4xA76 (1@2.84GHz + 3@2.4GHz) + 4xA55@1.78GHz

  * Snapdragon 845
    * Xiaomi Mi 8, Snapdragon 845
    * 2.8GHz (big quad-core), 1.7GHz (little quad-core)

  * Snapdragon 835
    * Xiaomi MIX 2, Snapdragon 835
    * 2.45GHz (big quad-core), 1.9GHz (little quad-core)

  * Snapdragon 625
    * OPPO R9s, Snapdragon 625
    * A53 x 8, big cores @2.0GHz

  * Snapdragon 653
    * 360 N5, Snapdragon 653
    * 4 x A73@2.0GHz + 4 x A53@1.4GHz

  * Kirin 970
    * HUAWEI Mate 10

* Test notes
  * commit id: 12c129affaacd476e27a0a82b235a9d547d33f0f
  * warmup=10, repeats=30; the reported time is the average, in ms
  * with one thread, ```DeviceInfo::Global().SetRunMode``` is set to LITE_POWER_HIGH; otherwise LITE_POWER_NO_BIND (a setup sketch follows below)
  * the input image has dimensions {1, 3, 224, 224}, with every value set to 1
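For reference, binding the power mode and thread count as described in the notes looks roughly like this in C++. The include paths and the surrounding predictor setup are assumptions (only the `DeviceInfo::Global().SetRunMode` call is named by the notes above), so treat this as a sketch rather than the benchmark binary's actual code.

```c++
#include "lite/core/device_info.h"  // DeviceInfo (assumed include path)

void ConfigureForBenchmark(int num_threads) {
  using paddle::lite_api::PowerMode;
  // Per the test notes: bind to the big core when single-threaded,
  // otherwise leave scheduling unbound.
  PowerMode mode = (num_threads == 1)
                       ? paddle::lite_api::LITE_POWER_HIGH
                       : paddle::lite_api::LITE_POWER_NO_BIND;
  paddle::lite::DeviceInfo::Global().SetRunMode(mode, num_threads);
  // ... create the predictor, fill the {1, 3, 224, 224} input with 1.f,
  // run warmup=10 untimed iterations, then average over repeats=30.
}
```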
## Test data

### fp32 model results

| Snapdragon 855 | armv8 | | | armv7 | | |
| ---- | ---- | ---- | ---- | ---- | ---- | ---- |
| num_threads | 1 | 2 | 4 | 1 | 2 | 4 |
| mobilenet_v1 | 31.64 | 18.98 | 10.67 | 33.17 | 19.55 | 11.43 |
| mobilenet_v2 | 25.54 | 13.80 | 8.75 | 29.25 | 15.19 | 9.65 |
| squeezenet_v1.1 | 26.81 | 14.39 | 8.92 | 28.63 | 15.37 | 9.53 |
| mnasnet | 25.39 | 13.89 | 9.63 | 28.97 | 15.54 | 10.10 |
| shufflenet_v2 | 13.85 | 7.81 | 5.87 | 14.64 | 8.35 | 6.14 |

| Snapdragon 845 | armv8 | | | armv7 | | |
| ---- | ---- | ---- | ---- | ---- | ---- | ---- |
| num_threads | 1 | 2 | 4 | 1 | 2 | 4 |
| mobilenet_v1 | 62.04 | 33.63 | 18.63 | 66.23 | 35.78 | 20.14 |
| mobilenet_v2 | 40.41 | 22.94 | 13.33 | 44.22 | 24.58 | 14.50 |
| squeezenet_v1.1 | 49.92 | 23.78 | 13.86 | 52.00 | 24.85 | 15.87 |
| mnasnet | 40.14 | 23.36 | 14.46 | 43.77 | 24.78 | 14.76 |
| shufflenet_v2 | 22.27 | 13.69 | 8.96 | 26.11 | 14.95 | 9.02 |

| Snapdragon 835 | armv8 | | | armv7 | | |
| ---- | ---- | ---- | ---- | ---- | ---- | ---- |
| num_threads | 1 | 2 | 4 | 1 | 2 | 4 |
| mobilenet_v1 | 89.57 | 50.88 | 27.62 | 96.11 | 53.18 | 31.99 |
| mobilenet_v2 | 59.92 | 33.93 | 20.91 | 64.04 | 36.85 | 23.10 |
| squeezenet_v1.1 | 65.25 | 37.92 | 23.40 | 74.87 | 40.96 | 23.69 |
| mnasnet | 60.97 | 35.04 | 22.40 | 64.88 | 37.90 | 24.53 |
| shufflenet_v2 | 30.87 | 19.33 | 12.78 | 31.71 | 19.52 | 13.25 |

| Snapdragon 625 | armv8 | | | armv7 | | |
| ---- | ---- | ---- | ---- | ---- | ---- | ---- |
| num_threads | 1 | 2 | 4 | 1 | 2 | 4 |
| mobilenet_v1 | 180.98 | 92.27 | 51.51 | 216.12 | 110.33 | 61.68 |
| mobilenet_v2 | 132.46 | 68.38 | 43.54 | 146.18 | 76.62 | 46.21 |
| squeezenet_v1.1 | 124.49 | 66.84 | 41.53 | 153.28 | 82.42 | 47.14 |
| mnasnet | 122.50 | 67.46 | 43.04 | 146.20 | 79.64 | 48.56 |
| shufflenet_v2 | 68.70 | 40.77 | 26.53 | 75.38 | 42.40 | 28.36 |

| Snapdragon 653 | armv8 | | | armv7 | | |
| ---- | ---- | ---- | ---- | ---- | ---- | ---- |
| num_threads | 1 | 2 | 4 | 1 | 2 | 4 |
| mobilenet_v1 | 121.27 | 59.36 | 34.06 | 126.55 | 64.96 | 39.23 |
| mobilenet_v2 | 79.48 | 46.17 | 27.81 | 87.93 | 48.28 | 31.87 |
| squeezenet_v1.1 | 81.10 | 42.66 | 42.07 | 82.29 | 45.88 | 28.84 |
| mnasnet | 75.60 | 44.22 | 30.16 | 82.99 | 49.07 | 32.34 |
| shufflenet_v2 | 39.18 | 23.54 | 16.73 | 40.12 | 24.76 | 17.68 |

| Kirin 970 | armv8 | | | armv7 | | |
| ---- | ---- | ---- | ---- | ---- | ---- | ---- |
| num_threads | 1 | 2 | 4 | 1 | 2 | 4 |
| mobilenet_v1 | 99.58 | 56.91 | 29.02 | 102.42 | 57.81 | 35.36 |
| mobilenet_v2 | 69.22 | 42.41 | 23.55 | 69.49 | 43.38 | 25.26 |
| squeezenet_v1.1 | 67.48 | 41.06 | 24.47 | 75.03 | 43.57 | 26.35 |
| mnasnet | 74.55 | 43.06 | 24.22 | 75.48 | 44.43 | 26.69 |
| shufflenet_v2 | 39.20 | 24.54 | 16.34 | 37.40 | 24.32 | 16.66 |

### int8 model results

| Snapdragon 855 | armv8 | | | armv7 | | |
| ---- | ---- | ---- | ---- | ---- | ---- | ---- |
| num_threads | 1 | 2 | 4 | 1 | 2 | 4 |
| mobilenet_v1_int8 | 16.77 | 8.38 | 4.59 | 43.42 | 20.80 | 10.89 |
| mobilenet_v2_int8 | 22.81 | 13.71 | 10.43 | 29.65 | 20.09 | 13.99 |
| resnet50_int8 | 258.83 | 157.22 | 85.83 | 424.99 | 209.37 | 112.32 |

| Snapdragon 845 | armv8 | | | armv7 | | |
| ---- | ---- | ---- | ---- | ---- | ---- | ---- |
| num_threads | 1 | 2 | 4 | 1 | 2 | 4 |
| mobilenet_v1_int8 | 44.08 | 23.75 | 12.52 | 49.19 | 26.77 | 13.82 |
| mobilenet_v2_int8 | 36.61 | 22.70 | 15.29 | 40.51 | 25.84 | 17.89 |
| resnet50_int8 | 399.64 | 217.74 | 112.86 | 408.80 | 224.72 | 122.15 |

| Snapdragon 835 | armv8 | | | armv7 | | |
| ---- | ---- | ---- | ---- | ---- | ---- | ---- |
| num_threads | 1 | 2 | 4 | 1 | 2 | 4 |
| mobilenet_v1_int8 | 59.99 | 31.59 | 16.55 | 62.92 | 33.33 | 17.38 |
| mobilenet_v2_int8 | 50.68 | 31.25 | 21.62 | 52.56 | 33.88 | 24.31 |
| resnet50_int8 | 498.85 | 267.65 | 146.03 | 510.54 | 278.77 | 155.05 |

| Snapdragon 625 | armv8 | | | armv7 | | |
| ---- | ---- | ---- | ---- | ---- | ---- | ---- |
| num_threads | 1 | 2 | 4 | 1 | 2 | 4 |
| mobilenet_v1_int8 | 122.86 | 63.52 | 33.91 | 125.77 | 64.78 | 34.25 |
| mobilenet_v2_int8 | 110.71 | 67.76 | 49.85 | 114.63 | 71.74 | 51.73 |
| resnet50_int8 | 954.67 | 505.78 | 286.64 | 1016.64 | 532.84 | 305.20 |

| Snapdragon 653 | armv8 | | | armv7 | | |
| ---- | ---- | ---- | ---- | ---- | ---- | ---- |
| num_threads | 1 | 2 | 4 | 1 | 2 | 4 |
| mobilenet_v1_int8 | 81.46 | 42.99 | 31.69 | 81.20 | 42.46 | 23.47 |
| mobilenet_v2_int8 | 68.39 | 43.47 | 32.03 | 69.40 | 44.47 | 33.46 |
| resnet50_int8 | 687.59 | 369.70 | 208.99 | 684.55 | 369.04 | 208.42 |

| Kirin 970 | armv8 | | | armv7 | | |
| ---- | ---- | ---- | ---- | ---- | ---- | ---- |
| num_threads | 1 | 2 | 4 | 1 | 2 | 4 |
| mobilenet_v1_int8 | 64.27 | 35.48 | 18.76 | 64.63 | 37.67 | 20.70 |
| mobilenet_v2_int8 | 64.54 | 36.76 | 22.17 | 68.80 | 38.85 | 24.30 |
| resnet50_int8 | 509.94 | 268.95 | 276.13 | 520.57 | 281.92 | 157.82 |

diff --git a/benchmark_tools.md b/benchmark_tools.md
new file mode 100644
index 0000000000..8148f712f0
--- /dev/null
+++ b/benchmark_tools.md
@@ -0,0 +1,196 @@

 * [Benchmark](#benchmark)
 * [Environment Setup](#environment-setup)
 * [1. One-click Benchmark](#1-one-click-benchmark)
 * [2. Step-by-step Benchmark](#2-step-by-step-benchmark)
   * [1. Get the benchmark binary](#1-get-the-benchmark-binary)
   * [2. Download the models](#2-download-the-models)
   * [3. The benchmark.sh script](#3-the-benchmarksh-script)
   * [4. Run the test](#4-run-the-test)


# Benchmark

This article describes how to benchmark Paddle-Lite on an Android phone from a terminal, in an **Ubuntu 16.04 cross-compilation environment**, using two approaches:

1. **One-click benchmark**: for users who want quick numbers for common models, using prebuilt benchmark binaries;
2. **Step-by-step benchmark**: the one-click flow broken down into individual steps.

# Environment Setup

1. Install [adb](https://developer.android.com/studio/command-line/adb) and the other prerequisites:
```shell
sudo apt update
sudo apt install -y wget adb
```
2. Check the phone-computer connection. Connect the Android phone over USB, then open Settings -> enable Developer Mode -> enable USB debugging -> allow (authorize) this computer to debug the phone;
3. Run the `adb devices` command in a terminal on the computer to list the connected devices:
```shell
adb devices
```
On success, the output looks like the following (serial numbers will differ):
```shell
List of devices attached
712QSDSEMMS7C device
```
## 1. One-click Benchmark

Run the following to complete the benchmark:

```shell
wget -c https://paddle-inference-dist.bj.bcebos.com/PaddleLite/run_benchmark.sh
sh run_benchmark.sh
```

The `run_benchmark.sh` script will:

1. download the models and push them to the phone: mobilenetv1/v2, shufflenetv2, squeezenetv1.1, and mnasnet;
2. download the prebuilt android-armv7 and android-armv8 binaries and push them to the phone: `benchmark_bin_v7` and `benchmark_bin_v8`;
3. automatically run a second script, `benchmark.sh` (with several phones connected over USB, append the target phone's `serial number` to the `adb` commands inside `benchmark.sh`);
4. pull the benchmark results `result_armv7.txt` and `result_armv8.txt` from the phone into the current directory and print them.

## 2. Step-by-step Benchmark

### 1. Get the benchmark binary

The benchmark_bin binary measures PaddleLite's performance; it can be obtained in either of two ways.

#### Option 1: download the prebuilt benchmark_bin

```shell
# Download benchmark_bin for android-armv7
wget -c https://paddle-inference-dist.bj.bcebos.com/PaddleLite/benchmark_bin_v7

# Download benchmark_bin for android-armv8
wget -c https://paddle-inference-dist.bj.bcebos.com/PaddleLite/benchmark_bin_v8
```

#### Option 2: build benchmark_bin from source

Set up a build environment per [source_compile](./source_compile), check out the latest PaddleLite release, and run the following from the repository root:

```shell
###########################################
#  Build benchmark_bin for android-armv7  #
###########################################
./lite/tools/ci_build.sh \
  --arm_os="android" \
  --arm_abi="armv7" \
  --arm_lang="gcc " \
  build_arm

# build result see: /build.lite.android.armv7.gcc/lite/api/benchmark_bin

###########################################
#  Build benchmark_bin for android-armv8  #
###########################################
./lite/tools/ci_build.sh \
  --arm_os="android" \
  --arm_abi="armv8" \
  --arm_lang="gcc " \
  build_arm

# build result see: /build.lite.android.armv8.gcc/lite/api/benchmark_bin
```

> **Note**: to avoid problems reaching the phone from inside docker, it is recommended to build benchmark_bin, leave docker, and copy the binary into a temporary directory. Then download the models, copy the script, and test from that temporary directory following the steps below.

### 2. Download the models

PaddleLite provides [common benchmark models](https://paddle-inference-dist.bj.bcebos.com/PaddleLite/benchmark_models.tar.gz).

Run the following to download and extract them:

```shell
wget -c https://paddle-inference-dist.bj.bcebos.com/PaddleLite/benchmark_models.tar.gz
tar zxvf benchmark_models.tar.gz
```

| Model | Download link |
| --------------- | ------------------------------------------------------------ |
| MobilenetV1 | [download](https://paddle-inference-dist.bj.bcebos.com/PaddleLite/mobilenet_v1.tar.gz) |
| MobilenetV2 | [download](https://paddle-inference-dist.bj.bcebos.com/PaddleLite/mobilenet_v2.tar.gz) |
| ShufflenetV2 | [download](https://paddle-inference-dist.bj.bcebos.com/PaddleLite/shufflenet_v2.tar.gz) |
| Squeezenet_V1.1 | [download](https://paddle-inference-dist.bj.bcebos.com/PaddleLite/squeezenet_v11.tar.gz) |
| Mnasnet | [download](https://paddle-inference-dist.bj.bcebos.com/PaddleLite/mnasnet.tar.gz) |

> Note: to use the test script **on a single model**, put just that model inside the `benchmark_models` folder, and keep the script and the `benchmark_models` folder in the same directory (a worked example follows below).

Note: all of the models above have already been converted with `model_optimize_tool`, and the Lite mobile runtime only loads converted models. To test other models, first see [model conversion](./model_optimize_tool).
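As a concrete example of the single-model note above, the following shell sketch benchmarks only mobilenet_v1. The directory layout is the one the note prescribes; the download URL and script invocation mirror the commands elsewhere in this article.

```shell
# Keep only one model inside benchmark_models, next to the script and binary.
wget -c https://paddle-inference-dist.bj.bcebos.com/PaddleLite/mobilenet_v1.tar.gz
rm -rf benchmark_models && mkdir benchmark_models
tar zxvf mobilenet_v1.tar.gz -C benchmark_models

# Run the armv8 benchmark against just this model (see sections 3 and 4 below).
sh benchmark.sh ./benchmark_bin_v8 ./benchmark_models result_armv8.txt
cat result_armv8.txt
```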
测试 + +从终端进入benchmark.sh、可执行文件(benchmark_bin_v7、benchmark_bin_v8)和模型文件(benchmark_models)所在文件夹。 + +运行 benchmark.sh 脚本执行测试 + +```shell +# Benchmark for android-armv7 +sh benchmark.sh ./benchmark_bin_v7 ./benchmark_models result_armv7.txt + +# Benchmark for android-armv8 +sh benchmark.sh ./benchmark_bin_v8 ./benchmark_models result_armv8.txt +``` +测试结束后,armv7和armv8的结果,分别保存在当前目录下的`result_armv7.txt`和`result_armv8.txt`文件中。 + +**查看测试结果** + +在当前目录的`result_armv7.txt`和`result_armv8.txt`文件,查看测试结果。 + +```shell +run benchmark armv7 +-------------------------------------- +PaddleLite Benchmark +Threads=1 Warmup=10 Repeats=30 +-- mnasnet avg = 159.8427 ms +-- mobilenet_v1 avg = 235.0072 ms +-- mobilenet_v2 avg = 173.0387 ms +-- shufflenet_v2 avg = 76.0040 ms +-- squeezenet_v11 avg = 164.2957 ms + +Threads=2 Warmup=10 Repeats=30 +-- mnasnet avg = 83.1287 ms +-- mobilenet_v1 avg = 121.6029 ms +-- mobilenet_v2 avg = 86.6175 ms +-- shufflenet_v2 avg = 41.5761 ms +-- squeezenet_v11 avg = 87.8678 ms + +Threads=4 Warmup=10 Repeats=30 +-- mnasnet avg = 73.3880 ms +-- mobilenet_v1 avg = 119.0739 ms +-- mobilenet_v2 avg = 85.3050 ms +-- shufflenet_v2 avg = 38.0762 ms +-- squeezenet_v11 avg = 64.2201 ms +-------------------------------------- + +run benchmark armv8 +-------------------------------------- +PaddleLite Benchmark +Threads=1 Warmup=10 Repeats=30 +-- mnasnet avg = 165.3073 ms +-- mobilenet_v1 avg = 306.0188 ms +-- mobilenet_v2 avg = 195.1884 ms +-- shufflenet_v2 avg = 99.3692 ms +-- squeezenet_v11 avg = 156.6971 ms + +Threads=2 Warmup=10 Repeats=30 +-- mnasnet avg = 90.2290 ms +-- mobilenet_v1 avg = 157.0007 ms +-- mobilenet_v2 avg = 118.1607 ms +-- shufflenet_v2 avg = 68.6804 ms +-- squeezenet_v11 avg = 91.3090 ms + +Threads=4 Warmup=10 Repeats=30 +-- mnasnet avg = 179.9730 ms +-- mobilenet_v1 avg = 204.0684 ms +-- mobilenet_v2 avg = 181.6486 ms +-- shufflenet_v2 avg = 123.2728 ms +-- squeezenet_v11 avg = 412.9046 ms +-------------------------------------- +``` \ No newline at end of file diff --git a/benchmark_tools.md.toc.2019-08-25_233116 b/benchmark_tools.md.toc.2019-08-25_233116 new file mode 100644 index 0000000000..6fbec144e8 --- /dev/null +++ b/benchmark_tools.md.toc.2019-08-25_233116 @@ -0,0 +1,11 @@ + * [Benchmark 测试方法](#benchmark-测试方法) + * [1. 一键Benchmark](#1-一键benchmark) + * [2. 逐步测试说明](#2-逐步测试说明) + * [1. benchmark可执行文件](#1-benchmark可执行文件) + * [2. 下载模型](#2-下载模型) + * [3. benchmark.sh 脚本](#3-benchmarksh-脚本) + * [4. 测试](#4-测试) + * [3. 完整实例](#3-完整实例) + + + diff --git a/benchmark_tools.md.toc.2019-08-25_233528 b/benchmark_tools.md.toc.2019-08-25_233528 new file mode 100644 index 0000000000..238a7cb053 --- /dev/null +++ b/benchmark_tools.md.toc.2019-08-25_233528 @@ -0,0 +1,11 @@ + * [Benchmark 测试方法](#benchmark-测试方法) + * [1. 一键Benchmark](#1-一键benchmark) + * [2. 逐步测试说明](#2-逐步测试说明) + * [1. benchmark可执行文件](#1-benchmark可执行文件) + * [2. 下载模型](#2-下载模型) + * [3. benchmark.sh 脚本](#3-benchmarksh-脚本) + * [4. 测试](#4-测试) + * [3. 完整实例](#3-完整实例) + + + diff --git a/cmake/FindGflags.cmake b/cmake/FindGflags.cmake deleted file mode 100644 index 6587089ba3..0000000000 --- a/cmake/FindGflags.cmake +++ /dev/null @@ -1,582 +0,0 @@ -# Ceres Solver - A fast non-linear least squares minimizer -# Copyright 2015 Google Inc. All rights reserved. 
-# http://ceres-solver.org/ -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# * Neither the name of Google Inc. nor the names of its contributors may be -# used to endorse or promote products derived from this software without -# specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. -# -# Author: alexs.mac@gmail.com (Alex Stewart) -# - -# FindGflags.cmake - Find Google gflags logging library. -# -# This module will attempt to find gflags, either via an exported CMake -# configuration (generated by gflags >= 2.1 which are built with CMake), or -# by performing a standard search for all gflags components. The order of -# precedence for these two methods of finding gflags is controlled by: -# GFLAGS_PREFER_EXPORTED_GFLAGS_CMAKE_CONFIGURATION. -# -# This module defines the following variables: -# -# GFLAGS_FOUND: TRUE iff gflags is found. -# GFLAGS_INCLUDE_DIRS: Include directories for gflags. -# GFLAGS_LIBRARIES: Libraries required to link gflags. -# GFLAGS_NAMESPACE: The namespace in which gflags is defined. In versions of -# gflags < 2.1, this was google, for versions >= 2.1 it is -# by default gflags, although can be configured when building -# gflags to be something else (i.e. google for legacy -# compatibility). -# -# The following variables control the behaviour of this module when an exported -# gflags CMake configuration is not found. -# -# GFLAGS_PREFER_EXPORTED_GFLAGS_CMAKE_CONFIGURATION: TRUE/FALSE, iff TRUE then -# then prefer using an exported CMake configuration -# generated by gflags >= 2.1 over searching for the -# gflags components manually. Otherwise (FALSE) -# ignore any exported gflags CMake configurations and -# always perform a manual search for the components. -# Default: TRUE iff user does not define this variable -# before we are called, and does NOT specify either -# GFLAGS_INCLUDE_DIR_HINTS or GFLAGS_LIBRARY_DIR_HINTS -# otherwise FALSE. -# GFLAGS_INCLUDE_DIR_HINTS: List of additional directories in which to -# search for gflags includes, e.g: /timbuktu/include. -# GFLAGS_LIBRARY_DIR_HINTS: List of additional directories in which to -# search for gflags libraries, e.g: /timbuktu/lib. 
-# -# The following variables are also defined by this module, but in line with -# CMake recommended FindPackage() module style should NOT be referenced directly -# by callers (use the plural variables detailed above instead). These variables -# do however affect the behaviour of the module via FIND_[PATH/LIBRARY]() which -# are NOT re-called (i.e. search for library is not repeated) if these variables -# are set with valid values _in the CMake cache_. This means that if these -# variables are set directly in the cache, either by the user in the CMake GUI, -# or by the user passing -DVAR=VALUE directives to CMake when called (which -# explicitly defines a cache variable), then they will be used verbatim, -# bypassing the HINTS variables and other hard-coded search locations. -# -# GFLAGS_INCLUDE_DIR: Include directory for gflags, not including the -# include directory of any dependencies. -# GFLAGS_LIBRARY: gflags library, not including the libraries of any -# dependencies. - -# Reset CALLERS_CMAKE_FIND_LIBRARY_PREFIXES to its value when FindGflags was -# invoked, necessary for MSVC. -macro(GFLAGS_RESET_FIND_LIBRARY_PREFIX) - if (MSVC) - set(CMAKE_FIND_LIBRARY_PREFIXES "${CALLERS_CMAKE_FIND_LIBRARY_PREFIXES}") - endif (MSVC) -endmacro(GFLAGS_RESET_FIND_LIBRARY_PREFIX) - -# Called if we failed to find gflags or any of it's required dependencies, -# unsets all public (designed to be used externally) variables and reports -# error message at priority depending upon [REQUIRED/QUIET/] argument. -macro(GFLAGS_REPORT_NOT_FOUND REASON_MSG) - unset(GFLAGS_FOUND) - unset(GFLAGS_INCLUDE_DIRS) - unset(GFLAGS_LIBRARIES) - # Do not use unset, as we want to keep GFLAGS_NAMESPACE in the cache, - # but simply clear its value. - set(GFLAGS_NAMESPACE "" CACHE STRING - "gflags namespace (google or gflags)" FORCE) - - # Make results of search visible in the CMake GUI if gflags has not - # been found so that user does not have to toggle to advanced view. - mark_as_advanced(CLEAR GFLAGS_INCLUDE_DIR - GFLAGS_LIBRARY - GFLAGS_NAMESPACE) - - gflags_reset_find_library_prefix() - - # Note _FIND_[REQUIRED/QUIETLY] variables defined by FindPackage() - # use the camelcase library name, not uppercase. - if (Gflags_FIND_QUIETLY) - message(STATUS "Failed to find gflags - " ${REASON_MSG} ${ARGN}) - elseif (Gflags_FIND_REQUIRED) - message(FATAL_ERROR "Failed to find gflags - " ${REASON_MSG} ${ARGN}) - else() - # Neither QUIETLY nor REQUIRED, use no priority which emits a message - # but continues configuration and allows generation. - message("-- Failed to find gflags - " ${REASON_MSG} ${ARGN}) - endif () - return() -endmacro(GFLAGS_REPORT_NOT_FOUND) - -# Verify that all variable names passed as arguments are defined (can be empty -# but must be defined) or raise a fatal error. -macro(GFLAGS_CHECK_VARS_DEFINED) - foreach(CHECK_VAR ${ARGN}) - if (NOT DEFINED ${CHECK_VAR}) - message(FATAL_ERROR "Ceres Bug: ${CHECK_VAR} is not defined.") - endif() - endforeach() -endmacro(GFLAGS_CHECK_VARS_DEFINED) - -# Use check_cxx_source_compiles() to compile trivial test programs to determine -# the gflags namespace. This works on all OSs except Windows. If using Visual -# Studio, it fails because msbuild forces check_cxx_source_compiles() to use -# CMAKE_BUILD_TYPE=Debug for the test project, which usually breaks detection -# because MSVC requires that the test project use the same build type as gflags, -# which would normally be built in Release. 
-# -# Defines: GFLAGS_NAMESPACE in the caller's scope with the detected namespace, -# which is blank (empty string, will test FALSE is CMake conditionals) -# if detection failed. -function(GFLAGS_CHECK_GFLAGS_NAMESPACE_USING_TRY_COMPILE) - # Verify that all required variables are defined. - gflags_check_vars_defined( - GFLAGS_INCLUDE_DIR GFLAGS_LIBRARY) - # Ensure that GFLAGS_NAMESPACE is always unset on completion unless - # we explicitly set if after having the correct namespace. - set(GFLAGS_NAMESPACE "" PARENT_SCOPE) - - include(CheckCXXSourceCompiles) - # Setup include path & link library for gflags for CHECK_CXX_SOURCE_COMPILES. - set(CMAKE_REQUIRED_INCLUDES ${GFLAGS_INCLUDE_DIR}) - set(CMAKE_REQUIRED_LIBRARIES ${GFLAGS_LIBRARY} ${GFLAGS_LINK_LIBRARIES}) - # First try the (older) google namespace. Note that the output variable - # MUST be unique to the build type as otherwise the test is not repeated as - # it is assumed to have already been performed. - check_cxx_source_compiles( - "#include - int main(int argc, char * argv[]) { - google::ParseCommandLineFlags(&argc, &argv, true); - return 0; - }" - GFLAGS_IN_GOOGLE_NAMESPACE) - if (GFLAGS_IN_GOOGLE_NAMESPACE) - set(GFLAGS_NAMESPACE google PARENT_SCOPE) - return() - endif() - - # Try (newer) gflags namespace instead. Note that the output variable - # MUST be unique to the build type as otherwise the test is not repeated as - # it is assumed to have already been performed. - set(CMAKE_REQUIRED_INCLUDES ${GFLAGS_INCLUDE_DIR}) - set(CMAKE_REQUIRED_LIBRARIES ${GFLAGS_LIBRARY} ${GFLAGS_LINK_LIBRARIES}) - check_cxx_source_compiles( - "#include - int main(int argc, char * argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, true); - return 0; - }" - GFLAGS_IN_GFLAGS_NAMESPACE) - if (GFLAGS_IN_GFLAGS_NAMESPACE) - set(GFLAGS_NAMESPACE gflags PARENT_SCOPE) - return() - endif (GFLAGS_IN_GFLAGS_NAMESPACE) -endfunction(GFLAGS_CHECK_GFLAGS_NAMESPACE_USING_TRY_COMPILE) - -# Use regex on the gflags headers to attempt to determine the gflags namespace. -# Checks both gflags.h (contained namespace on versions < 2.1.2) and -# gflags_declare.h, which contains the namespace on versions >= 2.1.2. -# In general, this method should only be used when -# GFLAGS_CHECK_GFLAGS_NAMESPACE_USING_TRY_COMPILE() cannot be used, or has -# failed. -# -# Defines: GFLAGS_NAMESPACE in the caller's scope with the detected namespace, -# which is blank (empty string, will test FALSE is CMake conditionals) -# if detection failed. -function(GFLAGS_CHECK_GFLAGS_NAMESPACE_USING_REGEX) - # Verify that all required variables are defined. - gflags_check_vars_defined(GFLAGS_INCLUDE_DIR) - # Ensure that GFLAGS_NAMESPACE is always undefined on completion unless - # we explicitly set if after having the correct namespace. - set(GFLAGS_NAMESPACE "" PARENT_SCOPE) - - # Scan gflags.h to identify what namespace gflags was built with. On - # versions of gflags < 2.1.2, gflags.h was configured with the namespace - # directly, on >= 2.1.2, gflags.h uses the GFLAGS_NAMESPACE #define which - # is defined in gflags_declare.h, we try each location in turn. 
- set(GFLAGS_HEADER_FILE ${GFLAGS_INCLUDE_DIR}/gflags/gflags.h) - if (NOT EXISTS ${GFLAGS_HEADER_FILE}) - gflags_report_not_found( - "Could not find file: ${GFLAGS_HEADER_FILE} " - "containing namespace information in gflags install located at: " - "${GFLAGS_INCLUDE_DIR}.") - endif() - file(READ ${GFLAGS_HEADER_FILE} GFLAGS_HEADER_FILE_CONTENTS) - - string(REGEX MATCH "namespace [A-Za-z]+" - GFLAGS_NAMESPACE "${GFLAGS_HEADER_FILE_CONTENTS}") - string(REGEX REPLACE "namespace ([A-Za-z]+)" "\\1" - GFLAGS_NAMESPACE "${GFLAGS_NAMESPACE}") - - if (NOT GFLAGS_NAMESPACE) - gflags_report_not_found( - "Failed to extract gflags namespace from header file: " - "${GFLAGS_HEADER_FILE}.") - endif (NOT GFLAGS_NAMESPACE) - - if (GFLAGS_NAMESPACE STREQUAL "google" OR - GFLAGS_NAMESPACE STREQUAL "gflags") - # Found valid gflags namespace from gflags.h. - set(GFLAGS_NAMESPACE "${GFLAGS_NAMESPACE}" PARENT_SCOPE) - return() - endif() - - # Failed to find gflags namespace from gflags.h, gflags is likely a new - # version, check gflags_declare.h, which in newer versions (>= 2.1.2) contains - # the GFLAGS_NAMESPACE #define, which is then referenced in gflags.h. - set(GFLAGS_DECLARE_FILE ${GFLAGS_INCLUDE_DIR}/gflags/gflags_declare.h) - if (NOT EXISTS ${GFLAGS_DECLARE_FILE}) - gflags_report_not_found( - "Could not find file: ${GFLAGS_DECLARE_FILE} " - "containing namespace information in gflags install located at: " - "${GFLAGS_INCLUDE_DIR}.") - endif() - file(READ ${GFLAGS_DECLARE_FILE} GFLAGS_DECLARE_FILE_CONTENTS) - - string(REGEX MATCH "#define GFLAGS_NAMESPACE [A-Za-z]+" - GFLAGS_NAMESPACE "${GFLAGS_DECLARE_FILE_CONTENTS}") - string(REGEX REPLACE "#define GFLAGS_NAMESPACE ([A-Za-z]+)" "\\1" - GFLAGS_NAMESPACE "${GFLAGS_NAMESPACE}") - - if (NOT GFLAGS_NAMESPACE) - gflags_report_not_found( - "Failed to extract gflags namespace from declare file: " - "${GFLAGS_DECLARE_FILE}.") - endif (NOT GFLAGS_NAMESPACE) - - if (GFLAGS_NAMESPACE STREQUAL "google" OR - GFLAGS_NAMESPACE STREQUAL "gflags") - # Found valid gflags namespace from gflags.h. - set(GFLAGS_NAMESPACE "${GFLAGS_NAMESPACE}" PARENT_SCOPE) - return() - endif() -endfunction(GFLAGS_CHECK_GFLAGS_NAMESPACE_USING_REGEX) - -# ----------------------------------------------------------------- -# By default, if the user has expressed no preference for using an exported -# gflags CMake configuration over performing a search for the installed -# components, and has not specified any hints for the search locations, then -# prefer a gflags exported configuration if available. -if (NOT DEFINED GFLAGS_PREFER_EXPORTED_GFLAGS_CMAKE_CONFIGURATION - AND NOT GFLAGS_INCLUDE_DIR_HINTS - AND NOT GFLAGS_LIBRARY_DIR_HINTS) - message(STATUS "No preference for use of exported gflags CMake configuration " - "set, and no hints for include/library directories provided. " - "Defaulting to preferring an installed/exported gflags CMake configuration " - "if available.") - set(GFLAGS_PREFER_EXPORTED_GFLAGS_CMAKE_CONFIGURATION TRUE) -endif() - -if (GFLAGS_PREFER_EXPORTED_GFLAGS_CMAKE_CONFIGURATION) - # Try to find an exported CMake configuration for gflags, as generated by - # gflags versions >= 2.1. - # - # We search twice, s/t we can invert the ordering of precedence used by - # find_package() for exported package build directories, and installed - # packages (found via CMAKE_SYSTEM_PREFIX_PATH), listed as items 6) and 7) - # respectively in [1]. - # - # By default, exported build directories are (in theory) detected first, and - # this is usually the case on Windows. 
However, on OS X & Linux, the install - # path (/usr/local) is typically present in the PATH environment variable - # which is checked in item 4) in [1] (i.e. before both of the above, unless - # NO_SYSTEM_ENVIRONMENT_PATH is passed). As such on those OSs installed - # packages are usually detected in preference to exported package build - # directories. - # - # To ensure a more consistent response across all OSs, and as users usually - # want to prefer an installed version of a package over a locally built one - # where both exist (esp. as the exported build directory might be removed - # after installation), we first search with NO_CMAKE_PACKAGE_REGISTRY which - # means any build directories exported by the user are ignored, and thus - # installed directories are preferred. If this fails to find the package - # we then research again, but without NO_CMAKE_PACKAGE_REGISTRY, so any - # exported build directories will now be detected. - # - # To prevent confusion on Windows, we also pass NO_CMAKE_BUILDS_PATH (which - # is item 5) in [1]), to not preferentially use projects that were built - # recently with the CMake GUI to ensure that we always prefer an installed - # version if available. - # - # [1] http://www.cmake.org/cmake/help/v2.8.11/cmake.html#command:find_package - find_package(gflags QUIET - NO_MODULE - NO_CMAKE_PACKAGE_REGISTRY - NO_CMAKE_BUILDS_PATH) - if (gflags_FOUND) - message(STATUS "Found installed version of gflags: ${gflags_DIR}") - else(gflags_FOUND) - # Failed to find an installed version of gflags, repeat search allowing - # exported build directories. - message(STATUS "Failed to find installed gflags CMake configuration, " - "searching for gflags build directories exported with CMake.") - # Again pass NO_CMAKE_BUILDS_PATH, as we know that gflags is exported and - # do not want to treat projects built with the CMake GUI preferentially. - find_package(gflags QUIET - NO_MODULE - NO_CMAKE_BUILDS_PATH) - if (gflags_FOUND) - message(STATUS "Found exported gflags build directory: ${gflags_DIR}") - endif(gflags_FOUND) - endif(gflags_FOUND) - - set(FOUND_INSTALLED_GFLAGS_CMAKE_CONFIGURATION ${gflags_FOUND}) - - # gflags v2.1 - 2.1.2 shipped with a bug in their gflags-config.cmake [1] - # whereby gflags_LIBRARIES = "gflags", but there was no imported target - # called "gflags", they were called: gflags[_nothreads]-[static/shared]. - # As this causes linker errors when gflags is not installed in a location - # on the current library paths, detect if this problem is present and - # fix it. - # - # [1] https://github.com/gflags/gflags/issues/110 - if (gflags_FOUND) - # NOTE: This is not written as additional conditions in the outer - # if (gflags_FOUND) as the NOT TARGET "${gflags_LIBRARIES}" - # condition causes problems if gflags is not found. - if (${gflags_VERSION} VERSION_LESS 2.1.3 AND - NOT TARGET "${gflags_LIBRARIES}") - message(STATUS "Detected broken gflags install in: ${gflags_DIR}, " - "version: ${gflags_VERSION} <= 2.1.2 which defines gflags_LIBRARIES = " - "${gflags_LIBRARIES} which is not an imported CMake target, see: " - "https://github.com/gflags/gflags/issues/110. Attempting to fix by " - "detecting correct gflags target.") - # Ordering here expresses preference for detection, specifically we do not - # want to use the _nothreads variants if the full library is available. 
- list(APPEND CHECK_GFLAGS_IMPORTED_TARGET_NAMES - gflags-shared gflags-static - gflags_nothreads-shared gflags_nothreads-static) - foreach(CHECK_GFLAGS_TARGET ${CHECK_GFLAGS_IMPORTED_TARGET_NAMES}) - if (TARGET ${CHECK_GFLAGS_TARGET}) - message(STATUS "Found valid gflags target: ${CHECK_GFLAGS_TARGET}, " - "updating gflags_LIBRARIES.") - set(gflags_LIBRARIES ${CHECK_GFLAGS_TARGET}) - break() - endif() - endforeach() - if (NOT TARGET ${gflags_LIBRARIES}) - message(STATUS "Failed to fix detected broken gflags install in: " - "${gflags_DIR}, version: ${gflags_VERSION} <= 2.1.2, none of the " - "imported targets for gflags: ${CHECK_GFLAGS_IMPORTED_TARGET_NAMES} " - "are defined. Will continue with a manual search for gflags " - "components. We recommend you build/install a version of gflags > " - "2.1.2 (or master).") - set(FOUND_INSTALLED_GFLAGS_CMAKE_CONFIGURATION FALSE) - endif() - endif() - endif() - - if (FOUND_INSTALLED_GFLAGS_CMAKE_CONFIGURATION) - message(STATUS "Detected gflags version: ${gflags_VERSION}") - set(GFLAGS_FOUND ${gflags_FOUND}) - set(GFLAGS_INCLUDE_DIR ${gflags_INCLUDE_DIR}) - set(GFLAGS_LIBRARY ${gflags_LIBRARIES}) - - # gflags does not export the namespace in their CMake configuration, so - # use our function to determine what it should be, as it can be either - # gflags or google dependent upon version & configuration. - # - # NOTE: We use the regex method to determine the namespace here, as - # check_cxx_source_compiles() will not use imported targets, which - # is what gflags will be in this case. - gflags_check_gflags_namespace_using_regex() - - if (NOT GFLAGS_NAMESPACE) - gflags_report_not_found( - "Failed to determine gflags namespace using regex for gflags " - "version: ${gflags_VERSION} exported here: ${gflags_DIR} using CMake.") - endif (NOT GFLAGS_NAMESPACE) - else (FOUND_INSTALLED_GFLAGS_CMAKE_CONFIGURATION) - message(STATUS "Failed to find an installed/exported CMake configuration " - "for gflags, will perform search for installed gflags components.") - endif (FOUND_INSTALLED_GFLAGS_CMAKE_CONFIGURATION) -endif(GFLAGS_PREFER_EXPORTED_GFLAGS_CMAKE_CONFIGURATION) - -if (NOT GFLAGS_FOUND) - # Either failed to find an exported gflags CMake configuration, or user - # told us not to use one. Perform a manual search for all gflags components. - - # Handle possible presence of lib prefix for libraries on MSVC, see - # also GFLAGS_RESET_FIND_LIBRARY_PREFIX(). - if (MSVC) - # Preserve the caller's original values for CMAKE_FIND_LIBRARY_PREFIXES - # s/t we can set it back before returning. - set(CALLERS_CMAKE_FIND_LIBRARY_PREFIXES "${CMAKE_FIND_LIBRARY_PREFIXES}") - # The empty string in this list is important, it represents the case when - # the libraries have no prefix (shared libraries / DLLs). - set(CMAKE_FIND_LIBRARY_PREFIXES "lib" "" "${CMAKE_FIND_LIBRARY_PREFIXES}") - endif (MSVC) - - # Search user-installed locations first, so that we prefer user installs - # to system installs where both exist. - list(APPEND GFLAGS_CHECK_INCLUDE_DIRS - /usr/local/include - /usr/local/homebrew/include # Mac OS X - /opt/local/var/macports/software # Mac OS X. - /opt/local/include - /usr/include) - list(APPEND GFLAGS_CHECK_PATH_SUFFIXES - gflags/include # Windows (for C:/Program Files prefix). - gflags/Include ) # Windows (for C:/Program Files prefix). - - list(APPEND GFLAGS_CHECK_LIBRARY_DIRS - /usr/local/lib - /usr/local/homebrew/lib # Mac OS X. - /opt/local/lib - /usr/lib) - list(APPEND GFLAGS_CHECK_LIBRARY_SUFFIXES - gflags/lib # Windows (for C:/Program Files prefix). 
- gflags/Lib ) # Windows (for C:/Program Files prefix). - - # Search supplied hint directories first if supplied. - find_path(GFLAGS_INCLUDE_DIR - NAMES gflags/gflags.h - PATHS ${GFLAGS_INCLUDE_DIR_HINTS} - ${GFLAGS_CHECK_INCLUDE_DIRS} - PATH_SUFFIXES ${GFLAGS_CHECK_PATH_SUFFIXES}) - if (NOT GFLAGS_INCLUDE_DIR OR - NOT EXISTS ${GFLAGS_INCLUDE_DIR}) - gflags_report_not_found( - "Could not find gflags include directory, set GFLAGS_INCLUDE_DIR " - "to directory containing gflags/gflags.h") - endif (NOT GFLAGS_INCLUDE_DIR OR - NOT EXISTS ${GFLAGS_INCLUDE_DIR}) - - find_library(GFLAGS_LIBRARY NAMES gflags - PATHS ${GFLAGS_LIBRARY_DIR_HINTS} - ${GFLAGS_CHECK_LIBRARY_DIRS} - PATH_SUFFIXES ${GFLAGS_CHECK_LIBRARY_SUFFIXES}) - if (NOT GFLAGS_LIBRARY OR - NOT EXISTS ${GFLAGS_LIBRARY}) - gflags_report_not_found( - "Could not find gflags library, set GFLAGS_LIBRARY " - "to full path to libgflags.") - endif (NOT GFLAGS_LIBRARY OR - NOT EXISTS ${GFLAGS_LIBRARY}) - - # gflags typically requires a threading library (which is OS dependent), note - # that this defines the CMAKE_THREAD_LIBS_INIT variable. If we are able to - # detect threads, we assume that gflags requires it. - find_package(Threads QUIET) - set(GFLAGS_LINK_LIBRARIES ${CMAKE_THREAD_LIBS_INIT}) - # On Windows (including MinGW), the Shlwapi library is used by gflags if - # available. - if (WIN32) - include(CheckIncludeFileCXX) - check_include_file_cxx("shlwapi.h" HAVE_SHLWAPI) - if (HAVE_SHLWAPI) - list(APPEND GFLAGS_LINK_LIBRARIES shlwapi.lib) - endif(HAVE_SHLWAPI) - endif (WIN32) - - # Mark internally as found, then verify. GFLAGS_REPORT_NOT_FOUND() unsets - # if called. - set(GFLAGS_FOUND TRUE) - - # Identify what namespace gflags was built with. - if (GFLAGS_INCLUDE_DIR AND NOT GFLAGS_NAMESPACE) - # To handle Windows peculiarities / CMake bugs on MSVC we try two approaches - # to detect the gflags namespace: - # - # 1) Try to use check_cxx_source_compiles() to compile a trivial program - # with the two choices for the gflags namespace. - # - # 2) [In the event 1) fails] Use regex on the gflags headers to try to - # determine the gflags namespace. Whilst this is less robust than 1), - # it does avoid any interaction with msbuild. - gflags_check_gflags_namespace_using_try_compile() - - if (NOT GFLAGS_NAMESPACE) - # Failed to determine gflags namespace using check_cxx_source_compiles() - # method, try and obtain it using regex on the gflags headers instead. - message(STATUS "Failed to find gflags namespace using using " - "check_cxx_source_compiles(), trying namespace regex instead, " - "this is expected on Windows.") - gflags_check_gflags_namespace_using_regex() - - if (NOT GFLAGS_NAMESPACE) - gflags_report_not_found( - "Failed to determine gflags namespace either by " - "check_cxx_source_compiles(), or namespace regex.") - endif (NOT GFLAGS_NAMESPACE) - endif (NOT GFLAGS_NAMESPACE) - endif (GFLAGS_INCLUDE_DIR AND NOT GFLAGS_NAMESPACE) - - # Make the GFLAGS_NAMESPACE a cache variable s/t the user can view it, and could - # overwrite it in the CMake GUI. - set(GFLAGS_NAMESPACE "${GFLAGS_NAMESPACE}" CACHE STRING - "gflags namespace (google or gflags)" FORCE) - - # gflags does not seem to provide any record of the version in its - # source tree, thus cannot extract version. - - # Catch case when caller has set GFLAGS_NAMESPACE in the cache / GUI - # with an invalid value. 
- if (GFLAGS_NAMESPACE AND - NOT GFLAGS_NAMESPACE STREQUAL "google" AND - NOT GFLAGS_NAMESPACE STREQUAL "gflags") - gflags_report_not_found( - "Caller defined GFLAGS_NAMESPACE:" - " ${GFLAGS_NAMESPACE} is not valid, not google or gflags.") - endif () - # Catch case when caller has set GFLAGS_INCLUDE_DIR in the cache / GUI and - # thus FIND_[PATH/LIBRARY] are not called, but specified locations are - # invalid, otherwise we would report the library as found. - if (GFLAGS_INCLUDE_DIR AND - NOT EXISTS ${GFLAGS_INCLUDE_DIR}/gflags/gflags.h) - gflags_report_not_found( - "Caller defined GFLAGS_INCLUDE_DIR:" - " ${GFLAGS_INCLUDE_DIR} does not contain gflags/gflags.h header.") - endif (GFLAGS_INCLUDE_DIR AND - NOT EXISTS ${GFLAGS_INCLUDE_DIR}/gflags/gflags.h) - # TODO: This regex for gflags library is pretty primitive, we use lowercase - # for comparison to handle Windows using CamelCase library names, could - # this check be better? - string(TOLOWER "${GFLAGS_LIBRARY}" LOWERCASE_GFLAGS_LIBRARY) - if (GFLAGS_LIBRARY AND - NOT "${LOWERCASE_GFLAGS_LIBRARY}" MATCHES ".*gflags[^/]*") - gflags_report_not_found( - "Caller defined GFLAGS_LIBRARY: " - "${GFLAGS_LIBRARY} does not match gflags.") - endif (GFLAGS_LIBRARY AND - NOT "${LOWERCASE_GFLAGS_LIBRARY}" MATCHES ".*gflags[^/]*") - - gflags_reset_find_library_prefix() - -endif(NOT GFLAGS_FOUND) - -# Set standard CMake FindPackage variables if found. -if (GFLAGS_FOUND) - set(GFLAGS_INCLUDE_DIRS ${GFLAGS_INCLUDE_DIR}) - set(GFLAGS_LIBRARIES ${GFLAGS_LIBRARY} ${GFLAGS_LINK_LIBRARIES}) -endif (GFLAGS_FOUND) - -# Handle REQUIRED / QUIET optional arguments. -include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(Gflags DEFAULT_MSG - GFLAGS_INCLUDE_DIRS GFLAGS_LIBRARIES GFLAGS_NAMESPACE) - -# Only mark internal variables as advanced if we found gflags, otherwise -# leave them visible in the standard GUI for the user to set manually. -if (GFLAGS_FOUND) - mark_as_advanced(FORCE GFLAGS_INCLUDE_DIR - GFLAGS_LIBRARY - GFLAGS_NAMESPACE - gflags_DIR) # Autogenerated by find_package(gflags) -endif (GFLAGS_FOUND) diff --git a/cmake/FindGlog.cmake b/cmake/FindGlog.cmake deleted file mode 100644 index 142e2ca96b..0000000000 --- a/cmake/FindGlog.cmake +++ /dev/null @@ -1,24 +0,0 @@ -# -# Find libglog -# -# LIBGLOG_INCLUDE_DIR - where to find glog/logging.h, etc. -# LIBGLOG_LIBRARY - List of libraries when using libglog. -# LIBGLOG_FOUND - True if libglog found. -# -# from https://github.com/facebook/hhvm/blob/master/CMake/FindGlog.cmake - -IF (LIBGLOG_INCLUDE_DIR) - # Already in cache, be silent - SET(LIBGLOG_FIND_QUIETLY TRUE) -ENDIF () - -FIND_PATH(LIBGLOG_INCLUDE_DIR glog/logging.h) - -FIND_LIBRARY(LIBGLOG_LIBRARY glog) - -# handle the QUIETLY and REQUIRED arguments and set LIBGLOG_FOUND to TRUE if -# all listed variables are TRUE -INCLUDE(FindPackageHandleStandardArgs) -FIND_PACKAGE_HANDLE_STANDARD_ARGS(LIBGLOG DEFAULT_MSG LIBGLOG_LIBRARY LIBGLOG_INCLUDE_DIR) - -MARK_AS_ADVANCED(LIBGLOG_LIBRARY LIBGLOG_INCLUDE_DIR) \ No newline at end of file diff --git a/cmake/FindGperftools.cmake b/cmake/FindGperftools.cmake deleted file mode 100644 index 928f573a4f..0000000000 --- a/cmake/FindGperftools.cmake +++ /dev/null @@ -1,63 +0,0 @@ -# Tries to find Gperftools. 
-# -# Usage of this module as follows: -# -# find_package(Gperftools) -# -# Variables used by this module, they can change the default behaviour and need -# to be set before calling find_package: -# -# Gperftools_ROOT_DIR Set this variable to the root installation of -# Gperftools if the module has problems finding -# the proper installation path. -# -# Variables defined by this module: -# -# GPERFTOOLS_FOUND System has Gperftools libs/headers -# GPERFTOOLS_LIBRARIES The Gperftools libraries (tcmalloc & profiler) -# GPERFTOOLS_INCLUDE_DIR The location of Gperftools headers - -find_library(GPERFTOOLS_TCMALLOC - NAMES tcmalloc - HINTS ${Gperftools_ROOT_DIR}/lib) - -find_library(GPERFTOOLS_PROFILER - NAMES profiler - HINTS ${Gperftools_ROOT_DIR}/lib) - -find_library(GPERFTOOLS_TCMALLOC_AND_PROFILER - NAMES tcmalloc_and_profiler - HINTS ${Gperftools_ROOT_DIR}/lib) - -find_path(GPERFTOOLS_INCLUDE_DIR - NAMES gperftools/heap-profiler.h - HINTS ${Gperftools_ROOT_DIR}/include) - -set(GPERFTOOLS_LIBRARIES ${GPERFTOOLS_TCMALLOC_AND_PROFILER}) - -include(FindPackageHandleStandardArgs) -find_package_handle_standard_args( - Gperftools - DEFAULT_MSG - GPERFTOOLS_LIBRARIES - GPERFTOOLS_INCLUDE_DIR) - -mark_as_advanced( - Gperftools_ROOT_DIR - GPERFTOOLS_TCMALLOC - GPERFTOOLS_PROFILER - GPERFTOOLS_TCMALLOC_AND_PROFILER - GPERFTOOLS_LIBRARIES - GPERFTOOLS_INCLUDE_DIR) - -# create IMPORTED targets -if (Gperftools_FOUND AND NOT TARGET gperftools::tcmalloc) - add_library(gperftools::tcmalloc UNKNOWN IMPORTED) - set_target_properties(gperftools::tcmalloc PROPERTIES - IMPORTED_LOCATION ${GPERFTOOLS_TCMALLOC} - INTERFACE_INCLUDE_DIRECTORIES "${GPERFTOOLS_INCLUDE_DIR}") - add_library(gperftools::profiler UNKNOWN IMPORTED) - set_target_properties(gperftools::profiler PROPERTIES - IMPORTED_LOCATION ${GPERFTOOLS_PROFILER} - INTERFACE_INCLUDE_DIRECTORIES "${GPERFTOOLS_INCLUDE_DIR}") -endif() diff --git a/cmake/FindJeMalloc.cmake b/cmake/FindJeMalloc.cmake deleted file mode 100644 index b95287160b..0000000000 --- a/cmake/FindJeMalloc.cmake +++ /dev/null @@ -1,28 +0,0 @@ -# - Find JeMalloc library -# Find the native JeMalloc includes and library -# -# JEMALLOC_INCLUDE_DIR - where to find jemalloc.h, etc. -# JEMALLOC_LIBRARIES - List of libraries when using jemalloc. -# JEMALLOC_FOUND - True if jemalloc found. 
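These Find modules all follow the same shape: locate the header and library with find_path()/find_library(), let find_package_handle_standard_args() decide the *_FOUND result, and, where possible, wrap the result in an IMPORTED target. A minimal consumer sketch, assuming the modules live under cmake/ as in this tree (the my_app target is hypothetical):

    # Make the bundled Find modules visible, then resolve one of them.
    list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
    find_package(JeMalloc)  # dispatches to cmake/FindJeMalloc.cmake
    if(JEMALLOC_FOUND)
      # Prefer the imported target: it carries its include directories along.
      target_link_libraries(my_app PRIVATE jemalloc::jemalloc)
    endif()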
- -find_path(JEMALLOC_INCLUDE_DIR - NAMES jemalloc/jemalloc.h - HINTS ${JEMALLOC_ROOT_DIR}/include) - -find_library(JEMALLOC_LIBRARIES - NAMES jemalloc - HINTS ${JEMALLOC_ROOT_DIR}/lib) - -include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(jemalloc DEFAULT_MSG JEMALLOC_LIBRARIES JEMALLOC_INCLUDE_DIR) - -mark_as_advanced( - JEMALLOC_LIBRARIES - JEMALLOC_INCLUDE_DIR) - -if (JEMALLOC_FOUND) - add_library(jemalloc::jemalloc UNKNOWN IMPORTED) - set_target_properties(jemalloc::jemalloc PROPERTIES - IMPORTED_LOCATION ${JEMALLOC_LIBRARIES} - INTERFACE_INCLUDE_DIRECTORIES "${JEMALLOC_INCLUDE_DIR}") -endif() diff --git a/cmake/FindNumPy.cmake b/cmake/FindNumPy.cmake deleted file mode 100644 index 8cdd642ac0..0000000000 --- a/cmake/FindNumPy.cmake +++ /dev/null @@ -1,38 +0,0 @@ -# Find the Python NumPy package -# PYTHON_NUMPY_INCLUDE_DIR -# NUMPY_FOUND -# will be set by this script - -cmake_minimum_required(VERSION 2.6) - -if(NOT PYTHON_EXECUTABLE) - if(NumPy_FIND_QUIETLY) - find_package(PythonInterp QUIET) - else() - find_package(PythonInterp) - set(_numpy_out 1) - endif() -endif() - -if (PYTHON_EXECUTABLE) - # write a python script that finds the numpy path - file(WRITE ${PROJECT_BINARY_DIR}/FindNumpyPath.py - "try: import numpy; print(numpy.get_include())\nexcept:pass\n") - - # execute the find script - exec_program("${PYTHON_EXECUTABLE}" ${PROJECT_BINARY_DIR} - ARGS "FindNumpyPath.py" - OUTPUT_VARIABLE NUMPY_PATH) -elseif(_numpy_out) - message(STATUS "Python executable not found.") -endif(PYTHON_EXECUTABLE) - -find_path(PYTHON_NUMPY_INCLUDE_DIR numpy/arrayobject.h - HINTS "${NUMPY_PATH}" "${PYTHON_INCLUDE_PATH}") - -if(PYTHON_NUMPY_INCLUDE_DIR) - set(PYTHON_NUMPY_FOUND 1 CACHE INTERNAL "Python numpy found") -endif(PYTHON_NUMPY_INCLUDE_DIR) - -include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(NumPy DEFAULT_MSG PYTHON_NUMPY_INCLUDE_DIR) diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake deleted file mode 100644 index 52ac31d1d1..0000000000 --- a/cmake/cblas.cmake +++ /dev/null @@ -1,94 +0,0 @@ -# Find the CBlas and lapack libraries -# -# It will search MKLML, atlas, OpenBlas, reference-cblas in order. -# -# If any cblas implementation found, the following variable will be set. -# CBLAS_PROVIDER # one of MKLML, OPENBLAS, REFERENCE -# CBLAS_INC_DIR # the include directory for cblas. -# CBLAS_LIBS # a list of libraries should be linked by paddle. -# # Each library should be full path to object file. - -set(CBLAS_FOUND OFF) - -## Find MKLML First. -if(WITH_MKLML AND MKLML_INC_DIR AND MKLML_LIB) - set(CBLAS_FOUND ON) - set(CBLAS_PROVIDER MKLML) - set(CBLAS_INC_DIR ${MKLML_INC_DIR}) - set(CBLAS_LIBRARIES ${MKLML_LIB}) - - add_definitions(-DPADDLE_WITH_MKLML) - add_definitions(-DLAPACK_FOUND) - - message(STATUS "Found cblas and lapack in MKLML " - "(include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") - return() -endif() - -## Then find openblas. 
-set(OPENBLAS_ROOT $ENV{OPENBLAS_ROOT} CACHE PATH "Folder contains Openblas") -set(OPENBLAS_INCLUDE_SEARCH_PATHS - ${OPENBLAS_ROOT}/include - /usr/include - /usr/include/openblas - /usr/local/opt/openblas/include) -set(OPENBLAS_LIB_SEARCH_PATHS - ${OPENBLAS_ROOT}/lib - /usr/lib - /usr/lib/blas/openblas - /usr/lib/openblas - /usr/local/opt/openblas/lib) - -find_path(OPENBLAS_INC_DIR NAMES cblas.h - PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS} NO_DEFAULT_PATH) -find_path(OPENBLAS_LAPACKE_INC_DIR NAMES lapacke.h - PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS}) -find_library(OPENBLAS_LIB NAMES openblas - PATHS ${OPENBLAS_LIB_SEARCH_PATHS}) - -if(OPENBLAS_LAPACKE_INC_DIR AND OPENBLAS_INC_DIR AND OPENBLAS_LIB) - set(CBLAS_FOUND ON) - set(CBLAS_PROVIDER OPENBLAS) - set(CBLAS_INC_DIR ${OPENBLAS_INC_DIR} ${OPENBLAS_LAPACKE_INC_DIR}) - set(CBLAS_LIBRARIES ${OPENBLAS_LIB}) - - add_definitions(-DPADDLE_USE_OPENBLAS) - add_definitions(-DLAPACK_FOUND) - - message(STATUS "Found OpenBLAS (include: ${OPENBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") - message(STATUS "Found lapack in OpenBLAS (include: ${OPENBLAS_LAPACKE_INC_DIR})") - return() -endif() - - -## Then find the reference-cblas. www.netlib.org/blas/ -set(REFERENCE_CBLAS_ROOT $ENV{REFERENCE_CBLAS_ROOT} CACHE PATH - "Folder contains reference-cblas") -set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS - ${REFERENCE_CBLAS_ROOT}/include - /usr/include - /usr/include/cblas -) - -set(REFERENCE_CBLAS_LIB_SEARCH_PATHS - ${REFERENCE_CBLAS_ROOT}/lib - /usr/lib - /usr/lib/blas/reference/ - /usr/lib/reference/ -) - -if(WITH_SYSTEM_BLAS) - find_path(REFERENCE_CBLAS_INCLUDE_DIR NAMES cblas.h PATHS - ${REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS}) - find_library(REFERENCE_CBLAS_LIBRARY NAMES cblas PATHS - ${REFERENCE_CBLAS_LIB_SEARCH_PATHS}) - - if(REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY) - set(CBLAS_FOUND ON) - set(CBLAS_PROVIDER REFERENCE) - set(CBLAS_INC_DIR ${REFERENCE_CBLAS_INCLUDE_DIR}) - set(CBLAS_LIBRARIES ${REFERENCE_CBLAS_LIBRARY}) - add_definitions(-DPADDLE_USE_REFERENCE_CBLAS) - message(STATUS "Found reference-cblas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") - endif() -endif() diff --git a/cmake/ccache.cmake b/cmake/ccache.cmake deleted file mode 100644 index 900f59d4cb..0000000000 --- a/cmake/ccache.cmake +++ /dev/null @@ -1,9 +0,0 @@ -# Use ccache if found ccache program - -find_program(CCACHE_PATH ccache) - -if(CCACHE_PATH) - message(STATUS "Ccache is founded, use ccache to speed up compile.") - set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ${CCACHE_PATH}) - set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ${CCACHE_PATH}) -endif(CCACHE_PATH) diff --git a/cmake/configure.cmake b/cmake/configure.cmake deleted file mode 100644 index 67830fe2e0..0000000000 --- a/cmake/configure.cmake +++ /dev/null @@ -1,160 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
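One note on the ccache hook above: setting the RULE_LAUNCH_COMPILE and RULE_LAUNCH_LINK global properties is the pre-3.4 way of wiring in a compiler launcher. On CMake 3.4 and newer the same effect is usually achieved per language; a sketch of that variant, not part of this tree:

    find_program(CCACHE_PATH ccache)
    if(CCACHE_PATH)
      # Launcher variables only wrap compile rules; routing the link step
      # through ccache gains nothing, so the modern form simply omits it.
      set(CMAKE_C_COMPILER_LAUNCHER   ${CCACHE_PATH})
      set(CMAKE_CXX_COMPILER_LAUNCHER ${CCACHE_PATH})
    endif()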
- -if(NOT WITH_PYTHON) - add_definitions(-DPADDLE_NO_PYTHON) -endif(NOT WITH_PYTHON) - -if(WITH_DSO) - add_definitions(-DPADDLE_USE_DSO) -endif(WITH_DSO) - -if(WITH_TESTING) - add_definitions(-DPADDLE_WITH_TESTING) -endif(WITH_TESTING) - -if(NOT WITH_PROFILER) - add_definitions(-DPADDLE_DISABLE_PROFILER) -endif(NOT WITH_PROFILER) - -if(WITH_AVX AND AVX_FOUND) - set(SIMD_FLAG ${AVX_FLAG}) -elseif(SSE3_FOUND) - set(SIMD_FLAG ${SSE3_FLAG}) -endif() - -if(LITE_WITH_CUDA) - add_definitions(-DLITE_WITH_CUDA) - add_definitions(-DEIGEN_USE_GPU) - - FIND_PACKAGE(CUDA REQUIRED) - - if(${CUDA_VERSION_MAJOR} VERSION_LESS 7) - message(FATAL_ERROR "Paddle needs CUDA >= 7.0 to compile") - endif() - - if(NOT CUDNN_FOUND) - message(FATAL_ERROR "Paddle needs cudnn to compile") - endif() - if(CUPTI_FOUND) - include_directories(${CUPTI_INCLUDE_DIR}) - add_definitions(-DPADDLE_WITH_CUPTI) - else() - message(STATUS "Cannot find CUPTI, GPU Profiling is incorrect.") - endif() - set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SIMD_FLAG}") - - # Include cuda and cudnn - include_directories(${CUDNN_INCLUDE_DIR}) - include_directories(${CUDA_TOOLKIT_INCLUDE}) - -elseif(WITH_AMD_GPU) - add_definitions(-DPADDLE_WITH_HIP) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_HCC__") -else() - add_definitions(-DHPPL_STUB_FUNC) - list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu) -endif() - -if (WITH_MKLML AND MKLML_IOMP_LIB) - message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}") - if(WIN32) - # openmp not support well for now on windows - set(OPENMP_FLAGS "") - else(WIN32) - set(OPENMP_FLAGS "-fopenmp") - endif(WIN32) - set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) - set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}") -endif() - -set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}") -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}") - -if(WITH_DISTRIBUTE) - add_definitions(-DPADDLE_WITH_DISTRIBUTE) -endif() - -if(WITH_GRPC) - add_definitions(-DPADDLE_WITH_GRPC) -endif(WITH_GRPC) - -if(WITH_BRPC_RDMA) - add_definitions(-DPADDLE_WITH_BRPC_RDMA) -endif(WITH_BRPC_RDMA) - -if(ON_INFER) - add_definitions(-DPADDLE_ON_INFERENCE) -endif(ON_INFER) - -if(WITH_WBAES) - add_definitions(-DPADDLE_WITH_WBAES) -endif(WITH_WBAES) - -if (REPLACE_ENFORCE_GLOG) - add_definitions("-DREPLACE_ENFORCE_GLOG") -endif() - -# for lite -# TODO(Superjomn) not work fine with the option -if (LITE_WITH_X86) - add_definitions("-DLITE_WITH_X86") -endif() - -if (LITE_WITH_ARM) - add_definitions("-DLITE_WITH_ARM") -endif() - -if (WITH_ARM_DOTPROD) - add_definitions("-DWITH_ARM_DOTPROD") -endif() - -if (LITE_WITH_NPU) - add_definitions("-DLITE_WITH_NPU") -endif() - -if (LITE_WITH_OPENCL) - add_definitions("-DLITE_WITH_OPENCL") -endif() - -if (LITE_WITH_FPGA) -add_definitions("-DLITE_WITH_FPGA") -endif() - -if (LITE_WITH_PROFILE) - add_definitions("-DLITE_WITH_PROFILE") - if (LITE_WITH_PRECISION_PROFILE) - add_definitions("-DLITE_WITH_PRECISION_PROFILE") - endif() -endif() - -if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) - add_definitions("-DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK") -endif() - -if (LITE_SHUTDOWN_LOG) - add_definitions("-DLITE_SHUTDOWN_LOG") -endif() - -if (LITE_ON_TINY_PUBLISH) - add_definitions("-DLITE_ON_TINY_PUBLISH") -endif() - -if (LITE_ON_MODEL_OPTIMIZE_TOOL) - add_definitions("-DLITE_ON_MODEL_OPTIMIZE_TOOL") 
-endif(LITE_ON_MODEL_OPTIMIZE_TOOL)
-
diff --git a/cmake/coveralls.cmake b/cmake/coveralls.cmake
deleted file mode 100644
index ca1471cabb..0000000000
--- a/cmake/coveralls.cmake
+++ /dev/null
@@ -1,103 +0,0 @@
-# CMake script for code coverage.
-# If _COVERALLS_UPLOAD is ON, it will upload the JSON files to coveralls.io automatically.
-
-# Param _COVERAGE_SRCS A list of coverage source files.
-# Param _COVERALLS_UPLOAD Upload the result to coveralls.
-# Param _CMAKE_SCRIPT_PATH CMake script path.
-function(code_coverage _COVERAGE_SRCS _COVERALLS_UPLOAD _CMAKE_SCRIPT_PATH)
-    # clean previous gcov data.
-    file(REMOVE_RECURSE ${PROJECT_BINARY_DIR}/*.gcda)
-
-    # find curl, which is needed later to upload the JSON.
-    if (_COVERALLS_UPLOAD)
-        find_program(CURL_EXECUTABLE curl)
-        if (NOT CURL_EXECUTABLE)
-            message(FATAL_ERROR "Coveralls: curl not found!")
-        endif()
-    endif()
-
-    # When passing a CMake list to an external process, the list
-    # will be converted from the format "1;2;3" to "1 2 3".
-    set(COVERAGE_SRCS "")
-    foreach (SINGLE_SRC ${_COVERAGE_SRCS})
-        set(COVERAGE_SRCS "${COVERAGE_SRCS}*${SINGLE_SRC}")
-    endforeach()
-
-    # query the number of logical cores
-    cmake_host_system_information(RESULT core_size QUERY NUMBER_OF_LOGICAL_CORES)
-    # coveralls json file.
-    set(COVERALLS_FILE ${PROJECT_BINARY_DIR}/coveralls.json)
-    add_custom_target(coveralls_generate
-        # Run the regression tests.
-        COMMAND ${CMAKE_CTEST_COMMAND}
-        -j ${core_size}
-        --output-on-failure
-        # Generate gcov output and translate it into coveralls JSON.
-        COMMAND ${CMAKE_COMMAND}
-        -DCOVERAGE_SRCS="${COVERAGE_SRCS}"
-        -DCOVERALLS_OUTPUT_FILE="${COVERALLS_FILE}"
-        -DCOV_PATH="${PROJECT_BINARY_DIR}"
-        -DPROJECT_ROOT="${PROJECT_SOURCE_DIR}"
-        -P "${_CMAKE_SCRIPT_PATH}/coverallsGcovJsons.cmake"
-        WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
-        COMMENT "Coveralls: generating coveralls output..."
-        )
-
-    if (_COVERALLS_UPLOAD)
-        message("COVERALLS UPLOAD: ON")
-        # Upload the JSON to coveralls.
-        add_custom_target(coveralls_upload
-            COMMAND ${CURL_EXECUTABLE}
-            -S -F json_file=@${COVERALLS_FILE}
-            https://coveralls.io/api/v1/jobs
-            DEPENDS coveralls_generate
-            WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
-            COMMENT "Coveralls: uploading coveralls output...")
-
-        add_custom_target(coveralls DEPENDS coveralls_upload)
-    else()
-        message("COVERALLS UPLOAD: OFF")
-        add_custom_target(coveralls DEPENDS coveralls_generate)
-    endif()
-endfunction()
-
-if(WITH_COVERAGE)
-    set(CMAKE_BUILD_TYPE "Debug")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage")
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage")
-
-    set(EXCLUDE_DIRS
-        "demo/"
-        "build/"
-        "tests/"
-        ".test_env/"
-    )
-
-    if(WITH_GPU)
-        file(GLOB_RECURSE PADDLE_SOURCES RELATIVE "${PROJECT_SOURCE_DIR}" "*.cpp" "*.cc" "*.c" "*.cu")
-    else()
-        file(GLOB_RECURSE PADDLE_SOURCES RELATIVE "${PROJECT_SOURCE_DIR}" "*.cpp" "*.cc" "*.c")
-    endif()
-
-    # exclude trivial files in PADDLE_SOURCES
-    foreach(EXCLUDE_DIR ${EXCLUDE_DIRS})
-        foreach(TMP_PATH ${PADDLE_SOURCES})
-            string(FIND ${TMP_PATH} ${EXCLUDE_DIR} EXCLUDE_DIR_FOUND)
-            if(NOT ${EXCLUDE_DIR_FOUND} EQUAL -1)
-                list(REMOVE_ITEM PADDLE_SOURCES ${TMP_PATH})
-            endif()
-        endforeach(TMP_PATH)
-    endforeach()
-
-    # convert to absolute paths
-    set(PADDLE_SRCS "")
-    foreach(PADDLE_SRC ${PADDLE_SOURCES})
-        set(PADDLE_SRCS "${PADDLE_SRCS};${PROJECT_SOURCE_DIR}/${PADDLE_SRC}")
-    endforeach()
-
-    code_coverage(
-        "${PADDLE_SRCS}"
-        ${COVERALLS_UPLOAD}
-        "${PROJECT_SOURCE_DIR}/cmake"
-    )
-endif()
diff --git a/cmake/coverallsGcovJsons.cmake b/cmake/coverallsGcovJsons.cmake
deleted file mode 100644
index 4641184fcf..0000000000
--- a/cmake/coverallsGcovJsons.cmake
+++ /dev/null
@@ -1,401 +0,0 @@
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-#
-# Copyright (C) 2014 Joakim Söderberg
-#
-# This is intended to be run by a custom target in a CMake project like this.
-# 0. Compile the program with coverage support.
-# 1. Clear the coverage data. (Recursively delete *.gcda in the build dir)
-# 2. Run the unit tests.
-# 3. Run this script, specifying which source files the coverage should be performed on.
-#
-# This script will then use gcov to generate .gcov files in the directory specified
-# via the COV_PATH var. This should probably be the same as your cmake build dir.
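Concretely, the coveralls_generate target defined above drives this script in CMake script mode. Stripped of the variable plumbing, that invocation reduces to roughly the following (paths illustrative; note the '*' separators standing in for CMake's ';' list delimiter, which the script converts back):

    cmake -DCOVERAGE_SRCS="/src/lite/a.cc*/src/lite/b.cc" \
          -DCOVERALLS_OUTPUT_FILE=/build/coveralls.json \
          -DCOV_PATH=/build \
          -DPROJECT_ROOT=/src \
          -P cmake/coverallsGcovJsons.cmake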
-#
-# It then parses the .gcov files to convert them into the Coveralls JSON format:
-# https://coveralls.io/docs/api
-#
-
-CMAKE_MINIMUM_REQUIRED(VERSION 2.8)
-
-# Since it's not possible to pass a CMake list properly in the
-# "1;2;3" format to an external process, we have replaced the
-# ";" with "*", so reverse that here so we get it back into the
-# CMake list format.
-string(REGEX REPLACE "\\*" ";" COVERAGE_SRCS ${COVERAGE_SRCS})
-
-find_program(GCOV_EXECUTABLE gcov)
-if (NOT GCOV_EXECUTABLE)
-    message(FATAL_ERROR "gcov not found! Aborting...")
-endif()
-
-find_package(Git)
-
-# TODO: Add these git things to the coveralls json.
-if (GIT_FOUND)
-    # Branch.
-    execute_process(
-        COMMAND ${GIT_EXECUTABLE} rev-parse --abbrev-ref HEAD
-        WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
-        OUTPUT_VARIABLE GIT_BRANCH
-        OUTPUT_STRIP_TRAILING_WHITESPACE
-    )
-
-    macro (git_log_format FORMAT_CHARS VAR_NAME)
-        execute_process(
-            COMMAND ${GIT_EXECUTABLE} log -1 --pretty=format:%${FORMAT_CHARS}
-            WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
-            OUTPUT_VARIABLE ${VAR_NAME}
-            OUTPUT_STRIP_TRAILING_WHITESPACE
-        )
-    endmacro()
-
-    git_log_format(an GIT_AUTHOR_NAME)
-    git_log_format(ae GIT_AUTHOR_EMAIL)
-    git_log_format(cn GIT_COMMITTER_NAME)
-    git_log_format(ce GIT_COMMITTER_EMAIL)
-    git_log_format(B GIT_COMMIT_MESSAGE)
-
-    message("Git exe: ${GIT_EXECUTABLE}")
-    message("Git branch: ${GIT_BRANCH}")
-    message("Git author: ${GIT_AUTHOR_NAME}")
-    message("Git e-mail: ${GIT_AUTHOR_EMAIL}")
-    message("Git committer name: ${GIT_COMMITTER_NAME}")
-    message("Git committer e-mail: ${GIT_COMMITTER_EMAIL}")
-    message("Git commit message: ${GIT_COMMIT_MESSAGE}")
-
-endif()
-
-############################# Macros #########################################
-
-#
-# This macro converts from the full path format gcov outputs:
-#
-#    /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov
-#
-# to the original source file path the .gcov is for:
-#
-#   /path/to/project/root/subdir/the_file.c
-#
-macro(get_source_path_from_gcov_filename _SRC_FILENAME _GCOV_FILENAME)
-
-    # /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov
-    # ->
-    # #path#to#project#root#subdir#the_file.c.gcov
-    get_filename_component(_GCOV_FILENAME_WEXT ${_GCOV_FILENAME} NAME)
-
-    # #path#to#project#root#subdir#the_file.c.gcov -> /path/to/project/root/subdir/the_file.c
-    string(REGEX REPLACE "\\.gcov$" "" SRC_FILENAME_TMP ${_GCOV_FILENAME_WEXT})
-    string(REGEX REPLACE "\#" "/" SRC_FILENAME_TMP ${SRC_FILENAME_TMP})
-    set(${_SRC_FILENAME} "${SRC_FILENAME_TMP}")
-endmacro()
-
-##############################################################################
-
-# Get the coverage data.
-file(GLOB_RECURSE GCDA_FILES "${COV_PATH}" "*.gcda")
-message("Process GCDA files:")
-message("===============================")
-
-# Get a list of all the object directories needed by gcov
-# (The directories the .gcda files and .o files are found in)
-# and run gcov on those.
-foreach(GCDA ${GCDA_FILES})
-    get_filename_component(GCDA_DIR ${GCDA} PATH)
-
-    #
-    # The -p below refers to "Preserve path components".
-    # This means that the generated gcov filename of a source file will
-    # keep the original file's entire filepath, but / is replaced with #.
- # Example: - # - # /path/to/project/root/build/CMakeFiles/the_file.dir/subdir/the_file.c.gcda - # ------------------------------------------------------------------------------ - # File '/path/to/project/root/subdir/the_file.c' - # Lines executed:68.34% of 199 - # /path/to/project/root/subdir/the_file.c:creating '#path#to#project#root#subdir#the_file.c.gcov' - # - # If -p is not specified then the file is named only "the_file.c.gcov" - # - execute_process( - COMMAND ${GCOV_EXECUTABLE} -p -o ${GCDA_DIR} ${GCDA} >/dev/null - WORKING_DIRECTORY ${GCDA_DIR} - ) -endforeach() - -# TODO: Make these be absolute path -file(GLOB_RECURSE ALL_GCOV_FILES "${COV_PATH}" "*.gcov") - -# Get only the filenames to use for filtering. -#set(COVERAGE_SRCS_NAMES "") -#foreach (COVSRC ${COVERAGE_SRCS}) -# get_filename_component(COVSRC_NAME ${COVSRC} NAME) -# message("${COVSRC} -> ${COVSRC_NAME}") -# list(APPEND COVERAGE_SRCS_NAMES "${COVSRC_NAME}") -#endforeach() - -# -# Filter out all but the gcov files we want. -# -# We do this by comparing the list of COVERAGE_SRCS filepaths that the -# user wants the coverage data for with the paths of the generated .gcov files, -# so that we only keep the relevant gcov files. -# -# Example: -# COVERAGE_SRCS = -# /path/to/project/root/subdir/the_file.c -# -# ALL_GCOV_FILES = -# /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov -# /path/to/project/root/build/#path#to#project#root#subdir#other_file.c.gcov -# -# Result should be: -# GCOV_FILES = -# /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov -# -set(GCOV_FILES "") -#message("Look in coverage sources: ${COVERAGE_SRCS}") -message("\nFilter out unwanted GCOV files:") -message("===============================") - -set(COVERAGE_SRCS_REMAINING ${COVERAGE_SRCS}) - -foreach (GCOV_FILE ${ALL_GCOV_FILES}) - - # - # /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov - # -> - # /path/to/project/root/subdir/the_file.c - get_source_path_from_gcov_filename(GCOV_SRC_PATH ${GCOV_FILE}) - - # Is this in the list of source files? - # TODO: We want to match against relative path filenames from the source file root... - list(FIND COVERAGE_SRCS ${GCOV_SRC_PATH} WAS_FOUND) - - if (NOT WAS_FOUND EQUAL -1) - message("YES: ${GCOV_FILE}") - list(APPEND GCOV_FILES ${GCOV_FILE}) - - # We remove it from the list, so we don't bother searching for it again. - # Also files left in COVERAGE_SRCS_REMAINING after this loop ends should - # have coverage data generated from them (no lines are covered). - list(REMOVE_ITEM COVERAGE_SRCS_REMAINING ${GCOV_SRC_PATH}) - else() - message("NO: ${GCOV_FILE}") - endif() -endforeach() - -# TODO: Enable setting these -set(JSON_SERVICE_NAME "travis-ci") -set(JSON_SERVICE_JOB_ID $ENV{TRAVIS_JOB_ID}) - -set(JSON_TEMPLATE -"{ - \"service_name\": \"\@JSON_SERVICE_NAME\@\", - \"service_job_id\": \"\@JSON_SERVICE_JOB_ID\@\", - \"source_files\": \@JSON_GCOV_FILES\@ -}" -) - -set(SRC_FILE_TEMPLATE -"{ - \"name\": \"\@GCOV_SRC_REL_PATH\@\", - \"source_digest\": \"\@GCOV_CONTENTS_MD5\@\", - \"coverage\": \@GCOV_FILE_COVERAGE\@ - }" -) - -message("\nGenerate JSON for files:") -message("=========================") - -set(JSON_GCOV_FILES "[") - -# Read the GCOV files line by line and get the coverage data. -foreach (GCOV_FILE ${GCOV_FILES}) - - get_source_path_from_gcov_filename(GCOV_SRC_PATH ${GCOV_FILE}) - file(RELATIVE_PATH GCOV_SRC_REL_PATH "${PROJECT_ROOT}" "${GCOV_SRC_PATH}") - - # The new coveralls API doesn't need the entire source (Yay!) 
- # However, still keeping that part for now. Will cleanup in the future. - file(MD5 "${GCOV_SRC_PATH}" GCOV_CONTENTS_MD5) - message("MD5: ${GCOV_SRC_PATH} = ${GCOV_CONTENTS_MD5}") - - # Loads the gcov file as a list of lines. - # (We first open the file and replace all occurences of [] with _ - # because CMake will fail to parse a line containing unmatched brackets... - # also the \ to escaped \n in macros screws up things.) - # https://public.kitware.com/Bug/view.php?id=15369 - file(READ ${GCOV_FILE} GCOV_CONTENTS) - string(REPLACE "[" "_" GCOV_CONTENTS "${GCOV_CONTENTS}") - string(REPLACE "]" "_" GCOV_CONTENTS "${GCOV_CONTENTS}") - string(REPLACE "\\" "_" GCOV_CONTENTS "${GCOV_CONTENTS}") - file(WRITE ${GCOV_FILE}_tmp "${GCOV_CONTENTS}") - - file(STRINGS ${GCOV_FILE}_tmp GCOV_LINES) - list(LENGTH GCOV_LINES LINE_COUNT) - - # Instead of trying to parse the source from the - # gcov file, simply read the file contents from the source file. - # (Parsing it from the gcov is hard because C-code uses ; in many places - # which also happens to be the same as the CMake list delimeter). - file(READ ${GCOV_SRC_PATH} GCOV_FILE_SOURCE) - - string(REPLACE "\\" "\\\\" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}") - string(REGEX REPLACE "\"" "\\\\\"" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}") - string(REPLACE "\t" "\\\\t" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}") - string(REPLACE "\r" "\\\\r" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}") - string(REPLACE "\n" "\\\\n" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}") - # According to http://json.org/ these should be escaped as well. - # Don't know how to do that in CMake however... - #string(REPLACE "\b" "\\\\b" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}") - #string(REPLACE "\f" "\\\\f" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}") - #string(REGEX REPLACE "\u([a-fA-F0-9]{4})" "\\\\u\\1" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}") - - # We want a json array of coverage data as a single string - # start building them from the contents of the .gcov - set(GCOV_FILE_COVERAGE "[") - - set(GCOV_LINE_COUNT 1) # Line number for the .gcov. - set(DO_SKIP 0) - foreach (GCOV_LINE ${GCOV_LINES}) - #message("${GCOV_LINE}") - # Example of what we're parsing: - # Hitcount |Line | Source - # " 8: 26: if (!allowed || (strlen(allowed) == 0))" - string(REGEX REPLACE - "^([^:]*):([^:]*):(.*)$" - "\\1;\\2;\\3" - RES - "${GCOV_LINE}") - - # Check if we should exclude lines using the Lcov syntax. - string(REGEX MATCH "LCOV_EXCL_START" START_SKIP "${GCOV_LINE}") - string(REGEX MATCH "LCOV_EXCL_END" END_SKIP "${GCOV_LINE}") - string(REGEX MATCH "LCOV_EXCL_LINE" LINE_SKIP "${GCOV_LINE}") - - set(RESET_SKIP 0) - if (LINE_SKIP AND NOT DO_SKIP) - set(DO_SKIP 1) - set(RESET_SKIP 1) - endif() - - if (START_SKIP) - set(DO_SKIP 1) - message("${GCOV_LINE_COUNT}: Start skip") - endif() - - if (END_SKIP) - set(DO_SKIP 0) - endif() - - list(LENGTH RES RES_COUNT) - - if (RES_COUNT GREATER 2) - list(GET RES 0 HITCOUNT) - list(GET RES 1 LINE) - list(GET RES 2 SOURCE) - - string(STRIP ${HITCOUNT} HITCOUNT) - string(STRIP ${LINE} LINE) - - # Lines with 0 line numbers are metadata and can be ignored. - if (NOT ${LINE} EQUAL 0) - - if (DO_SKIP) - set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}null, ") - else() - # Translate the hitcount into valid JSON values. 
- if (${HITCOUNT} STREQUAL "#####") - set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}0, ") - elseif (${HITCOUNT} STREQUAL "-") - set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}null, ") - else() - set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}${HITCOUNT}, ") - endif() - endif() - endif() - else() - message(WARNING "Failed to properly parse line (RES_COUNT = ${RES_COUNT}) ${GCOV_FILE}:${GCOV_LINE_COUNT}\n-->${GCOV_LINE}") - endif() - - if (RESET_SKIP) - set(DO_SKIP 0) - endif() - math(EXPR GCOV_LINE_COUNT "${GCOV_LINE_COUNT}+1") - endforeach() - - message("${GCOV_LINE_COUNT} of ${LINE_COUNT} lines read!") - - # Advanced way of removing the trailing comma in the JSON array. - # "[1, 2, 3, " -> "[1, 2, 3" - string(REGEX REPLACE ",[ ]*$" "" GCOV_FILE_COVERAGE ${GCOV_FILE_COVERAGE}) - - # Append the trailing ] to complete the JSON array. - set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}]") - - # Generate the final JSON for this file. - message("Generate JSON for file: ${GCOV_SRC_REL_PATH}...") - string(CONFIGURE ${SRC_FILE_TEMPLATE} FILE_JSON) - - set(JSON_GCOV_FILES "${JSON_GCOV_FILES}${FILE_JSON}, ") -endforeach() - -# Loop through all files we couldn't find any coverage for -# as well, and generate JSON for those as well with 0% coverage. -foreach(NOT_COVERED_SRC ${COVERAGE_SRCS_REMAINING}) - - # Loads the source file as a list of lines. - file(STRINGS ${NOT_COVERED_SRC} SRC_LINES) - - set(GCOV_FILE_COVERAGE "[") - set(GCOV_FILE_SOURCE "") - - foreach (SOURCE ${SRC_LINES}) - set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}0, ") - - string(REPLACE "\\" "\\\\" SOURCE "${SOURCE}") - string(REGEX REPLACE "\"" "\\\\\"" SOURCE "${SOURCE}") - string(REPLACE "\t" "\\\\t" SOURCE "${SOURCE}") - string(REPLACE "\r" "\\\\r" SOURCE "${SOURCE}") - set(GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}${SOURCE}\\n") - endforeach() - - # Remove trailing comma, and complete JSON array with ] - string(REGEX REPLACE ",[ ]*$" "" GCOV_FILE_COVERAGE ${GCOV_FILE_COVERAGE}) - set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}]") - - # Generate the final JSON for this file. - string(CONFIGURE ${SRC_FILE_TEMPLATE} FILE_JSON) - set(JSON_GCOV_FILES "${JSON_GCOV_FILES}${FILE_JSON}, ") -endforeach() - -# Get rid of trailing comma. -string(REGEX REPLACE ",[ ]*$" "" JSON_GCOV_FILES ${JSON_GCOV_FILES}) -set(JSON_GCOV_FILES "${JSON_GCOV_FILES}]") - -# Generate the final complete JSON! -message("Generate final JSON...") -string(CONFIGURE ${JSON_TEMPLATE} JSON) - -file(WRITE "${COVERALLS_OUTPUT_FILE}" "${JSON}") -message("###########################################################################") -message("Generated coveralls JSON containing coverage data:") -message("${COVERALLS_OUTPUT_FILE}") -message("###########################################################################") diff --git a/cmake/cross_compiling/android.cmake b/cmake/cross_compiling/android.cmake deleted file mode 100644 index 11a803ff03..0000000000 --- a/cmake/cross_compiling/android.cmake +++ /dev/null @@ -1,85 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -if(NOT ARM_TARGET_OS STREQUAL "android") - return() -endif() - -set(ANDROID TRUE) -add_definitions(-DLITE_WITH_LINUX) - -if(NOT DEFINED ANDROID_NDK) - set(ANDROID_NDK $ENV{NDK_ROOT}) - if(NOT ANDROID_NDK) - message(FATAL_ERROR "Must set ANDROID_NDK or env NDK_ROOT") - endif() -endif() - -if(ARM_TARGET_LANG STREQUAL "gcc") - # gcc do not need set lang on android - set(ARM_TARGET_LANG "") -endif() - -if(NOT DEFINED ANDROID_API_LEVEL) - set(ANDROID_API_LEVEL "22") -endif() - -# then check input arm abi -if(ARM_TARGET_ARCH_ABI STREQUAL "armv7hf") - message(FATAL_ERROR "ANDROID does not support hardfp on v7 use armv7 instead.") -endif() - -set(ANDROID_ARCH_ABI ${ARM_TARGET_ARCH_ABI} CACHE STRING "Choose Android Arch ABI") -if(ARM_TARGET_ARCH_ABI STREQUAL "armv8") - set(ANDROID_ARCH_ABI "arm64-v8a") -endif() - -if(ARM_TARGET_ARCH_ABI STREQUAL "armv7") - set(ANDROID_ARCH_ABI "armeabi-v7a") -endif() - -check_input_var(ANDROID_ARCH_ABI DEFAULT ${ANDROID_ARCH_ABI} LIST "arm64-v8a" "armeabi-v7a" - "armeabi-v6" "armeabi" "mips" "mips64" "x86" "x86_64") -check_input_var(ANDROID_STL_TYPE DEFAULT "c++_static" LIST "c++_static" "gnustl_static" "c++_shared") - -if(ANDROID_ARCH_ABI STREQUAL "armeabi-v7a") - message(STATUS "armeabi-v7a use softfp by default.") - set(CMAKE_ANDROID_ARM_NEON ON) - message(STATUS "NEON is enabled on arm-v7a with softfp.") -endif() - -set(CMAKE_SYSTEM_NAME Android) -set(CMAKE_SYSTEM_VERSION ${ANDROID_API_LEVEL}) -set(CMAKE_ANDROID_ARCH_ABI ${ANDROID_ARCH_ABI}) -set(CMAKE_ANDROID_NDK ${ANDROID_NDK}) -set(CMAKE_ANDROID_NDK_TOOLCHAIN_VERSION ${ARM_TARGET_LANG}) -set(CMAKE_ANDROID_STL_TYPE ${ANDROID_STL_TYPE}) - -if (ARM_TARGET_LANG STREQUAL "clang") - if(ARM_TARGET_ARCH_ABI STREQUAL "armv8") - set(triple aarch64-v8a-linux-android) - elseif(ARM_TARGET_ARCH_ABI STREQUAL "armv7") - set(triple arm-v7a-linux-android) - set(LITE_WITH_OPENMP OFF CACHE STRING "Due to libomp's bug(For ARM64, it has been fixed by https://reviews.llvm.org/D19879, but still exists on ARM32), disable OpenMP on armv7 when cross-compiling using Clang" FORCE) - else() - message(FATAL_ERROR "Clang do not support this ${ARM_TARGET_ARCH_ABI}, use armv8 or armv7") - endif() - - set(CMAKE_C_COMPILER clang) - set(CMAKE_C_COMPILER_TARGET ${triple}) - set(CMAKE_CXX_COMPILER clang++) - set(CMAKE_CXX_COMPILER_TARGET ${triple}) - - message(STATUS "CMAKE_CXX_COMPILER_TARGET: ${CMAKE_CXX_COMPILER_TARGET}") -endif() diff --git a/cmake/cross_compiling/armlinux.cmake b/cmake/cross_compiling/armlinux.cmake deleted file mode 100644 index 98f23d4300..0000000000 --- a/cmake/cross_compiling/armlinux.cmake +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
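Both toolchain fragments are keyed off ARM_TARGET_OS: android.cmake above, armlinux.cmake below. A sketch of the configure lines that would select each of them, with ABI values taken from the check_input_var() whitelists (the build directory is illustrative):

    # Android, 64-bit ARM, clang (needs ANDROID_NDK set, or the NDK_ROOT env var):
    cmake .. -DARM_TARGET_OS=android -DARM_TARGET_ARCH_ABI=armv8 \
             -DARM_TARGET_LANG=clang -DANDROID_STL_TYPE=c++_static

    # ARM Linux with the hard-float ABI (expects the arm-linux-gnueabihf-* cross tools):
    cmake .. -DARM_TARGET_OS=armlinux -DARM_TARGET_ARCH_ABI=armv7hf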
- -if(NOT ARM_TARGET_OS STREQUAL "armlinux") - return() -endif() - -set(ARMLINUX TRUE) -add_definitions(-DLITE_WITH_LINUX) -set(CMAKE_SYSTEM_NAME Linux) - -check_input_var(ARMLINUX_ARCH_ABI DEFAULT ${ARM_TARGET_ARCH_ABI} LIST "armv8" "armv7" "armv7hf") - -if(ARMLINUX_ARCH_ABI STREQUAL "armv8") - set(CMAKE_SYSTEM_PROCESSOR aarch64) - set(CMAKE_C_COMPILER "aarch64-linux-gnu-gcc") - set(CMAKE_CXX_COMPILER "aarch64-linux-gnu-g++") -endif() - -if(ARMLINUX_ARCH_ABI STREQUAL "armv7") - set(CMAKE_SYSTEM_PROCESSOR arm) - set(CMAKE_C_COMPILER "arm-linux-gnueabi-gcc") - set(CMAKE_CXX_COMPILER "arm-linux-gnueabi-g++") -endif() - -if(ARMLINUX_ARCH_ABI STREQUAL "armv7hf") - set(CMAKE_SYSTEM_PROCESSOR arm) - set(CMAKE_C_COMPILER "arm-linux-gnueabihf-gcc") - set(CMAKE_CXX_COMPILER "arm-linux-gnueabihf-g++") -endif() diff --git a/cmake/cross_compiling/findar.cmake b/cmake/cross_compiling/findar.cmake deleted file mode 100644 index bcb0dc70fd..0000000000 --- a/cmake/cross_compiling/findar.cmake +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -if(NOT ARM_TARGET_LANG STREQUAL "clang") - # only clang need find ar tool - return() -endif() - -if(NOT EXISTS "${CMAKE_CXX_COMPILER}") - message(ERROR "Can not find CMAKE_CXX_COMPILER ${CMAKE_CXX_COMPILER}") -endif() - -get_filename_component(AR_PATH ${CMAKE_CXX_COMPILER} PATH) - -find_file(AR_TOOL NAMES llvm-ar PATHS ${AR_PATH}) - -if(NOT AR_TOOL) - message(ERROR "Failed to find AR_TOOL in ${AR_PATH}") -else() - set(CMAKE_AR ${AR_TOOL}) - message(STATUS "Found CMAKE_AR : " ${CMAKE_AR}) -endif() diff --git a/cmake/cross_compiling/host.cmake b/cmake/cross_compiling/host.cmake deleted file mode 100644 index b76dd60046..0000000000 --- a/cmake/cross_compiling/host.cmake +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set(HOST_C_COMPILER $ENV{CC}) -set(HOST_CXX_COMPILER $ENV{CXX}) - -if(IOS) - set(default_cc clang) - set(default_cxx clang++) -else() - set(default_cc gcc) - set(default_cxx g++) -endif() - -if(NOT HOST_C_COMPILER) - find_program(HOST_C_COMPILER NAMES ${default_cc} PATH - /usr/bin - /usr/local/bin) -endif() - -if(NOT HOST_CXX_COMPILER) - find_program(HOST_CXX_COMPILER NAMES ${default_cxx} PATH - /usr/bin - /usr/local/bin) -endif() - -if(NOT HOST_C_COMPILER OR NOT EXISTS ${HOST_C_COMPILER}) - MESSAGE(FATAL_ERROR "Cannot find host C compiler. 
export CC=/path/to/cc")
-ENDIF()
-
-if(NOT HOST_CXX_COMPILER OR NOT EXISTS ${HOST_CXX_COMPILER})
-    MESSAGE(FATAL_ERROR "Cannot find host CXX compiler. export CXX=/path/to/c++")
-ENDIF()
-
-MESSAGE(STATUS "Found host C compiler: " ${HOST_C_COMPILER})
-MESSAGE(STATUS "Found host CXX compiler: " ${HOST_CXX_COMPILER})
-
diff --git a/cmake/cross_compiling/ios.cmake b/cmake/cross_compiling/ios.cmake
deleted file mode 100644
index 76f62765af..0000000000
--- a/cmake/cross_compiling/ios.cmake
+++ /dev/null
@@ -1,692 +0,0 @@
-# This file is part of the ios-cmake project. It was retrieved from
-# https://github.com/cristeab/ios-cmake.git, which is a fork of
-# https://code.google.com/p/ios-cmake/. Which in turn is based off of
-# the Platform/Darwin.cmake and Platform/UnixPaths.cmake files which
-# are included with CMake 2.8.4
-#
-# The ios-cmake project is licensed under the new BSD license.
-#
-# Copyright (c) 2014, Bogdan Cristea and LTE Engineering Software,
-# Kitware, Inc., Insight Software Consortium. All rights reserved.
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions
-# are met:
-# 1. Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in the
-# documentation and/or other materials provided with the distribution.
-#
-# 3. Neither the name of the copyright holder nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-# POSSIBILITY OF SUCH DAMAGE.
-#
-# This file is based off of the Platform/Darwin.cmake and
-# Platform/UnixPaths.cmake files which are included with CMake 2.8.4
-# It has been altered for iOS development.
-#
-# Updated by Alex Stewart (alexs.mac@gmail.com)
-#
-# *****************************************************************************
-#      Now maintained by Alexander Widerberg (widerbergaren [at] gmail.com)
-#                      under the BSD-3-Clause license
-#                   https://github.com/leetal/ios-cmake
-# *****************************************************************************
-#
-#                           INFORMATION / HELP
-#
-# The following arguments control the behaviour of this toolchain:
-#
-# PLATFORM: (default "OS")
-#    OS = Build for iPhoneOS.
-#    OS64 = Build for arm64 iphoneOS.
-#    OS64COMBINED = Build for arm64 x86_64 iphoneOS. Combined into FAT STATIC lib (supported on 3.14+ of CMake with "-G Xcode" argument ONLY)
-#    SIMULATOR = Build for x86 i386 iphoneOS Simulator.
-#    SIMULATOR64 = Build for x86_64 iphoneOS Simulator.
-#    TVOS = Build for arm64 tvOS.
-# TVOSCOMBINED = Build for arm64 x86_64 tvOS. Combined into FAT STATIC lib (supported on 3.14+ of CMake with "-G Xcode" argument ONLY) -# SIMULATOR_TVOS = Build for x86_64 tvOS Simulator. -# WATCHOS = Build for armv7k arm64_32 for watchOS. -# WATCHOSCOMBINED = Build for armv7k arm64_32 x86_64 watchOS. Combined into FAT STATIC lib (supported on 3.14+ of CMake with "-G Xcode" argument ONLY) -# SIMULATOR_WATCHOS = Build for x86_64 for watchOS Simulator. -# -# CMAKE_OSX_SYSROOT: Path to the SDK to use. By default this is -# automatically determined from PLATFORM and xcodebuild, but -# can also be manually specified (although this should not be required). -# -# CMAKE_DEVELOPER_ROOT: Path to the Developer directory for the platform -# being compiled for. By default this is automatically determined from -# CMAKE_OSX_SYSROOT, but can also be manually specified (although this should -# not be required). -# -# DEPLOYMENT_TARGET: Minimum SDK version to target. Default 2.0 on watchOS and 9.0 on tvOS+iOS -# -# ENABLE_BITCODE: (1|0) Enables or disables bitcode support. Default 1 (true) -# -# ENABLE_ARC: (1|0) Enables or disables ARC support. Default 1 (true, ARC enabled by default) -# -# ENABLE_VISIBILITY: (1|0) Enables or disables symbol visibility support. Default 0 (false, visibility hidden by default) -# -# ARCHS: (armv7 armv7s armv7k arm64 arm64_32 i386 x86_64) If specified, will override the default architectures for the given PLATFORM -# OS = armv7 armv7s arm64 (if applicable) -# OS64 = arm64 (if applicable) -# SIMULATOR = i386 -# SIMULATOR64 = x86_64 -# TVOS = arm64 -# SIMULATOR_TVOS = x86_64 (i386 has since long been deprecated) -# WATCHOS = armv7k arm64_32 (if applicable) -# SIMULATOR_WATCHOS = x86_64 (i386 has since long been deprecated) -# -# This toolchain defines the following variables for use externally: -# -# XCODE_VERSION: Version number (not including Build version) of Xcode detected. -# SDK_VERSION: Version of SDK being used. -# CMAKE_OSX_ARCHITECTURES: Architectures being compiled for (generated from PLATFORM). -# -# This toolchain defines the following macros for use externally: -# -# set_xcode_property (TARGET XCODE_PROPERTY XCODE_VALUE XCODE_VARIANT) -# A convenience macro for setting xcode specific properties on targets. -# Available variants are: All, Release, RelWithDebInfo, Debug, MinSizeRel -# example: set_xcode_property (myioslib IPHONEOS_DEPLOYMENT_TARGET "3.1" "all"). -# -# find_host_package (PROGRAM ARGS) -# A macro used to find executable programs on the host system, not within the -# environment. Thanks to the android-cmake project for providing the -# command. 
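Pulling the PLATFORM, ARCHS and DEPLOYMENT_TARGET knobs together, a direct invocation of this toolchain might look like the sketch below. In this tree the file is normally selected indirectly, via ARM_TARGET_OS=ios or ios64 in the Lite settings further down, so treat this as illustrative only:

    cmake .. -DCMAKE_TOOLCHAIN_FILE=cmake/cross_compiling/ios.cmake \
             -DPLATFORM=OS64 -DDEPLOYMENT_TARGET=9.0 \
             -DENABLE_BITCODE=1 -DARCHS=arm64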
-# -# ******************************** DEPRECATIONS ******************************* -# -# IOS_DEPLOYMENT_TARGET: (Deprecated) Alias to DEPLOYMENT_TARGET -# CMAKE_IOS_DEVELOPER_ROOT: (Deprecated) Alias to CMAKE_DEVELOPER_ROOT -# IOS_PLATFORM: (Deprecated) Alias to PLATFORM -# IOS_ARCH: (Deprecated) Alias to ARCHS -# -# ***************************************************************************** -# - -## Lite settings -if (ARM_TARGET_OS STREQUAL "ios") - set(PLATFORM "OS") -elseif(ARM_TARGET_OS STREQUAL "ios64") - set(PLATFORM "OS64") -else() - return() -endif() -add_definitions(-DTARGET_IOS) - -# if do not specify the ARM_TARGET_ARCH_ABI then use default all supported -if(ARM_TARGET_ARCH_ABI STREQUAL "armv7" - OR ARM_TARGET_ARCH_ABI STREQUAL "armv7hf" - OR ARM_TARGET_ARCH_ABI STREQUAL "armeabi-v7a") - set(ARCHS "armv7") -elseif(ARM_TARGET_ARCH_ABI STREQUAL "armv8" - OR ARM_TARGET_ARCH_ABI STREQUAL "arm64-v8a") - set(ARCHS "arm64") -# else() all default choice: armv7 armv7s arm64 -endif() - -if(PLATFORM STREQUAL "OS64" AND ARCHS STREQUAL "armv7") - message(FATAL_ERROR "Can not build IOS64 with armv7") -endif() - -# TODO(xxx): enable omp on ios -set(LITE_WITH_OPENMP OFF CACHE STRING "Disable OpenMP when cross-compiling for Android and iOS" FORCE) -set(ARM_TARGET_LANG "clang" CACHE STRING "Force use clang on IOS" FORCE) - -add_definitions(-DLITE_WITH_IPHONE) -## End lite settings - -# Fix for PThread library not in path -set(CMAKE_THREAD_LIBS_INIT "-lpthread") -set(CMAKE_HAVE_THREADS_LIBRARY 1) -set(CMAKE_USE_WIN32_THREADS_INIT 0) -set(CMAKE_USE_PTHREADS_INIT 1) - -# Cache what generator is used -set(USED_CMAKE_GENERATOR "${CMAKE_GENERATOR}" CACHE STRING "Expose CMAKE_GENERATOR" FORCE) - -if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.14") - set(MODERN_CMAKE YES) - message(STATUS "Merging integrated CMake 3.14+ iOS,tvOS,watchOS,macOS toolchain(s) with this toolchain!") -endif() - -# Get the Xcode version being used. -execute_process(COMMAND xcodebuild -version - OUTPUT_VARIABLE XCODE_VERSION - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) -string(REGEX MATCH "Xcode [0-9\\.]+" XCODE_VERSION "${XCODE_VERSION}") -string(REGEX REPLACE "Xcode ([0-9\\.]+)" "\\1" XCODE_VERSION "${XCODE_VERSION}") -message(STATUS "Building with Xcode version: ${XCODE_VERSION}") - -######## ALIASES (DEPRECATION WARNINGS) - -if(DEFINED IOS_PLATFORM) - set(PLATFORM ${IOS_PLATFORM}) - message(DEPRECATION "IOS_PLATFORM argument is DEPRECATED. Consider using the new PLATFORM argument instead.") -endif() - -if(DEFINED IOS_DEPLOYMENT_TARGET) - set(DEPLOYMENT_TARGET ${IOS_DEPLOYMENT_TARGET}) - message(DEPRECATION "IOS_DEPLOYMENT_TARGET argument is DEPRECATED. Consider using the new DEPLOYMENT_TARGET argument instead.") -endif() - -if(DEFINED CMAKE_IOS_DEVELOPER_ROOT) - set(CMAKE_DEVELOPER_ROOT ${CMAKE_IOS_DEVELOPER_ROOT}) - message(DEPRECATION "CMAKE_IOS_DEVELOPER_ROOT argument is DEPRECATED. Consider using the new CMAKE_DEVELOPER_ROOT argument instead.") -endif() - -if(DEFINED IOS_ARCH) - set(ARCHS ${IOS_ARCH}) - message(DEPRECATION "IOS_ARCH argument is DEPRECATED. Consider using the new ARCHS argument instead.") -endif() - -######## END ALIASES - -# Unset the FORCE on cache variables if in try_compile() -set(FORCE_CACHE FORCE) -get_property(_CMAKE_IN_TRY_COMPILE GLOBAL PROPERTY IN_TRY_COMPILE) -if(_CMAKE_IN_TRY_COMPILE) - unset(FORCE_CACHE) -endif() - -# Default to building for iPhoneOS if not specified otherwise, and we cannot -# determine the platform from the CMAKE_OSX_ARCHITECTURES variable. 
The use -# of CMAKE_OSX_ARCHITECTURES is such that try_compile() projects can correctly -# determine the value of PLATFORM from the root project, as -# CMAKE_OSX_ARCHITECTURES is propagated to them by CMake. -if(NOT DEFINED PLATFORM) - if (CMAKE_OSX_ARCHITECTURES) - if(CMAKE_OSX_ARCHITECTURES MATCHES ".*arm.*" AND CMAKE_OSX_SYSROOT MATCHES ".*iphoneos.*") - set(PLATFORM "OS") - elseif(CMAKE_OSX_ARCHITECTURES MATCHES "i386" AND CMAKE_OSX_SYSROOT MATCHES ".*iphonesimulator.*") - set(PLATFORM "SIMULATOR") - elseif(CMAKE_OSX_ARCHITECTURES MATCHES "x86_64" AND CMAKE_OSX_SYSROOT MATCHES ".*iphonesimulator.*") - set(PLATFORM "SIMULATOR64") - elseif(CMAKE_OSX_ARCHITECTURES MATCHES "arm64" AND CMAKE_OSX_SYSROOT MATCHES ".*appletvos.*") - set(PLATFORM "TVOS") - elseif(CMAKE_OSX_ARCHITECTURES MATCHES "x86_64" AND CMAKE_OSX_SYSROOT MATCHES ".*appletvsimulator.*") - set(PLATFORM "SIMULATOR_TVOS") - elseif(CMAKE_OSX_ARCHITECTURES MATCHES ".*armv7k.*" AND CMAKE_OSX_SYSROOT MATCHES ".*watchos.*") - set(PLATFORM "WATCHOS") - elseif(CMAKE_OSX_ARCHITECTURES MATCHES "i386" AND CMAKE_OSX_SYSROOT MATCHES ".*watchsimulator.*") - set(PLATFORM "SIMULATOR_WATCHOS") - endif() - endif() - if (NOT PLATFORM) - set(PLATFORM "OS") - endif() -endif() - -set(PLATFORM_INT "${PLATFORM}" CACHE STRING "Type of platform for which the build targets.") - -# Handle the case where we are targeting iOS with a deployment target of 10.0 or above (32-bit support officially dropped) -if(PLATFORM_INT STREQUAL "OS" AND DEPLOYMENT_TARGET VERSION_GREATER_EQUAL 10.0) - set(PLATFORM_INT "OS64") - message(STATUS "Targeting minimum SDK version ${DEPLOYMENT_TARGET}. Dropping 32-bit support.") -elseif(PLATFORM_INT STREQUAL "SIMULATOR" AND DEPLOYMENT_TARGET VERSION_GREATER_EQUAL 10.0) - set(PLATFORM_INT "SIMULATOR64") - message(STATUS "Targeting minimum SDK version ${DEPLOYMENT_TARGET}. Dropping 32-bit support.") -endif() - -# Determine the platform name and architectures for use in xcodebuild commands -# from the specified PLATFORM name. -if(PLATFORM_INT STREQUAL "OS") - set(SDK_NAME iphoneos) - if(NOT ARCHS) - set(ARCHS armv7 armv7s arm64) - endif() -elseif(PLATFORM_INT STREQUAL "OS64") - set(SDK_NAME iphoneos) - if(NOT ARCHS) - if (XCODE_VERSION VERSION_GREATER 10.0) - set(ARCHS arm64) # Add arm64e once Apple has fixed the integration issues with it; libarclite_iphoneos.a is currently missing bitcode markers, for example - else() - set(ARCHS arm64) - endif() - endif() -elseif(PLATFORM_INT STREQUAL "OS64COMBINED") - set(SDK_NAME iphoneos) - if(MODERN_CMAKE) - if(NOT ARCHS) - if (XCODE_VERSION VERSION_GREATER 10.0) - set(ARCHS arm64 x86_64) # Add arm64e once Apple has fixed the integration issues with it; libarclite_iphoneos.a is currently missing bitcode markers, for example - else() - set(ARCHS arm64 x86_64) - endif() - endif() - else() - message(FATAL_ERROR "Please make sure that you are running CMake 3.14+ to make the OS64COMBINED setting work") - endif() -elseif(PLATFORM_INT STREQUAL "SIMULATOR") - set(SDK_NAME iphonesimulator) - if(NOT ARCHS) - set(ARCHS i386) - endif() - message(DEPRECATION "SIMULATOR IS DEPRECATED.
Consider using SIMULATOR64 instead.") -elseif(PLATFORM_INT STREQUAL "SIMULATOR64") - set(SDK_NAME iphonesimulator) - if(NOT ARCHS) - set(ARCHS x86_64) - endif() -elseif(PLATFORM_INT STREQUAL "TVOS") - set(SDK_NAME appletvos) - if(NOT ARCHS) - set(ARCHS arm64) - endif() -elseif (PLATFORM_INT STREQUAL "TVOSCOMBINED") - set(SDK_NAME appletvos) - if(MODERN_CMAKE) - if(NOT ARCHS) - set(ARCHS arm64 x86_64) - endif() - else() - message(FATAL_ERROR "Please make sure that you are running CMake 3.14+ to make the TVOSCOMBINED setting work") - endif() -elseif(PLATFORM_INT STREQUAL "SIMULATOR_TVOS") - set(SDK_NAME appletvsimulator) - if(NOT ARCHS) - set(ARCHS x86_64) - endif() -elseif(PLATFORM_INT STREQUAL "WATCHOS") - set(SDK_NAME watchos) - if(NOT ARCHS) - if (XCODE_VERSION VERSION_GREATER 10.0) - set(ARCHS armv7k arm64_32) - else() - set(ARCHS armv7k) - endif() - endif() -elseif(PLATFORM_INT STREQUAL "WATCHOSCOMBINED") - set(SDK_NAME watchos) - if(MODERN_CMAKE) - if(NOT ARCHS) - if (XCODE_VERSION VERSION_GREATER 10.0) - set(ARCHS armv7k arm64_32 i386) - else() - set(ARCHS armv7k i386) - endif() - endif() - else() - message(FATAL_ERROR "Please make sure that you are running CMake 3.14+ to make the WATCHOSCOMBINED setting work") - endif() -elseif(PLATFORM_INT STREQUAL "SIMULATOR_WATCHOS") - set(SDK_NAME watchsimulator) - if(NOT ARCHS) - set(ARCHS i386) - endif() -else() - message(FATAL_ERROR "Invalid PLATFORM: ${PLATFORM_INT}") -endif() -message(STATUS "Configuring ${SDK_NAME} build for platform: ${PLATFORM_INT}, architecture(s): ${ARCHS}") - -if(MODERN_CMAKE AND PLATFORM_INT MATCHES ".*COMBINED" AND NOT USED_CMAKE_GENERATOR MATCHES "Xcode") - message(FATAL_ERROR "The COMBINED options only work with the Xcode generator, -G Xcode") -endif() - -# If the user did not specify the SDK root to use, then query xcodebuild for it. -execute_process(COMMAND xcodebuild -version -sdk ${SDK_NAME} Path - OUTPUT_VARIABLE CMAKE_OSX_SYSROOT_INT - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) -if (NOT DEFINED CMAKE_OSX_SYSROOT_INT AND NOT DEFINED CMAKE_OSX_SYSROOT) - message(SEND_ERROR "Please make sure that Xcode is installed and that the toolchain " - "is pointing to the correct path. Please run: " - "sudo xcode-select -s /Applications/Xcode.app/Contents/Developer " - "and see if that fixes the problem for you.") - message(FATAL_ERROR "Invalid CMAKE_OSX_SYSROOT: ${CMAKE_OSX_SYSROOT} " - "does not exist.") -elseif(DEFINED CMAKE_OSX_SYSROOT) - message(STATUS "Using SDK: ${CMAKE_OSX_SYSROOT} for platform: ${PLATFORM_INT} when checking compatibility") -elseif(DEFINED CMAKE_OSX_SYSROOT_INT) - message(STATUS "Using SDK: ${CMAKE_OSX_SYSROOT_INT} for platform: ${PLATFORM_INT}") - set(CMAKE_OSX_SYSROOT "${CMAKE_OSX_SYSROOT_INT}" CACHE INTERNAL "") -endif() - -# Set the Xcode property for SDKROOT as well if the Xcode generator is used -if(USED_CMAKE_GENERATOR MATCHES "Xcode") - set(CMAKE_OSX_SYSROOT "${SDK_NAME}" CACHE INTERNAL "") - if(NOT DEFINED CMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM) - set(CMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM 123456789A CACHE INTERNAL "") - endif() -endif() - -# Specify the minimum version of the deployment target. -if(NOT DEFINED DEPLOYMENT_TARGET) - if (PLATFORM_INT STREQUAL "WATCHOS" OR PLATFORM_INT STREQUAL "SIMULATOR_WATCHOS") - # Unless specified, SDK version 2.0 is used by default as the minimum target version (watchOS). - set(DEPLOYMENT_TARGET "2.0" - CACHE STRING "Minimum SDK version to build for." ) - else() - # Unless specified, SDK version 9.0 is used by default as the minimum target version (iOS, tvOS).
- set(DEPLOYMENT_TARGET "9.0" - CACHE STRING "Minimum SDK version to build for." ) - endif() - message(STATUS "Using the default min-version since DEPLOYMENT_TARGET not provided!") -endif() -# Use bitcode or not -if(NOT DEFINED ENABLE_BITCODE AND NOT ARCHS MATCHES "((^|, )(i386|x86_64))+") - # Unless specified, enable bitcode support by default - message(STATUS "Enabling bitcode support by default. ENABLE_BITCODE not provided!") - set(ENABLE_BITCODE TRUE) -elseif(NOT DEFINED ENABLE_BITCODE) - message(STATUS "Disabling bitcode support by default on simulators. ENABLE_BITCODE not provided for override!") - set(ENABLE_BITCODE FALSE) -endif() -set(ENABLE_BITCODE_INT ${ENABLE_BITCODE} CACHE BOOL "Whether or not to enable bitcode" ${FORCE_CACHE}) -# Use ARC or not -if(NOT DEFINED ENABLE_ARC) - # Unless specified, enable ARC support by default - set(ENABLE_ARC TRUE) - message(STATUS "Enabling ARC support by default. ENABLE_ARC not provided!") -endif() -set(ENABLE_ARC_INT ${ENABLE_ARC} CACHE BOOL "Whether or not to enable ARC" ${FORCE_CACHE}) -# Use hidden visibility or not -if(NOT DEFINED ENABLE_VISIBILITY) - # Unless specified, disable symbol visibility by default - set(ENABLE_VISIBILITY FALSE) - message(STATUS "Hiding symbols by default. ENABLE_VISIBILITY not provided!") -endif() -set(ENABLE_VISIBILITY_INT ${ENABLE_VISIBILITY} CACHE BOOL "Whether or not to hide symbols (-fvisibility=hidden)" ${FORCE_CACHE}) -# Get the SDK version information. -execute_process(COMMAND xcodebuild -sdk ${CMAKE_OSX_SYSROOT} -version SDKVersion - OUTPUT_VARIABLE SDK_VERSION - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) - -# Find the Developer root for the specific iOS platform being compiled for -# from CMAKE_OSX_SYSROOT. Should be ../../ from the SDK specified in -# CMAKE_OSX_SYSROOT. There does not appear to be a direct way to obtain -# this information from xcrun or xcodebuild. -if (NOT DEFINED CMAKE_DEVELOPER_ROOT AND NOT USED_CMAKE_GENERATOR MATCHES "Xcode") - get_filename_component(PLATFORM_SDK_DIR ${CMAKE_OSX_SYSROOT} PATH) - get_filename_component(CMAKE_DEVELOPER_ROOT ${PLATFORM_SDK_DIR} PATH) - - if (NOT DEFINED CMAKE_DEVELOPER_ROOT) - message(FATAL_ERROR "Invalid CMAKE_DEVELOPER_ROOT: " - "${CMAKE_DEVELOPER_ROOT} does not exist.") - endif() -endif() -# Find the C & C++ compilers for the specified SDK. -if(NOT CMAKE_C_COMPILER) - execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find clang - OUTPUT_VARIABLE CMAKE_C_COMPILER - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) - message(STATUS "Using C compiler: ${CMAKE_C_COMPILER}") -endif() -if(NOT CMAKE_CXX_COMPILER) - execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find clang++ - OUTPUT_VARIABLE CMAKE_CXX_COMPILER - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) - message(STATUS "Using CXX compiler: ${CMAKE_CXX_COMPILER}") -endif() -# Find (Apple's) libtool. -execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find libtool - OUTPUT_VARIABLE BUILD_LIBTOOL - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) -message(STATUS "Using libtool: ${BUILD_LIBTOOL}") -# Configure libtool to be used instead of ar + ranlib to build static libraries. -# This is required on Xcode 7+, but should also work on previous versions of -# Xcode. The <TARGET> <LINK_FLAGS> <OBJECTS> placeholders are expanded by CMake -# when the archive rule runs. -set(CMAKE_C_CREATE_STATIC_LIBRARY - "${BUILD_LIBTOOL} -static -o <TARGET> <LINK_FLAGS> <OBJECTS> ") -set(CMAKE_CXX_CREATE_STATIC_LIBRARY - "${BUILD_LIBTOOL} -static -o <TARGET> <LINK_FLAGS> <OBJECTS> ") -# Get the version of Darwin (OS X) of the host.
-execute_process(COMMAND uname -r - OUTPUT_VARIABLE CMAKE_HOST_SYSTEM_VERSION - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) -# CMake 3.14+ supports building for iOS, watchOS and tvOS out of the box. -if(MODERN_CMAKE) - if(SDK_NAME MATCHES "iphone") - set(CMAKE_SYSTEM_NAME iOS CACHE INTERNAL "" ${FORCE_CACHE}) - elseif(SDK_NAME MATCHES "appletv") - set(CMAKE_SYSTEM_NAME tvOS CACHE INTERNAL "" ${FORCE_CACHE}) - elseif(SDK_NAME MATCHES "watch") - set(CMAKE_SYSTEM_NAME watchOS CACHE INTERNAL "" ${FORCE_CACHE}) - endif() - - # Provide flags for a combined FAT library build on newer CMake versions - if(PLATFORM_INT MATCHES ".*COMBINED") - set(CMAKE_XCODE_ATTRIBUTE_ONLY_ACTIVE_ARCH NO CACHE INTERNAL "") - set(CMAKE_IOS_INSTALL_COMBINED YES CACHE INTERNAL "") - message(STATUS "Will combine built (static) artifacts into FAT lib...") - endif() -else() - # Legacy code path prior to CMake 3.14 - set(CMAKE_SYSTEM_NAME Darwin CACHE INTERNAL "" ${FORCE_CACHE}) -endif() -# Standard settings. -set(CMAKE_SYSTEM_VERSION ${SDK_VERSION} CACHE INTERNAL "") -set(UNIX TRUE CACHE BOOL "") -set(APPLE TRUE CACHE BOOL "") -set(IOS TRUE CACHE BOOL "") -set(CMAKE_AR ar CACHE FILEPATH "" FORCE) -set(CMAKE_RANLIB ranlib CACHE FILEPATH "" FORCE) -set(CMAKE_STRIP strip CACHE FILEPATH "" FORCE) -# Set the architectures for which to build. -set(CMAKE_OSX_ARCHITECTURES ${ARCHS} CACHE STRING "Build architecture for iOS") -# Change the type of target generated for try_compile() so it'll work when cross-compiling -set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY) -# All iOS/Darwin specific settings - some may be redundant. -set(CMAKE_SHARED_LIBRARY_PREFIX "lib") -set(CMAKE_SHARED_LIBRARY_SUFFIX ".dylib") -set(CMAKE_SHARED_MODULE_PREFIX "lib") -set(CMAKE_SHARED_MODULE_SUFFIX ".so") -set(CMAKE_C_COMPILER_ABI ELF) -set(CMAKE_CXX_COMPILER_ABI ELF) -set(CMAKE_C_HAS_ISYSROOT 1) -set(CMAKE_CXX_HAS_ISYSROOT 1) -set(CMAKE_MODULE_EXISTS 1) -set(CMAKE_DL_LIBS "") -set(CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG "-compatibility_version ") -set(CMAKE_C_OSX_CURRENT_VERSION_FLAG "-current_version ") -set(CMAKE_CXX_OSX_COMPATIBILITY_VERSION_FLAG "${CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG}") -set(CMAKE_CXX_OSX_CURRENT_VERSION_FLAG "${CMAKE_C_OSX_CURRENT_VERSION_FLAG}") - -if(ARCHS MATCHES "((^|, )(arm64|arm64e|x86_64))+") - set(CMAKE_C_SIZEOF_DATA_PTR 8) - set(CMAKE_CXX_SIZEOF_DATA_PTR 8) - if(ARCHS MATCHES "((^|, )(arm64|arm64e))+") - set(CMAKE_SYSTEM_PROCESSOR "arm64") - else() - set(CMAKE_SYSTEM_PROCESSOR "x86_64") - endif() - message(STATUS "Using a data_ptr size of 8") -else() - set(CMAKE_C_SIZEOF_DATA_PTR 4) - set(CMAKE_CXX_SIZEOF_DATA_PTR 4) - set(CMAKE_SYSTEM_PROCESSOR "arm") - message(STATUS "Using a data_ptr size of 4") -endif() - -message(STATUS "Building for minimum ${SDK_NAME} version: ${DEPLOYMENT_TARGET}" " (SDK version: ${SDK_VERSION})") -# Note that only Xcode 7+ supports the newer, more specific -# -m${SDK_NAME}-version-min flags; older versions of Xcode use -# -m(ios/ios-simulator)-version-min instead. -if(PLATFORM_INT STREQUAL "OS" OR PLATFORM_INT STREQUAL "OS64") - if(XCODE_VERSION VERSION_LESS 7.0) - set(SDK_NAME_VERSION_FLAGS - "-mios-version-min=${DEPLOYMENT_TARGET}") - else() - # Xcode 7.0+ uses flags we can build directly from SDK_NAME.
- set(SDK_NAME_VERSION_FLAGS - "-m${SDK_NAME}-version-min=${DEPLOYMENT_TARGET}") - endif() -elseif(PLATFORM_INT STREQUAL "TVOS") - set(SDK_NAME_VERSION_FLAGS - "-mtvos-version-min=${DEPLOYMENT_TARGET}") -elseif(PLATFORM_INT STREQUAL "SIMULATOR_TVOS") - set(SDK_NAME_VERSION_FLAGS - "-mtvos-simulator-version-min=${DEPLOYMENT_TARGET}") -elseif(PLATFORM_INT STREQUAL "WATCHOS") - set(SDK_NAME_VERSION_FLAGS - "-mwatchos-version-min=${DEPLOYMENT_TARGET}") -elseif(PLATFORM_INT STREQUAL "SIMULATOR_WATCHOS") - set(SDK_NAME_VERSION_FLAGS - "-mwatchos-simulator-version-min=${DEPLOYMENT_TARGET}") -else() - # SIMULATOR or SIMULATOR64 both use -mios-simulator-version-min. - set(SDK_NAME_VERSION_FLAGS - "-mios-simulator-version-min=${DEPLOYMENT_TARGET}") -endif() -message(STATUS "Version flags set to: ${SDK_NAME_VERSION_FLAGS}") -set(CMAKE_OSX_DEPLOYMENT_TARGET ${DEPLOYMENT_TARGET} CACHE STRING - "Set CMake deployment target" ${FORCE_CACHE}) - -if(ENABLE_BITCODE_INT) - set(BITCODE "-fembed-bitcode") - set(CMAKE_XCODE_ATTRIBUTE_BITCODE_GENERATION_MODE bitcode CACHE INTERNAL "") - message(STATUS "Enabling bitcode support.") -else() - set(BITCODE "") - set(CMAKE_XCODE_ATTRIBUTE_ENABLE_BITCODE NO CACHE INTERNAL "") - message(STATUS "Disabling bitcode support.") -endif() - -if(ENABLE_ARC_INT) - set(FOBJC_ARC "-fobjc-arc") - set(CMAKE_XCODE_ATTRIBUTE_CLANG_ENABLE_OBJC_ARC YES CACHE INTERNAL "") - message(STATUS "Enabling ARC support.") -else() - set(FOBJC_ARC "-fno-objc-arc") - set(CMAKE_XCODE_ATTRIBUTE_CLANG_ENABLE_OBJC_ARC NO CACHE INTERNAL "") - message(STATUS "Disabling ARC support.") -endif() - -if(NOT ENABLE_VISIBILITY_INT) - set(VISIBILITY "-fvisibility=hidden") - set(CMAKE_XCODE_ATTRIBUTE_GCC_SYMBOLS_PRIVATE_EXTERN YES CACHE INTERNAL "") - message(STATUS "Hiding symbols (-fvisibility=hidden).") -else() - set(VISIBILITY "") - set(CMAKE_XCODE_ATTRIBUTE_GCC_SYMBOLS_PRIVATE_EXTERN NO CACHE INTERNAL "") -endif() - -# Check if the Xcode generator is used, since it will handle these flags automagically -if(USED_CMAKE_GENERATOR MATCHES "Xcode") - message(STATUS "Not setting any manual command-line build flags, since Xcode is selected as generator.") -else() - set(CMAKE_C_FLAGS - "${SDK_NAME_VERSION_FLAGS} ${BITCODE} -fobjc-abi-version=2 ${FOBJC_ARC} ${CMAKE_C_FLAGS}") - # Hidden visibility is required for C++ on iOS. - set(CMAKE_CXX_FLAGS - "${SDK_NAME_VERSION_FLAGS} ${BITCODE} ${VISIBILITY} -fvisibility-inlines-hidden -fobjc-abi-version=2 ${FOBJC_ARC} ${CMAKE_CXX_FLAGS}") - set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS} -O0 -g ${CMAKE_CXX_FLAGS_DEBUG}") - set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS} -DNDEBUG -Os -ffast-math ${CMAKE_CXX_FLAGS_MINSIZEREL}") - set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS} -DNDEBUG -O2 -g -ffast-math ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}") - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -DNDEBUG -O3 -ffast-math ${CMAKE_CXX_FLAGS_RELEASE}") - set(CMAKE_C_LINK_FLAGS "${SDK_NAME_VERSION_FLAGS} -Wl,-search_paths_first ${CMAKE_C_LINK_FLAGS}") - set(CMAKE_CXX_LINK_FLAGS "${SDK_NAME_VERSION_FLAGS} -Wl,-search_paths_first ${CMAKE_CXX_LINK_FLAGS}") - - # In order to ensure that the updated compiler flags are used in try_compile() - # tests, we have to forcibly set them in the CMake cache, not merely set them - # in the local scope.
- list(APPEND VARS_TO_FORCE_IN_CACHE - CMAKE_C_FLAGS - CMAKE_CXX_FLAGS - CMAKE_CXX_FLAGS_DEBUG - CMAKE_CXX_FLAGS_RELWITHDEBINFO - CMAKE_CXX_FLAGS_MINSIZEREL - CMAKE_CXX_FLAGS_RELEASE - CMAKE_C_LINK_FLAGS - CMAKE_CXX_LINK_FLAGS) - foreach(VAR_TO_FORCE ${VARS_TO_FORCE_IN_CACHE}) - set(${VAR_TO_FORCE} "${${VAR_TO_FORCE}}" CACHE STRING "") - endforeach() -endif() - -set(CMAKE_PLATFORM_HAS_INSTALLNAME 1) -set(CMAKE_SHARED_LINKER_FLAGS "-rpath @executable_path/Frameworks -rpath @loader_path/Frameworks") -set(CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS "-dynamiclib -Wl,-headerpad_max_install_names") -set(CMAKE_SHARED_MODULE_CREATE_C_FLAGS "-bundle -Wl,-headerpad_max_install_names") -set(CMAKE_SHARED_MODULE_LOADER_C_FLAG "-Wl,-bundle_loader,") -set(CMAKE_SHARED_MODULE_LOADER_CXX_FLAG "-Wl,-bundle_loader,") -set(CMAKE_FIND_LIBRARY_SUFFIXES ".tbd" ".dylib" ".so" ".a") -set(CMAKE_SHARED_LIBRARY_SONAME_C_FLAG "-install_name") - -# Hack: if a new CMake (which uses CMAKE_INSTALL_NAME_TOOL) runs on an old -# build tree (where install_name_tool was hardcoded), where -# CMAKE_INSTALL_NAME_TOOL isn't in the cache, and CMake still didn't fail in -# CMakeFindBinUtils.cmake (because it isn't rerun), hardcode -# CMAKE_INSTALL_NAME_TOOL here to install_name_tool so it behaves as it did -# before, Alex. -if(NOT DEFINED CMAKE_INSTALL_NAME_TOOL) - find_program(CMAKE_INSTALL_NAME_TOOL install_name_tool) -endif(NOT DEFINED CMAKE_INSTALL_NAME_TOOL) - -# Set the find root to the iOS developer roots and to user-defined paths. -set(CMAKE_FIND_ROOT_PATH ${CMAKE_DEVELOPER_ROOT} ${CMAKE_OSX_SYSROOT_INT} - ${CMAKE_PREFIX_PATH} CACHE STRING "Root path that will be prepended to all search paths") -# Default to searching for frameworks first. -set(CMAKE_FIND_FRAMEWORK FIRST) -# Set up the default search directories for frameworks. -set(CMAKE_FRAMEWORK_PATH - ${CMAKE_DEVELOPER_ROOT}/Library/Frameworks - ${CMAKE_DEVELOPER_ROOT}/Library/PrivateFrameworks - ${CMAKE_OSX_SYSROOT_INT}/System/Library/Frameworks - ${CMAKE_FRAMEWORK_PATH} CACHE STRING "Frameworks search paths") - -# By default, search both the specified iOS SDK and the remainder of the host filesystem. -if(NOT CMAKE_FIND_ROOT_PATH_MODE_PROGRAM) - set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM BOTH CACHE STRING "" ${FORCE_CACHE}) -endif() -if(NOT CMAKE_FIND_ROOT_PATH_MODE_LIBRARY) - set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY BOTH CACHE STRING "" ${FORCE_CACHE}) -endif() -if(NOT CMAKE_FIND_ROOT_PATH_MODE_INCLUDE) - set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE BOTH CACHE STRING "" ${FORCE_CACHE}) -endif() -if(NOT CMAKE_FIND_ROOT_PATH_MODE_PACKAGE) - set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE BOTH CACHE STRING "" ${FORCE_CACHE}) -endif() - -# -# Some helper macros below to simplify and beautify the CMakeFile -# - -# This little macro lets you set any Xcode-specific property. -macro(set_xcode_property TARGET XCODE_PROPERTY XCODE_VALUE XCODE_RELVERSION) - set(XCODE_RELVERSION_I "${XCODE_RELVERSION}") - if(XCODE_RELVERSION_I STREQUAL "All") - set_property(TARGET ${TARGET} PROPERTY - XCODE_ATTRIBUTE_${XCODE_PROPERTY} "${XCODE_VALUE}") - else() - set_property(TARGET ${TARGET} PROPERTY - XCODE_ATTRIBUTE_${XCODE_PROPERTY}[variant=${XCODE_RELVERSION_I}] "${XCODE_VALUE}") - endif() -endmacro(set_xcode_property) -# This macro lets you find executable programs on the host system.
-macro(find_host_package) - set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) - set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY NEVER) - set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE NEVER) - set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE NEVER) - set(IOS FALSE) - find_package(${ARGN}) - set(IOS TRUE) - set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM BOTH) - set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY BOTH) - set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE BOTH) - set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE BOTH) -endmacro(find_host_package) diff --git a/cmake/cross_compiling/npu.cmake b/cmake/cross_compiling/npu.cmake deleted file mode 100644 index 863200986c..0000000000 --- a/cmake/cross_compiling/npu.cmake +++ /dev/null @@ -1,90 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -if(NOT LITE_WITH_NPU) - return() -endif() - -if(NOT DEFINED NPU_DDK_ROOT) - set(NPU_DDK_ROOT $ENV{NPU_DDK_ROOT}) - if(NOT NPU_DDK_ROOT) - message(FATAL_ERROR "Must set NPU_DDK_ROOT or the env var NPU_DDK_ROOT when LITE_WITH_NPU=ON") - endif() -endif() - -message(STATUS "NPU_DDK_ROOT: ${NPU_DDK_ROOT}") -find_path(NPU_DDK_INC NAMES HiAiModelManagerService.h - PATHS ${NPU_DDK_ROOT}/include NO_DEFAULT_PATH) -if(NOT NPU_DDK_INC) - message(FATAL_ERROR "Cannot find HiAiModelManagerService.h in ${NPU_DDK_ROOT}/include") -endif() - -include_directories("${NPU_DDK_ROOT}") - -set(NPU_SUB_LIB_PATH "lib64") -if(ARM_TARGET_ARCH_ABI STREQUAL "armv8") - set(NPU_SUB_LIB_PATH "lib64") -endif() - -if(ARM_TARGET_ARCH_ABI STREQUAL "armv7") - set(NPU_SUB_LIB_PATH "lib") -endif() - -find_library(NPU_DDK_HIAI_FILE NAMES hiai - PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH}) - -find_library(NPU_DDK_IR_FILE NAMES hiai_ir - PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH}) - -find_library(NPU_DDK_IR_BUILD_FILE NAMES hiai_ir_build - PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH}) - -find_library(NPU_DDK_PROTO_FILE NAMES protobuf-lite - PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH}) - -if(NOT NPU_DDK_HIAI_FILE) - message(FATAL_ERROR "Cannot find NPU_DDK_HIAI_FILE in ${NPU_DDK_ROOT}") -else() - message(STATUS "Found NPU_DDK HIAI Library: ${NPU_DDK_HIAI_FILE}") - add_library(npu_ddk_hiai SHARED IMPORTED GLOBAL) - set_property(TARGET npu_ddk_hiai PROPERTY IMPORTED_LOCATION ${NPU_DDK_HIAI_FILE}) -endif() - -if(NOT NPU_DDK_IR_FILE) - message(FATAL_ERROR "Cannot find NPU_DDK_IR_FILE in ${NPU_DDK_ROOT}") -else() - message(STATUS "Found NPU_DDK IR Library: ${NPU_DDK_IR_FILE}") - add_library(npu_ddk_ir SHARED IMPORTED GLOBAL) - set_property(TARGET npu_ddk_ir PROPERTY IMPORTED_LOCATION ${NPU_DDK_IR_FILE}) -endif() - -if(NOT NPU_DDK_IR_BUILD_FILE) - message(FATAL_ERROR "Cannot find NPU_DDK_IR_BUILD_FILE in ${NPU_DDK_ROOT}") -else() - message(STATUS "Found NPU_DDK IR_BUILD Library: ${NPU_DDK_IR_BUILD_FILE}") - add_library(npu_ddk_ir_build SHARED IMPORTED GLOBAL) - set_property(TARGET npu_ddk_ir_build PROPERTY IMPORTED_LOCATION ${NPU_DDK_IR_BUILD_FILE}) -endif() - -if(NOT NPU_DDK_PROTO_FILE) - message(FATAL_ERROR "Cannot find NPU_DDK_PROTO_FILE in ${NPU_DDK_ROOT}")
-else() - message(STATUS "Found NPU_DDK Protobuf Library: ${NPU_DDK_PROTO_FILE}") - add_library(npu_ddk_proto SHARED IMPORTED GLOBAL) - set_property(TARGET npu_ddk_proto PROPERTY IMPORTED_LOCATION ${NPU_DDK_PROTO_FILE}) -endif() - -set(npu_ddk_libs npu_ddk_hiai npu_ddk_ir npu_ddk_ir_build npu_ddk_proto CACHE INTERNAL "npu ddk libs") - - diff --git a/cmake/cross_compiling/postproject.cmake b/cmake/cross_compiling/postproject.cmake deleted file mode 100644 index 33254df03c..0000000000 --- a/cmake/cross_compiling/postproject.cmake +++ /dev/null @@ -1,99 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) - return() -endif() - -include(CheckCXXCompilerFlag) - -if(ANDROID) - include(cross_compiling/findar) - - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -llog -fPIC") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -llog -fPIC") -endif() - -if(ARMLINUX) - if(ARMLINUX_ARCH_ABI STREQUAL "armv8") - set(CMAKE_CXX_FLAGS "-march=armv8-a ${CMAKE_CXX_FLAGS}") - set(CMAKE_C_FLAGS "-march=armv8-a ${CMAKE_C_FLAGS}") - message(STATUS "NEON is enabled on arm64-v8a") - endif() - - if(ARMLINUX_ARCH_ABI STREQUAL "armv7") - set(CMAKE_CXX_FLAGS "-march=armv7-a -mfloat-abi=softfp -mfpu=neon-vfpv4 ${CMAKE_CXX_FLAGS}") - set(CMAKE_C_FLAGS "-march=armv7-a -mfloat-abi=softfp -mfpu=neon-vfpv4 ${CMAKE_C_FLAGS}") - message(STATUS "NEON is enabled on arm-v7a with softfp") - endif() - - if(ARMLINUX_ARCH_ABI STREQUAL "armv7hf") - set(CMAKE_CXX_FLAGS "-march=armv7-a -mfloat-abi=hard -mfpu=neon-vfpv4 ${CMAKE_CXX_FLAGS}") - set(CMAKE_C_FLAGS "-march=armv7-a -mfloat-abi=hard -mfpu=neon-vfpv4 ${CMAKE_C_FLAGS}" ) - message(STATUS "NEON is enabled on arm-v7a with hard float") - endif() -endif() - -function(check_linker_flag) - foreach(flag ${ARGN}) - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${flag}") - check_cxx_compiler_flag("" out_var) - if(${out_var}) - set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${flag}") - endif() - endforeach() - set(CMAKE_SHARED_LINKER_FLAGS ${CMAKE_SHARED_LINKER_FLAGS} PARENT_SCOPE) -endfunction() -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") -if (LITE_ON_TINY_PUBLISH) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -Ofast -Os -fno-exceptions -fomit-frame-pointer -fno-asynchronous-unwind-tables -fno-unwind-tables") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -flto -fvisibility=hidden -fvisibility-inlines-hidden -fdata-sections -ffunction-sections") - check_linker_flag(-Wl,--gc-sections) -endif() - -if(LITE_WITH_OPENMP) - find_package(OpenMP REQUIRED) - if(OPENMP_FOUND OR OpenMP_CXX_FOUND) - add_definitions(-DARM_WITH_OMP) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") - message(STATUS "Found OpenMP ${OpenMP_VERSION} ${OpenMP_CXX_VERSION}") - message(STATUS "OpenMP C flags: ${OpenMP_C_FLAGS}") - message(STATUS "OpenMP CXX flags: ${OpenMP_CXX_FLAGS}") - message(STATUS "OpenMP OpenMP_CXX_LIB_NAMES: 
${OpenMP_CXX_LIB_NAMES}") - message(STATUS "OpenMP OpenMP_CXX_LIBRARIES: ${OpenMP_CXX_LIBRARIES}") - else() - message(FATAL_ERROR "Could not found OpenMP!") - endif() -endif() - -# third party cmake args -set(CROSS_COMPILE_CMAKE_ARGS - "-DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME}" - "-DCMAKE_SYSTEM_VERSION=${CMAKE_SYSTEM_VERSION}") - -if(ANDROID) - set(CROSS_COMPILE_CMAKE_ARGS ${CROSS_COMPILE_CMAKE_ARGS} - "-DCMAKE_ANDROID_ARCH_ABI=${CMAKE_ANDROID_ARCH_ABI}" - "-DCMAKE_ANDROID_NDK=${CMAKE_ANDROID_NDK}" - "-DCMAKE_ANDROID_STL_TYPE=${CMAKE_ANDROID_STL_TYPE}" - "-DCMAKE_ANDROID_NDK_TOOLCHAIN_VERSION=${CMAKE_ANDROID_NDK_TOOLCHAIN_VERSION}") -endif() - -if(IOS) - set(CROSS_COMPILE_CMAKE_ARGS ${CROSS_COMPILE_CMAKE_ARGS} - "-DCMAKE_OSX_ARCHITECTURES=${CMAKE_OSX_ARCHITECTURES}" - "-DCMAKE_SYSTEM_PROCESSOR=${CMAKE_SYSTEM_PROCESSOR}" - "-DCMAKE_OSX_SYSROOT=${CMAKE_OSX_SYSROOT}") -endif() diff --git a/cmake/cross_compiling/preproject.cmake b/cmake/cross_compiling/preproject.cmake deleted file mode 100644 index 813d1910fc..0000000000 --- a/cmake/cross_compiling/preproject.cmake +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) - return() -endif() - -cmake_minimum_required(VERSION 3.10) - -# define check function -function(check_input_var VAR_NAME) - set(options "") - set(oneValueArgs "") - set(multiValueArgs DEFAULT LIST) - cmake_parse_arguments(check_input_var "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - - set(var_out "") - if(NOT DEFINED ${VAR_NAME}) - set(var_out ${check_input_var_DEFAULT}) - else() - set(var_out ${${VAR_NAME}}) - endif() - - if(NOT var_out IN_LIST check_input_var_LIST) - message(FATAL_ERROR "${VAR_NAME}:${var_out} must be in one of ${check_input_var_LIST}") - endif() - set(${VAR_NAME} ${var_out} PARENT_SCOPE) -endfunction(check_input_var) - -check_input_var(ARM_TARGET_OS DEFAULT "android" LIST "android" "armlinux" "ios" "ios64") -check_input_var(ARM_TARGET_ARCH_ABI DEFAULT "armv8" LIST "armv8" "armv7" "armv7hf" "arm64-v8a" "armeabi-v7a") -check_input_var(ARM_TARGET_LANG DEFAULT "gcc" LIST "gcc" "clang") -check_input_var(ARM_TARGET_LIB_TYPE DEFAULT "static" LIST "static" "shared") - -include(cross_compiling/armlinux) -include(cross_compiling/android) -include(cross_compiling/ios) -include(cross_compiling/host) - -if(NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Default use Release in android" FORCE) -endif() - -if(NOT THIRD_PARTY_BUILD_TYPE) - set(THIRD_PARTY_BUILD_TYPE "MinSizeRel" CACHE STRING "Default use MinSizeRel in android" FORCE) -endif() - -message(STATUS "Lite ARM Compile ${ARM_TARGET_OS} with ${ARM_TARGET_ARCH_ABI} ${ARM_TARGET_LANG}") diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake deleted file mode 100644 index 1e6f34a621..0000000000 --- a/cmake/cuda.cmake +++ /dev/null @@ -1,228 +0,0 @@ -if(NOT LITE_WITH_CUDA) - return() -endif() - -set(paddle_known_gpu_archs "30 35 50 52 60 61 70") 
-set(paddle_known_gpu_archs7 "30 35 50 52") -set(paddle_known_gpu_archs8 "30 35 50 52 60 61") -set(paddle_known_gpu_archs9 "30 35 50 52 60 61 70") -set(paddle_known_gpu_archs10 "30 35 50 52 60 61 70 75") - -###################################################################################### -# A function for automatic detection of installed GPUs (if autodetection is enabled) -# Usage: -# detect_installed_gpus(out_variable) -function(detect_installed_gpus out_variable) - if(NOT CUDA_gpu_detect_output) - set(cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu) - - file(WRITE ${cufile} "" - "#include <cstdio>\n" - "int main() {\n" - " int count = 0;\n" - " if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n" - " if (count == 0) return -1;\n" - " for (int device = 0; device < count; ++device) {\n" - " cudaDeviceProp prop;\n" - " if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n" - " std::printf(\"%d.%d \", prop.major, prop.minor);\n" - " }\n" - " return 0;\n" - "}\n") - - execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}" "-ccbin=${CUDA_HOST_COMPILER}" - "--run" "${cufile}" - WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/" - RESULT_VARIABLE nvcc_res OUTPUT_VARIABLE nvcc_out - ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) - - if(nvcc_res EQUAL 0) - # only keep the last line of nvcc_out - STRING(REGEX REPLACE ";" "\\\\;" nvcc_out "${nvcc_out}") - STRING(REGEX REPLACE "\n" ";" nvcc_out "${nvcc_out}") - list(GET nvcc_out -1 nvcc_out) - string(REPLACE "2.1" "2.1(2.0)" nvcc_out "${nvcc_out}") - set(CUDA_gpu_detect_output ${nvcc_out} CACHE INTERNAL "Returned GPU architectures from detect_installed_gpus tool" FORCE) - endif() - endif() - - if(NOT CUDA_gpu_detect_output) - message(STATUS "Automatic GPU detection failed. Building for all known architectures.") - set(${out_variable} ${paddle_known_gpu_archs} PARENT_SCOPE) - else() - set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE) - endif() -endfunction() - - -######################################################################## -# Function for selecting GPU arch flags for nvcc based on CUDA_ARCH_NAME -# Usage: -# select_nvcc_arch_flags(out_variable) -function(select_nvcc_arch_flags out_variable) - # List of arch names - set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "All" "Manual") - set(archs_name_default "All") - list(APPEND archs_names "Auto") - - # set CUDA_ARCH_NAME strings (so it will be seen as a drop-down list in cmake-gui) - set(CUDA_ARCH_NAME ${archs_name_default} CACHE STRING "Select target NVIDIA GPU architecture.") - set_property( CACHE CUDA_ARCH_NAME PROPERTY STRINGS "" ${archs_names} ) - mark_as_advanced(CUDA_ARCH_NAME) - - # verify CUDA_ARCH_NAME value - if(NOT ";${archs_names};" MATCHES ";${CUDA_ARCH_NAME};") - string(REPLACE ";" ", " archs_names "${archs_names}") - message(FATAL_ERROR "Only ${archs_names} architecture names are supported.") - endif() - - if(${CUDA_ARCH_NAME} STREQUAL "Manual") - set(CUDA_ARCH_BIN ${paddle_known_gpu_archs} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported") - set(CUDA_ARCH_PTX "50" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for") - mark_as_advanced(CUDA_ARCH_BIN CUDA_ARCH_PTX) - else() - unset(CUDA_ARCH_BIN CACHE) - unset(CUDA_ARCH_PTX CACHE) - endif() - - if(${CUDA_ARCH_NAME} STREQUAL "Kepler") - set(cuda_arch_bin "30 35") - elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell") - set(cuda_arch_bin "50") - elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal") - set(cuda_arch_bin "60 61") -
elseif(${CUDA_ARCH_NAME} STREQUAL "Volta") - set(cuda_arch_bin "70") - elseif(${CUDA_ARCH_NAME} STREQUAL "Turing") - set(cuda_arch_bin "75") - elseif(${CUDA_ARCH_NAME} STREQUAL "All") - set(cuda_arch_bin ${paddle_known_gpu_archs}) - elseif(${CUDA_ARCH_NAME} STREQUAL "Auto") - detect_installed_gpus(cuda_arch_bin) - else() # (${CUDA_ARCH_NAME} STREQUAL "Manual") - set(cuda_arch_bin ${CUDA_ARCH_BIN}) - endif() - - # remove dots and convert to lists - string(REGEX REPLACE "\\." "" cuda_arch_bin "${cuda_arch_bin}") - string(REGEX REPLACE "\\." "" cuda_arch_ptx "${CUDA_ARCH_PTX}") - string(REGEX MATCHALL "[0-9()]+" cuda_arch_bin "${cuda_arch_bin}") - string(REGEX MATCHALL "[0-9]+" cuda_arch_ptx "${cuda_arch_ptx}") - list(REMOVE_DUPLICATES cuda_arch_bin) - list(REMOVE_DUPLICATES cuda_arch_ptx) - - set(nvcc_flags "") - set(nvcc_archs_readable "") - - # Tell NVCC to add binaries for the specified GPUs - foreach(arch ${cuda_arch_bin}) - if(arch MATCHES "([0-9]+)\\(([0-9]+)\\)") - # User explicitly specified PTX for the concrete BIN - list(APPEND nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1}) - list(APPEND nvcc_archs_readable sm_${CMAKE_MATCH_1}) - else() - # User didn't explicitly specify PTX for the concrete BIN, we assume PTX=BIN - list(APPEND nvcc_flags -gencode arch=compute_${arch},code=sm_${arch}) - list(APPEND nvcc_archs_readable sm_${arch}) - endif() - endforeach() - - # Tell NVCC to add PTX intermediate code for the specified architectures - foreach(arch ${cuda_arch_ptx}) - list(APPEND nvcc_flags -gencode arch=compute_${arch},code=compute_${arch}) - list(APPEND nvcc_archs_readable compute_${arch}) - endforeach() - - string(REPLACE ";" " " nvcc_archs_readable "${nvcc_archs_readable}") - set(${out_variable} ${nvcc_flags} PARENT_SCOPE) - set(${out_variable}_readable ${nvcc_archs_readable} PARENT_SCOPE) -endfunction() - -message(STATUS "CUDA detected: " ${CUDA_VERSION}) -if (${CUDA_VERSION} LESS 7.0) - set(paddle_known_gpu_archs ${paddle_known_gpu_archs}) - add_definitions("-DPADDLE_CUDA_BINVER=\"60\"") -elseif (${CUDA_VERSION} LESS 8.0) # CUDA 7.x - set(paddle_known_gpu_archs ${paddle_known_gpu_archs7}) - list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") - list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") - add_definitions("-DPADDLE_CUDA_BINVER=\"70\"") -elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x - set(paddle_known_gpu_archs ${paddle_known_gpu_archs8}) - list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") - list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") - # CUDA 8 may complain that sm_20 is no longer supported. Suppress the - # warning for now. 
- list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets") - add_definitions("-DPADDLE_CUDA_BINVER=\"80\"") -elseif (${CUDA_VERSION} LESS 10.0) # CUDA 9.x - set(paddle_known_gpu_archs ${paddle_known_gpu_archs9}) - list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") - list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") - add_definitions("-DPADDLE_CUDA_BINVER=\"90\"") -elseif (${CUDA_VERSION} LESS 11.0) # CUDA 10.x - set(paddle_known_gpu_archs ${paddle_known_gpu_archs10}) - list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") - list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") - add_definitions("-DPADDLE_CUDA_BINVER=\"100\"") -endif() - -include_directories(${CUDA_INCLUDE_DIRS}) -if(NOT WITH_DSO) - if(WIN32) - set_property(GLOBAL PROPERTY CUDA_MODULES ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY}) - endif(WIN32) -endif(NOT WITH_DSO) - -# setting nvcc arch flags -select_nvcc_arch_flags(NVCC_FLAGS_EXTRA) -list(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA}) -message(STATUS "Added CUDA NVCC flags for: ${NVCC_FLAGS_EXTRA_readable}") - -# Set C++11 support -set(CUDA_PROPAGATE_HOST_FLAGS OFF) - -# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc. -# So, don't set these flags here. -if (NOT WIN32) # windows msvc2015 support c++11 natively. -# -std=c++11 -fPIC not recoginize by msvc, -Xcompiler will be added by cmake. -list(APPEND CUDA_NVCC_FLAGS "-std=c++11") -list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC") -endif(NOT WIN32) - -if(WITH_FAST_MATH) - # Make use of fast math library. https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - list(APPEND CUDA_NVCC_FLAGS "--use_fast_math") -endif() -# in cuda9, suppress cuda warning on eigen -list(APPEND CUDA_NVCC_FLAGS "-w") -# Set :expt-relaxed-constexpr to suppress Eigen warnings -list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr") - -if (NOT WIN32) - if(CMAKE_BUILD_TYPE STREQUAL "Debug") - list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG}) - elseif(CMAKE_BUILD_TYPE STREQUAL "Release") - list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) - elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") - list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}) - elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") - # nvcc 9 does not support -Os. Use Release flags instead - list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) - endif() -else(NOT WIN32) - list(APPEND CUDA_NVCC_FLAGS "-Xcompiler \"/wd 4244 /wd 4267 /wd 4819\"") - list(APPEND CUDA_NVCC_FLAGS "--compiler-options;/bigobj") - if(CMAKE_BUILD_TYPE STREQUAL "Debug") - list(APPEND CUDA_NVCC_FLAGS "-g -G") - # match the cl's _ITERATOR_DEBUG_LEVEL - list(APPEND CUDA_NVCC_FLAGS "-D_DEBUG") - elseif(CMAKE_BUILD_TYPE STREQUAL "Release") - list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG") - else() - message(FATAL "Windows only support Release or Debug build now. 
Please set the Visual Studio build type to Release/Debug, x64 build.") -endif() -endif(NOT WIN32) - -mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD) -mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION) diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake deleted file mode 100644 index 3775d6cc2b..0000000000 --- a/cmake/cudnn.cmake +++ /dev/null @@ -1,99 +0,0 @@ -if(NOT LITE_WITH_CUDA) - return() -endif() - -if(WIN32) - set(CUDNN_ROOT ${CUDA_TOOLKIT_ROOT_DIR}) -else(WIN32) - set(CUDNN_ROOT "/usr" CACHE PATH "CUDNN ROOT") -endif(WIN32) - -find_path(CUDNN_INCLUDE_DIR cudnn.h - PATHS ${CUDNN_ROOT} ${CUDNN_ROOT}/include - $ENV{CUDNN_ROOT} $ENV{CUDNN_ROOT}/include ${CUDA_TOOLKIT_INCLUDE} - NO_DEFAULT_PATH -) - -get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH) - -set(TARGET_ARCH "x86_64") -if(CMAKE_SYSTEM_PROCESSOR) # use the detected processor when CMake reports one - set(TARGET_ARCH ${CMAKE_SYSTEM_PROCESSOR}) -endif() - -list(APPEND CUDNN_CHECK_LIBRARY_DIRS - ${CUDNN_ROOT} - ${CUDNN_ROOT}/lib64 - ${CUDNN_ROOT}/lib - ${CUDNN_ROOT}/lib/${TARGET_ARCH}-linux-gnu - ${CUDNN_ROOT}/local/cuda-${CUDA_VERSION}/targets/${TARGET_ARCH}-linux/lib/ - $ENV{CUDNN_ROOT} - $ENV{CUDNN_ROOT}/lib64 - $ENV{CUDNN_ROOT}/lib - /usr/lib - ${CUDA_TOOLKIT_ROOT_DIR} - ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64 - ) -set(CUDNN_LIB_NAME "libcudnn.so") - -if(WIN32) -# only cuDNN 7 is supported -set(CUDNN_LIB_NAME "cudnn.lib" "cudnn64_7.dll") -endif(WIN32) - -if(APPLE) -set(CUDNN_LIB_NAME "libcudnn.dylib" "libcudnn.so") -endif(APPLE) - -find_library(CUDNN_LIBRARY NAMES ${CUDNN_LIB_NAME} # libcudnn_static.a - PATHS ${CUDNN_CHECK_LIBRARY_DIRS} ${CUDNN_INCLUDE_DIR} ${__libpath_hist} - NO_DEFAULT_PATH - DOC "Path to cuDNN library.") - - -if(CUDNN_INCLUDE_DIR AND CUDNN_LIBRARY) - set(CUDNN_FOUND ON) -else() - set(CUDNN_FOUND OFF) -endif() - -if(CUDNN_FOUND) - file(READ ${CUDNN_INCLUDE_DIR}/cudnn.h CUDNN_VERSION_FILE_CONTENTS) - - get_filename_component(CUDNN_LIB_PATH ${CUDNN_LIBRARY} DIRECTORY) - - string(REGEX MATCH "define CUDNN_VERSION +([0-9]+)" - CUDNN_VERSION "${CUDNN_VERSION_FILE_CONTENTS}") - string(REGEX REPLACE "define CUDNN_VERSION +([0-9]+)" "\\1" - CUDNN_VERSION "${CUDNN_VERSION}") - - if("${CUDNN_VERSION}" STREQUAL "2000") - message(STATUS "Current cuDNN version is v2. ") - else() - string(REGEX MATCH "define CUDNN_MAJOR +([0-9]+)" CUDNN_MAJOR_VERSION - "${CUDNN_VERSION_FILE_CONTENTS}") - string(REGEX REPLACE "define CUDNN_MAJOR +([0-9]+)" "\\1" - CUDNN_MAJOR_VERSION "${CUDNN_MAJOR_VERSION}") - string(REGEX MATCH "define CUDNN_MINOR +([0-9]+)" CUDNN_MINOR_VERSION - "${CUDNN_VERSION_FILE_CONTENTS}") - string(REGEX REPLACE "define CUDNN_MINOR +([0-9]+)" "\\1" - CUDNN_MINOR_VERSION "${CUDNN_MINOR_VERSION}") - string(REGEX MATCH "define CUDNN_PATCHLEVEL +([0-9]+)" - CUDNN_PATCHLEVEL_VERSION "${CUDNN_VERSION_FILE_CONTENTS}") - string(REGEX REPLACE "define CUDNN_PATCHLEVEL +([0-9]+)" "\\1" - CUDNN_PATCHLEVEL_VERSION "${CUDNN_PATCHLEVEL_VERSION}") - - if(NOT CUDNN_MAJOR_VERSION) - set(CUDNN_VERSION "???") - else() - add_definitions("-DPADDLE_CUDNN_BINVER=\"${CUDNN_MAJOR_VERSION}\"") - math(EXPR CUDNN_VERSION - "${CUDNN_MAJOR_VERSION} * 1000 + - ${CUDNN_MINOR_VERSION} * 100 + ${CUDNN_PATCHLEVEL_VERSION}") - endif() - - message(STATUS "Current cuDNN header is ${CUDNN_INCLUDE_DIR}/cudnn.h. " - "Current cuDNN version is v${CUDNN_MAJOR_VERSION}.
") - - endif() -endif() diff --git a/cmake/cupti.cmake b/cmake/cupti.cmake deleted file mode 100644 index 72ed0f1e58..0000000000 --- a/cmake/cupti.cmake +++ /dev/null @@ -1,41 +0,0 @@ -if(NOT WITH_GPU) - return() -endif() - - -set(CUPTI_ROOT "/usr" CACHE PATH "CUPTI ROOT") -find_path(CUPTI_INCLUDE_DIR cupti.h - PATHS ${CUPTI_ROOT} ${CUPTI_ROOT}/include - $ENV{CUPTI_ROOT} $ENV{CUPTI_ROOT}/include - ${CUDA_TOOLKIT_ROOT_DIR}/extras/CUPTI/include - NO_DEFAULT_PATH - ) - -get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH) - -set(TARGET_ARCH "x86_64") -if(NOT ${CMAKE_SYSTEM_PROCESSOR}) - set(TARGET_ARCH ${CMAKE_SYSTEM_PROCESSOR}) -endif() - -list(APPEND CUPTI_CHECK_LIBRARY_DIRS - ${CUPTI_ROOT} - ${CUPTI_ROOT}/lib64 - ${CUPTI_ROOT}/lib - ${CUPTI_ROOT}/lib/${TARGET_ARCH}-linux-gnu - $ENV{CUPTI_ROOT} - $ENV{CUPTI_ROOT}/lib64 - $ENV{CUPTI_ROOT}/lib - /usr/lib - ${CUDA_TOOLKIT_ROOT_DIR}/extras/CUPTI/lib64) -find_library(CUPTI_LIBRARY NAMES libcupti.so libcupti.dylib # libcupti_static.a - PATHS ${CUPTI_CHECK_LIBRARY_DIRS} ${CUPTI_INCLUDE_DIR} ${__libpath_hist} - NO_DEFAULT_PATH - DOC "Path to cuPTI library.") - -get_filename_component(CUPTI_LIBRARY_PATH ${CUPTI_LIBRARY} DIRECTORY) -if(CUPTI_INCLUDE_DIR AND CUPTI_LIBRARY) - set(CUPTI_FOUND ON) -else() - set(CUPTI_FOUND OFF) -endif() diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake deleted file mode 100644 index bd0d117a63..0000000000 --- a/cmake/external/eigen.cmake +++ /dev/null @@ -1,54 +0,0 @@ -INCLUDE(ExternalProject) - -SET(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3) -SET(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR}/src/extern_eigen3) -INCLUDE_DIRECTORIES(${EIGEN_INCLUDE_DIR}) -if(NOT WITH_FAST_MATH) - # EIGEN_FAST_MATH: https://eigen.tuxfamily.org/dox/TopicPreprocessorDirectives.html - # enables some optimizations which might affect the accuracy of the result. - # This currently enables the SSE vectorization of sin() and cos(), - # and speedups sqrt() for single precision. - # Defined to 1 by default. Define it to 0 to disable. - add_definitions(-DEIGEN_FAST_MATH=0) -endif() - -if(WITH_AMD_GPU) - ExternalProject_Add( - extern_eigen3 - ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/sabreshao/hipeigen.git" - GIT_TAG 7cb2b6e5a4b4a1efe658abb215cd866c6fb2275e - PREFIX ${EIGEN_SOURCE_DIR} - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" - TEST_COMMAND "" - ) -else() - ExternalProject_Add( - extern_eigen3 - ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/eigenteam/eigen-git-mirror" - # eigen on cuda9.1 missing header of math_funtions.hpp - # https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen - GIT_TAG 917060c364181f33a735dc023818d5a54f60e54c - PREFIX ${EIGEN_SOURCE_DIR} - DOWNLOAD_NAME "eigen" - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" - TEST_COMMAND "" - ) -endif() - -if (${CMAKE_VERSION} VERSION_LESS "3.3.0") - set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/eigen3_dummy.c) - file(WRITE ${dummyfile} "const char *dummy_eigen3 = \"${dummyfile}\";") - add_library(eigen3 STATIC ${dummyfile}) -else() - add_library(eigen3 INTERFACE) -endif() - -add_dependencies(eigen3 extern_eigen3) diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake deleted file mode 100644 index 44ede96171..0000000000 --- a/cmake/external/gflags.cmake +++ /dev/null @@ -1,75 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -INCLUDE(ExternalProject) - -SET(GFLAGS_SOURCES_DIR ${CMAKE_SOURCE_DIR}/third-party/gflags) -SET(GFLAGS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gflags) -SET(GFLAGS_INCLUDE_DIR "${GFLAGS_INSTALL_DIR}/include" CACHE PATH "gflags include directory." FORCE) -IF(WIN32) - set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/libgflags.lib" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE) -ELSE(WIN32) - set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/libgflags.a" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE) -ENDIF(WIN32) - -INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR}) - -SET(OPTIONAL_ARGS "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" - "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" - "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" - "-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}" - "-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}" - "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}" - "-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}" - "-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}") - -ExternalProject_Add( - extern_gflags - ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "" - GIT_TAG 77592648e3f3be87d6c7123eb81cbad75f9aef5a - SOURCE_DIR ${GFLAGS_SOURCES_DIR} - PREFIX ${GFLAGS_INCLUDE_DIR} - UPDATE_COMMAND "" - CMAKE_ARGS -DBUILD_STATIC_LIBS=ON - -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR} - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DBUILD_TESTING=OFF - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - ${CROSS_COMPILE_CMAKE_ARGS} - ${OPTIONAL_ARGS} - ${EXTERNAL_OPTIONAL_ARGS} - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR} - -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} -) -IF(WIN32) - IF(NOT EXISTS "${GFLAGS_INSTALL_DIR}/lib/libgflags.lib") - add_custom_command(TARGET extern_gflags POST_BUILD - COMMAND cmake -E copy ${GFLAGS_INSTALL_DIR}/lib/gflags_static.lib ${GFLAGS_INSTALL_DIR}/lib/libgflags.lib - ) - ENDIF() -ENDIF(WIN32) -ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET gflags PROPERTY IMPORTED_LOCATION ${GFLAGS_LIBRARIES}) -ADD_DEPENDENCIES(gflags extern_gflags) - -# On Windows (including MinGW), the Shlwapi library is used by gflags if available. -if (WIN32) - include(CheckIncludeFileCXX) - check_include_file_cxx("shlwapi.h" HAVE_SHLWAPI) - if (HAVE_SHLWAPI) - set_property(GLOBAL PROPERTY OS_DEPENDENCY_MODULES shlwapi.lib) - endif(HAVE_SHLWAPI) -endif (WIN32) diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake deleted file mode 100644 index 970020d784..0000000000 --- a/cmake/external/glog.cmake +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -INCLUDE(ExternalProject) - -SET(GLOG_SOURCES_DIR ${THIRD_PARTY_PATH}/glog) -SET(GLOG_INSTALL_DIR ${THIRD_PARTY_PATH}/install/glog) -SET(GLOG_INCLUDE_DIR "${GLOG_INSTALL_DIR}/include" CACHE PATH "glog include directory." FORCE) - -IF(WIN32) - SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.lib" CACHE FILEPATH "glog library." FORCE) - SET(GLOG_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4267 /wd4530") -ELSE(WIN32) - SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.a" CACHE FILEPATH "glog library." FORCE) - SET(GLOG_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) -ENDIF(WIN32) - -INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR}) - -SET(GLOG_REPOSITORY "https://github.com/google/glog.git") -SET(GLOG_TAG "v0.3.5") - -SET(OPTIONAL_ARGS "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" - "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" - "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" - "-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}" - "-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}" - "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}" - "-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}" - "-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}") - -ExternalProject_Add( - extern_glog - ${EXTERNAL_PROJECT_LOG_ARGS} - DEPENDS gflags - GIT_REPOSITORY ${GLOG_REPOSITORY} - GIT_TAG ${GLOG_TAG} - PREFIX ${GLOG_SOURCES_DIR} - UPDATE_COMMAND "" - CMAKE_ARGS ${CROSS_COMPILE_CMAKE_ARGS} - ${OPTIONAL_ARGS} - -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR} - -DCMAKE_INSTALL_LIBDIR=${GLOG_INSTALL_DIR}/lib - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DWITH_GFLAGS=ON - -Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags - -DBUILD_TESTING=OFF - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - ${EXTERNAL_OPTIONAL_ARGS} - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR} - -DCMAKE_INSTALL_LIBDIR:PATH=${GLOG_INSTALL_DIR}/lib - -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} -) -IF(WIN32) - IF(NOT EXISTS "${GLOG_INSTALL_DIR}/lib/libglog.lib") - add_custom_command(TARGET extern_glog POST_BUILD - COMMAND cmake -E copy ${GLOG_INSTALL_DIR}/lib/glog.lib ${GLOG_INSTALL_DIR}/lib/libglog.lib - ) - ENDIF() -ENDIF(WIN32) - -ADD_LIBRARY(glog STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET glog PROPERTY IMPORTED_LOCATION ${GLOG_LIBRARIES}) -ADD_DEPENDENCIES(glog extern_glog gflags) -LINK_LIBRARIES(glog gflags) diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake deleted file mode 100644 index 0df39138dd..0000000000 --- a/cmake/external/gtest.cmake +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# gtest is only used when WITH_TESTING=ON -IF(WITH_TESTING) - ENABLE_TESTING() - - INCLUDE(ExternalProject) - - SET(GTEST_SOURCES_DIR ${CMAKE_SOURCE_DIR}/third-party/googletest) - SET(GTEST_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gtest) - SET(GTEST_INCLUDE_DIR "${GTEST_INSTALL_DIR}/include" CACHE PATH "gtest include directory." FORCE) - - INCLUDE_DIRECTORIES(${GTEST_INCLUDE_DIR}) - - IF(WIN32) - set(GTEST_LIBRARIES - "${GTEST_INSTALL_DIR}/lib/gtest.lib" CACHE FILEPATH "gtest libraries." FORCE) - set(GTEST_MAIN_LIBRARIES - "${GTEST_INSTALL_DIR}/lib/gtest_main.lib" CACHE FILEPATH "gtest main libraries." FORCE) - ELSE(WIN32) - set(GTEST_LIBRARIES - "${GTEST_INSTALL_DIR}/lib/libgtest.a" CACHE FILEPATH "gtest libraries." FORCE) - set(GTEST_MAIN_LIBRARIES - "${GTEST_INSTALL_DIR}/lib/libgtest_main.a" CACHE FILEPATH "gtest main libraries." FORCE) - ENDIF(WIN32) - - IF(WITH_MKLML) - # wait for the MKLML download to complete - SET(GTEST_DEPENDS ${MKLML_PROJECT}) - ENDIF() - - SET(OPTIONAL_ARGS "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" - "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" - "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" - "-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}" - "-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}" - "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}" - "-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}" - "-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}") - - ExternalProject_Add( - extern_gtest - ${EXTERNAL_PROJECT_LOG_ARGS} - DEPENDS ${GTEST_DEPENDS} - GIT_REPOSITORY "" - SOURCE_DIR ${GTEST_SOURCES_DIR} - GIT_TAG "release-1.8.0" - PREFIX ${GTEST_INSTALL_DIR} - UPDATE_COMMAND "" - CMAKE_ARGS ${CROSS_COMPILE_CMAKE_ARGS} - ${OPTIONAL_ARGS} - -DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR} - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DBUILD_GMOCK=ON - -Dgtest_disable_pthreads=ON - -Dgtest_force_shared_crt=ON - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - ${EXTERNAL_OPTIONAL_ARGS} - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR} - -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} - ) - - ADD_LIBRARY(gtest STATIC IMPORTED GLOBAL) - SET_PROPERTY(TARGET gtest PROPERTY IMPORTED_LOCATION ${GTEST_LIBRARIES}) - ADD_DEPENDENCIES(gtest extern_gtest) - - ADD_LIBRARY(gtest_main STATIC IMPORTED GLOBAL) - SET_PROPERTY(TARGET gtest_main PROPERTY IMPORTED_LOCATION ${GTEST_MAIN_LIBRARIES}) - ADD_DEPENDENCIES(gtest_main extern_gtest) - -ENDIF() diff --git a/cmake/external/libxsmm.cmake b/cmake/external/libxsmm.cmake deleted file mode 100644 index 69cdba7c59..0000000000 --- a/cmake/external/libxsmm.cmake +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.
-# - -OPTION(WITH_LIBXSMM "Compile with libxsmm" OFF) - -IF(NOT WITH_LIBXSMM) - return() -ENDIF() - -IF(WIN32 OR APPLE) - MESSAGE(WARNING "Windows and Mac are not supported with libxsmm in Paddle yet.") - SET(WITH_LIBXSMM OFF CACHE STRING "Disable LIBXSMM" FORCE) - return() -ENDIF() - -INCLUDE (ExternalProject) - -SET(LIBXSMM_SOURCES_DIR ${THIRD_PARTY_PATH}/libxsmm) -SET(LIBXSMM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/libxsmm) -SET(LIBXSMM_INCLUDE_DIR "${LIBXSMM_INSTALL_DIR}/include" CACHE PATH "LIBXSMM include directory." FORCE) -SET(LIBXSMM_LIBRARY_DIR "${LIBXSMM_INSTALL_DIR}/lib" CACHE PATH "LIBXSMM library directory." FORCE) -SET(LIBXSMM_LIBS "${LIBXSMM_LIBRARY_DIR}/libxsmm.a" - "${LIBXSMM_LIBRARY_DIR}/libxsmmnoblas.a") - -ExternalProject_Add( - extern_libxsmm - GIT_REPOSITORY "https://github.com/hfp/libxsmm.git" - GIT_TAG "7cc03b5b342fdbc6b6d990b190671c5dbb8489a2" - PREFIX ${LIBXSMM_SOURCES_DIR} - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_IN_SOURCE 1 - BUILD_COMMAND $(MAKE) --silent PREFIX=${LIBXSMM_INSTALL_DIR} CXX=g++ CC=gcc WARP=0 install - INSTALL_COMMAND "" -) -ADD_LIBRARY(libxsmm STATIC IMPORTED GLOBAL) -# NOTE: the second SET_PROPERTY below overwrites the first, so the imported -# target effectively points at libxsmmnoblas.a only. -SET_PROPERTY(TARGET libxsmm PROPERTY IMPORTED_LOCATION "${LIBXSMM_LIBRARY_DIR}/libxsmm.a") -SET_PROPERTY(TARGET libxsmm PROPERTY IMPORTED_LOCATION "${LIBXSMM_LIBRARY_DIR}/libxsmmnoblas.a") - -MESSAGE(STATUS "Libxsmm library: ${LIBXSMM_LIBS}") -include_directories(${LIBXSMM_INCLUDE_DIR}) -ADD_DEFINITIONS(-DPADDLE_WITH_LIBXSMM) -ADD_DEPENDENCIES(libxsmm extern_libxsmm) diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake deleted file mode 100644 index b1e437a900..0000000000 --- a/cmake/external/mkldnn.cmake +++ /dev/null @@ -1,120 +0,0 @@ -# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -IF(NOT ${WITH_MKLDNN}) - return() -ENDIF(NOT ${WITH_MKLDNN}) - -INCLUDE(ExternalProject) - -SET(MKLDNN_PROJECT "extern_mkldnn") -SET(MKLDNN_SOURCES_DIR ${THIRD_PARTY_PATH}/mkldnn) -SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn) -SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE) - -IF(APPLE) - MESSAGE(WARNING - "Mac is not supported with MKLDNN in Paddle yet. " - "Force WITH_MKLDNN=OFF") - SET(WITH_MKLDNN OFF CACHE STRING "Disable MKLDNN in MacOS" FORCE) - return() -ENDIF() - -# Introduce variables: -# * CMAKE_INSTALL_LIBDIR -INCLUDE(GNUInstallDirs) -SET(LIBDIR "lib") -if(CMAKE_INSTALL_LIBDIR MATCHES ".*lib64$") - SET(LIBDIR "lib64") -endif() - -MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/${LIBDIR} to runtime path") -SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) -SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/${LIBDIR}") - -INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR}) # For MKLDNN code to include internal headers.
-
-IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
-  SET(MKLDNN_DEPENDS ${MKLML_PROJECT})
-  MESSAGE(STATUS "Build MKLDNN with MKLML ${MKLML_ROOT}")
-ELSE()
-  MESSAGE(FATAL_ERROR "MKLML must be enabled when building MKLDNN")
-ENDIF()
-
-IF(NOT WIN32)
-  SET(MKLDNN_FLAG "-Wno-error=strict-overflow -Wno-error=unused-result -Wno-error=array-bounds")
-  SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value")
-  SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}")
-  SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}")
-ELSE()
-  SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} /EHsc")
-ENDIF(NOT WIN32)
-
-ExternalProject_Add(
-    ${MKLDNN_PROJECT}
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    DEPENDS ${MKLDNN_DEPENDS}
-    GIT_REPOSITORY "https://github.com/intel/mkl-dnn.git"
-    GIT_TAG "863ff6e7042cec7d2e29897fe9f0872e0888b0fc"
-    PREFIX ${MKLDNN_SOURCES_DIR}
-    UPDATE_COMMAND ""
-    CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-    CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-    CMAKE_ARGS -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-    CMAKE_ARGS -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-    CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-    CMAKE_ARGS -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-    CMAKE_ARGS -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-    CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
-    CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
-    CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-    CMAKE_ARGS -DMKLROOT=${MKLML_ROOT}
-    CMAKE_ARGS -DCMAKE_C_FLAGS=${MKLDNN_CFLAG}
-    CMAKE_ARGS -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG}
-    CMAKE_ARGS -DWITH_TEST=OFF -DWITH_EXAMPLE=OFF
-    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR}
-                     -DMKLROOT:PATH=${MKLML_ROOT}
-)
-if(WIN32)
-  SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/${LIBDIR}/mkldnn.lib" CACHE FILEPATH "mkldnn library." FORCE)
-else(WIN32)
-  SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/${LIBDIR}/libmkldnn.so" CACHE FILEPATH "mkldnn library." FORCE)
-endif(WIN32)
-
-ADD_LIBRARY(shared_mkldnn SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET shared_mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB})
-ADD_DEPENDENCIES(shared_mkldnn ${MKLDNN_PROJECT})
-MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}")
-add_definitions(-DPADDLE_WITH_MKLDNN)
-
-# generate a static dummy target to track mkldnn dependencies
-# for cc_library(xxx SRCS xxx.c DEPS mkldnn)
-SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/mkldnn_dummy.c)
-FILE(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
-ADD_LIBRARY(mkldnn STATIC ${dummyfile})
-TARGET_LINK_LIBRARIES(mkldnn ${MKLDNN_LIB} ${MKLML_LIB} ${MKLML_IOMP_LIB})
-ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT})
-
-# Copy the real .so.0 library to the install dir so that it can be shipped
-# directly in the wheel or the C API package.
-if(WIN32)
-  SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/bin/mkldnn.dll)
-else(WIN32)
-  SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0)
-  ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_SHARED_LIB}
-    COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB}
-    DEPENDS mkldnn shared_mkldnn)
-endif(WIN32)
-ADD_CUSTOM_TARGET(mkldnn_shared_lib ALL DEPENDS ${MKLDNN_SHARED_LIB})
-ADD_DEPENDENCIES(mkldnn_shared_lib ${MKLDNN_PROJECT} mkldnn)
diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake
deleted file mode 100644
index 142fce816d..0000000000
--- a/cmake/external/mklml.cmake
+++ /dev/null
@@ -1,77 +0,0 @@
-# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-IF(NOT ${WITH_MKLML})
-  return()
-ENDIF(NOT ${WITH_MKLML})
-
-IF(APPLE)
-  MESSAGE(WARNING "Mac is not supported with MKLML in Paddle yet. Force WITH_MKLML=OFF.")
-  SET(WITH_MKLML OFF CACHE STRING "Disable MKLML package in MacOS" FORCE)
-  return()
-ENDIF()
-
-INCLUDE(ExternalProject)
-SET(MKLML_DST_DIR "mklml")
-SET(MKLML_INSTALL_ROOT "${THIRD_PARTY_PATH}/install")
-SET(MKLML_INSTALL_DIR ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR})
-SET(MKLML_ROOT ${MKLML_INSTALL_DIR})
-SET(MKLML_INC_DIR ${MKLML_ROOT}/include)
-SET(MKLML_LIB_DIR ${MKLML_ROOT}/lib)
-SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib")
-
-SET(TIME_VERSION "2019.0.1.20181227")
-IF(WIN32)
-  SET(MKLML_VER "mklml_win_${TIME_VERSION}" CACHE STRING "" FORCE)
-  SET(MKLML_URL "https://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE)
-  SET(MKLML_LIB ${MKLML_LIB_DIR}/mklml.lib)
-  SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.lib)
-  SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll)
-  SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.dll)
-ELSE()
-  # TODO(intel-huying):
-  # The Erf function is enabled in the mklml library temporarily;
-  # it will be replaced by the official version later.
-  SET(MKLML_VER "Glibc225_vsErf_mklml_lnx_${TIME_VERSION}" CACHE STRING "" FORCE)
-  SET(MKLML_URL "http://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
-  SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so)
-  SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so)
-  SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/libmklml_intel.so)
-  SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so)
-ENDIF()
-
-SET(MKLML_PROJECT "extern_mklml")
-MESSAGE(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}")
-SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml")
-SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
-
-ExternalProject_Add(
-    ${MKLML_PROJECT}
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    PREFIX ${MKLML_SOURCE_DIR}
-    URL ${MKLML_URL}
-    DOWNLOAD_DIR ${MKLML_DOWNLOAD_DIR}
-    DOWNLOAD_NO_PROGRESS 1
-    CONFIGURE_COMMAND ""
-    BUILD_COMMAND ""
-    UPDATE_COMMAND ""
-    INSTALL_COMMAND
-      ${CMAKE_COMMAND} -E copy_directory ${MKLML_DOWNLOAD_DIR}/include ${MKLML_INC_DIR} &&
-      ${CMAKE_COMMAND} -E copy_directory ${MKLML_DOWNLOAD_DIR}/lib ${MKLML_LIB_DIR}
-)
-
-INCLUDE_DIRECTORIES(${MKLML_INC_DIR})
-
-ADD_LIBRARY(mklml SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET mklml PROPERTY IMPORTED_LOCATION ${MKLML_LIB})
-ADD_DEPENDENCIES(mklml ${MKLML_PROJECT})
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
deleted file mode 100644
index d8a4a0be6f..0000000000
--- a/cmake/external/openblas.cmake
+++ /dev/null
@@ -1,93 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-INCLUDE(cblas)
-
-IF(NOT ${CBLAS_FOUND})
-  INCLUDE(ExternalProject)
-
-  SET(CBLAS_SOURCES_DIR ${THIRD_PARTY_PATH}/openblas)
-  SET(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas)
-  SET(CBLAS_INC_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory." FORCE)
-
-  SET(CBLAS_LIBRARIES
-      "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
-      CACHE FILEPATH "openblas library." FORCE)
-
-  ADD_DEFINITIONS(-DPADDLE_USE_OPENBLAS)
-
-  IF (WIN32)
-    SET(CBLAS_FOUND true)
-    MESSAGE(WARNING "On Windows, openblas only supports the MSVC build; please build it manually and put it at " ${CBLAS_INSTALL_DIR})
-  ENDIF(WIN32)
-
-  IF (NOT WIN32)
-    SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable")
-    SET(OPENBLAS_COMMIT "v0.2.20")
-
-    IF(APPLE)
-      SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}")
-    ENDIF()
-    SET(OPTIONAL_ARGS "")
-    IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$")
-      SET(OPTIONAL_ARGS DYNAMIC_ARCH=1 NUM_THREADS=64)
-    ENDIF()
-
-    SET(COMMON_ARGS CC=${OPENBLAS_CC} NO_SHARED=1 NO_LAPACK=1 libs)
-    ExternalProject_Add(
-        extern_openblas
-        ${EXTERNAL_PROJECT_LOG_ARGS}
-        GIT_REPOSITORY https://github.com/xianyi/OpenBLAS.git
-        GIT_TAG ${OPENBLAS_COMMIT}
-        PREFIX ${CBLAS_SOURCES_DIR}
-        INSTALL_DIR ${CBLAS_INSTALL_DIR}
-        BUILD_IN_SOURCE 1
-        BUILD_COMMAND ${CMAKE_MAKE_PROGRAM} ${COMMON_ARGS} ${OPTIONAL_ARGS}
-        INSTALL_COMMAND ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 NO_LAPACK=1 PREFIX=<INSTALL_DIR>
-                        && rm -r ${CBLAS_INSTALL_DIR}/lib/cmake ${CBLAS_INSTALL_DIR}/lib/pkgconfig
-        UPDATE_COMMAND ""
-        CONFIGURE_COMMAND ""
-    )
-  ENDIF(NOT WIN32)
-  SET(CBLAS_PROVIDER openblas)
-ENDIF(NOT ${CBLAS_FOUND})
-
-MESSAGE(STATUS "BLAS library: ${CBLAS_LIBRARIES}")
-MESSAGE(STATUS "BLAS Include: ${CBLAS_INC_DIR}")
-INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
-
-# FIXME(gangliao): generate cblas target to track all high performance
-# linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas)
-SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cblas_dummy.c)
-FILE(WRITE ${dummyfile} "const char *dummy_cblas = \"${dummyfile}\";")
-ADD_LIBRARY(cblas STATIC ${dummyfile})
-
-IF("${CBLAS_PROVIDER}" STREQUAL "MKLML")
-  TARGET_LINK_LIBRARIES(cblas dynload_mklml)
-ELSE()
-  TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES})
-ENDIF("${CBLAS_PROVIDER}" STREQUAL "MKLML")
-
-IF(WITH_LIBXSMM)
-  TARGET_LINK_LIBRARIES(cblas ${LIBXSMM_LIBS})
-  ADD_DEPENDENCIES(cblas extern_libxsmm)
-ENDIF()
-
-IF(NOT ${CBLAS_FOUND})
-  ADD_DEPENDENCIES(cblas extern_openblas)
-ELSE()
-  IF("${CBLAS_PROVIDER}" STREQUAL "MKLML")
-    ADD_DEPENDENCIES(cblas mklml)
-  ENDIF()
-ENDIF(NOT ${CBLAS_FOUND})
diff --git a/cmake/external/opencl-clhpp.cmake b/cmake/external/opencl-clhpp.cmake
deleted file mode 100644
index ea724860d9..0000000000
--- a/cmake/external/opencl-clhpp.cmake
+++ /dev/null
@@ -1,36 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-INCLUDE(ExternalProject)
-
-SET(OPENCL_CLHPP_SRCS_DIR ${THIRD_PARTY_PATH}/opencl-clhpp)
-SET(OPENCL_CLHPP_INSTALL_DIR ${THIRD_PARTY_PATH}/install/opencl-clhpp)
-SET(OPENCL_CLHPP_INCLUDE_DIR "${OPENCL_CLHPP_INSTALL_DIR}" CACHE PATH "opencl-clhpp include directory." FORCE)
-
-INCLUDE_DIRECTORIES(${OPENCL_CLHPP_INCLUDE_DIR})
-
-ExternalProject_Add(
-    opencl_clhpp
-    GIT_REPOSITORY "https://github.com/KhronosGroup/OpenCL-CLHPP.git"
-    GIT_TAG "v2.0.10"
-    PREFIX "${OPENCL_CLHPP_SRCS_DIR}"
-    CMAKE_ARGS -DBUILD_DOCS=OFF
-               -DBUILD_EXAMPLES=OFF
-               -DBUILD_TESTS=OFF
-               -DCMAKE_INSTALL_PREFIX=${OPENCL_CLHPP_INSTALL_DIR}
-    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${OPENCL_CLHPP_INSTALL_DIR}
-                     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-)
-
-ADD_DEPENDENCIES(opencl_clhpp opencl_headers)
diff --git a/cmake/external/opencl-headers.cmake b/cmake/external/opencl-headers.cmake
deleted file mode 100644
index 68c9c5251c..0000000000
--- a/cmake/external/opencl-headers.cmake
+++ /dev/null
@@ -1,33 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-INCLUDE(ExternalProject)
-
-SET(OPENCL_HEADERS_SRCS_DIR ${THIRD_PARTY_PATH}/opencl-headers)
-SET(OPENCL_HEADERS_INCLUDE_DIR "${OPENCL_HEADERS_SRCS_DIR}/src/opencl_headers" CACHE PATH "opencl-headers include directory." FORCE)
-
-INCLUDE_DIRECTORIES(${OPENCL_HEADERS_INCLUDE_DIR})
-
-ExternalProject_Add(
-    opencl_headers
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    GIT_REPOSITORY "https://github.com/KhronosGroup/OpenCL-Headers.git"
-    GIT_TAG "c5a4bbeabb10d8ed3d1c651b93aa31737bc473dd"
-    PREFIX ${OPENCL_HEADERS_SRCS_DIR}
-    DOWNLOAD_NAME "OpenCL-Headers"
-    CONFIGURE_COMMAND ""
-    BUILD_COMMAND ""
-    INSTALL_COMMAND ""
-    TEST_COMMAND ""
-)
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
deleted file mode 100644
index 2a88cf0321..0000000000
--- a/cmake/external/protobuf.cmake
+++ /dev/null
@@ -1,308 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
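-#
-# This module prefers a pre-built protobuf when PROTOBUF_ROOT is set and only
-# falls back to building protobuf from source. Usage sketch: pass
-# -DPROTOBUF_ROOT=/opt/protobuf (a hypothetical path) on the configure
-# command line and the find_* calls below will pick it up.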
-
-INCLUDE(ExternalProject)
-# Always invoke `FIND_PACKAGE(Protobuf)` for importing function protobuf_generate_cpp
-IF(NOT WIN32)
-FIND_PACKAGE(Protobuf QUIET)
-ENDIF(NOT WIN32)
-macro(UNSET_VAR VAR_NAME)
-  UNSET(${VAR_NAME} CACHE)
-  UNSET(${VAR_NAME})
-endmacro()
-
-UNSET_VAR(PROTOBUF_INCLUDE_DIR)
-UNSET_VAR(PROTOBUF_FOUND)
-UNSET_VAR(PROTOBUF_PROTOC_EXECUTABLE)
-UNSET_VAR(PROTOBUF_PROTOC_LIBRARY)
-UNSET_VAR(PROTOBUF_LITE_LIBRARY)
-UNSET_VAR(PROTOBUF_LIBRARY)
-UNSET_VAR(Protobuf_PROTOC_EXECUTABLE)
-function(protobuf_generate_python SRCS)
-  # shameless copy from https://github.com/Kitware/CMake/blob/master/Modules/FindProtobuf.cmake
-  if(NOT ARGN)
-    message(SEND_ERROR "Error: PROTOBUF_GENERATE_PYTHON() called without any proto files")
-    return()
-  endif()
-
-  if(PROTOBUF_GENERATE_CPP_APPEND_PATH)
-    # Create an include path for each file specified
-    foreach(FIL ${ARGN})
-      get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
-      get_filename_component(ABS_PATH ${ABS_FIL} PATH)
-      list(FIND _protobuf_include_path ${ABS_PATH} _contains_already)
-      if(${_contains_already} EQUAL -1)
-        list(APPEND _protobuf_include_path -I ${ABS_PATH})
-      endif()
-    endforeach()
-  else()
-    set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR})
-  endif()
-  if(DEFINED PROTOBUF_IMPORT_DIRS AND NOT DEFINED Protobuf_IMPORT_DIRS)
-    set(Protobuf_IMPORT_DIRS "${PROTOBUF_IMPORT_DIRS}")
-  endif()
-
-  if(DEFINED Protobuf_IMPORT_DIRS)
-    foreach(DIR ${Protobuf_IMPORT_DIRS})
-      get_filename_component(ABS_PATH ${DIR} ABSOLUTE)
-      list(FIND _protobuf_include_path ${ABS_PATH} _contains_already)
-      if(${_contains_already} EQUAL -1)
-        list(APPEND _protobuf_include_path -I ${ABS_PATH})
-      endif()
-    endforeach()
-  endif()
-
-  set(${SRCS})
-  foreach(FIL ${ARGN})
-    get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
-    get_filename_component(FIL_WE ${FIL} NAME_WE)
-    if(NOT PROTOBUF_GENERATE_CPP_APPEND_PATH)
-      get_filename_component(FIL_DIR ${FIL} DIRECTORY)
-      if(FIL_DIR)
-        set(FIL_WE "${FIL_DIR}/${FIL_WE}")
-      endif()
-    endif()
-    list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py")
-    add_custom_command(
-      OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py"
-      COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} --python_out ${CMAKE_CURRENT_BINARY_DIR} ${_protobuf_include_path} ${ABS_FIL}
-      DEPENDS ${ABS_FIL} ${PROTOBUF_PROTOC_EXECUTABLE}
-      COMMENT "Running Python protocol buffer compiler on ${FIL}"
-      VERBATIM )
-  endforeach()
-
-  set(${SRCS} ${${SRCS}} PARENT_SCOPE)
-endfunction()
-
-# Print and set the protobuf library information, then finish this CMake
-# process and return from this file.
-macro(PROMPT_PROTOBUF_LIB)
-  SET(protobuf_DEPS ${ARGN})
-
-  MESSAGE(STATUS "Protobuf protoc executable: ${PROTOBUF_PROTOC_EXECUTABLE}")
-  MESSAGE(STATUS "Protobuf-lite library: ${PROTOBUF_LITE_LIBRARY}")
-  MESSAGE(STATUS "Protobuf library: ${PROTOBUF_LIBRARY}")
-  MESSAGE(STATUS "Protoc library: ${PROTOBUF_PROTOC_LIBRARY}")
-  MESSAGE(STATUS "Protobuf version: ${PROTOBUF_VERSION}")
-  INCLUDE_DIRECTORIES(${PROTOBUF_INCLUDE_DIR})
-
-  # Assuming that all the protobuf libraries are of the same type.
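-  # For example, a path ending in ${CMAKE_STATIC_LIBRARY_SUFFIX} (".a" on
-  # Linux) selects STATIC below, while a shared-library suffix selects SHARED.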
-  IF(${PROTOBUF_LIBRARY} MATCHES ${CMAKE_STATIC_LIBRARY_SUFFIX})
-    SET(protobuf_LIBTYPE STATIC)
-  ELSEIF(${PROTOBUF_LIBRARY} MATCHES "${CMAKE_SHARED_LIBRARY_SUFFIX}$")
-    SET(protobuf_LIBTYPE SHARED)
-  ELSE()
-    MESSAGE(FATAL_ERROR "Unknown library type: ${PROTOBUF_LIBRARY}")
-  ENDIF()
-
-  ADD_LIBRARY(protobuf ${protobuf_LIBTYPE} IMPORTED GLOBAL)
-  SET_PROPERTY(TARGET protobuf PROPERTY IMPORTED_LOCATION ${PROTOBUF_LIBRARY})
-
-  ADD_LIBRARY(protobuf_lite ${protobuf_LIBTYPE} IMPORTED GLOBAL)
-  SET_PROPERTY(TARGET protobuf_lite PROPERTY IMPORTED_LOCATION ${PROTOBUF_LITE_LIBRARY})
-
-  ADD_LIBRARY(libprotoc ${protobuf_LIBTYPE} IMPORTED GLOBAL)
-  SET_PROPERTY(TARGET libprotoc PROPERTY IMPORTED_LOCATION ${PROTOBUF_PROTOC_LIBRARY})
-
-  ADD_EXECUTABLE(protoc IMPORTED GLOBAL)
-  SET_PROPERTY(TARGET protoc PROPERTY IMPORTED_LOCATION ${PROTOBUF_PROTOC_EXECUTABLE})
-  # FindProtobuf.cmake uses `Protobuf_PROTOC_EXECUTABLE`;
-  # set it to make `protobuf_generate_cpp` happy.
-  SET(Protobuf_PROTOC_EXECUTABLE ${PROTOBUF_PROTOC_EXECUTABLE})
-
-  FOREACH(dep ${protobuf_DEPS})
-    ADD_DEPENDENCIES(protobuf ${dep})
-    ADD_DEPENDENCIES(protobuf_lite ${dep})
-    ADD_DEPENDENCIES(libprotoc ${dep})
-    ADD_DEPENDENCIES(protoc ${dep})
-  ENDFOREACH()
-
-  RETURN()
-endmacro()
-macro(SET_PROTOBUF_VERSION)
-  EXEC_PROGRAM(${PROTOBUF_PROTOC_EXECUTABLE} ARGS --version OUTPUT_VARIABLE PROTOBUF_VERSION)
-  STRING(REGEX MATCH "[0-9]+.[0-9]+" PROTOBUF_VERSION "${PROTOBUF_VERSION}")
-endmacro()
-
-set(PROTOBUF_ROOT "" CACHE PATH "Folder containing protobuf")
-IF (WIN32)
-  SET(PROTOBUF_ROOT ${THIRD_PARTY_PATH}/install/protobuf)
-ENDIF(WIN32)
-
-if (NOT "${PROTOBUF_ROOT}" STREQUAL "")
-  find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include NO_DEFAULT_PATH)
-  find_library(PROTOBUF_LIBRARY protobuf libprotobuf.lib PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
-  find_library(PROTOBUF_LITE_LIBRARY protobuf-lite libprotobuf-lite.lib PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
-  find_library(PROTOBUF_PROTOC_LIBRARY protoc libprotoc.lib PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
-  find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin NO_DEFAULT_PATH)
-  if (PROTOBUF_INCLUDE_DIR AND PROTOBUF_LIBRARY AND PROTOBUF_LITE_LIBRARY AND PROTOBUF_PROTOC_LIBRARY AND PROTOBUF_PROTOC_EXECUTABLE)
-    message(STATUS "Using custom protobuf library in ${PROTOBUF_ROOT}.")
-    SET(PROTOBUF_FOUND true)
-    SET_PROTOBUF_VERSION()
-    PROMPT_PROTOBUF_LIB()
-  else()
-    message(WARNING "Cannot find protobuf library in ${PROTOBUF_ROOT}")
-  endif()
-endif()
-
-FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
-  STRING(REPLACE "extern_" "" TARGET_DIR_NAME "${TARGET_NAME}")
-  SET(PROTOBUF_SOURCES_DIR ${THIRD_PARTY_PATH}/${TARGET_DIR_NAME})
-  SET(PROTOBUF_INSTALL_DIR ${THIRD_PARTY_PATH}/install/${TARGET_DIR_NAME})
-
-  SET(${TARGET_NAME}_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" PARENT_SCOPE)
-  SET(PROTOBUF_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" PARENT_SCOPE)
-  SET(${TARGET_NAME}_LITE_LIBRARY
-      "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite${CMAKE_STATIC_LIBRARY_SUFFIX}"
-      PARENT_SCOPE)
-  SET(${TARGET_NAME}_LIBRARY
-      "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf${CMAKE_STATIC_LIBRARY_SUFFIX}"
-      PARENT_SCOPE)
-  SET(${TARGET_NAME}_PROTOC_LIBRARY
-      "${PROTOBUF_INSTALL_DIR}/lib/libprotoc${CMAKE_STATIC_LIBRARY_SUFFIX}"
-      PARENT_SCOPE)
-  SET(${TARGET_NAME}_PROTOC_EXECUTABLE
-      "${PROTOBUF_INSTALL_DIR}/bin/protoc${CMAKE_EXECUTABLE_SUFFIX}"
-      PARENT_SCOPE)
-
-  # https://github.com/protocolbuffers/protobuf.git
-  SET(PROTOBUF_REPO "")
-  SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546")
"9f75c5aa851cd877fb0d93ccc31b8567a6706546") - SET(OPTIONAL_CACHE_ARGS "") - SET(OPTIONAL_ARGS "") - SET(SOURCE_DIR "${CMAKE_SOURCE_DIR}/third-party/protobuf-host") - - IF(BUILD_FOR_HOST) - # set for server compile. - if (NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) - set(HOST_C_COMPILER "${CMAKE_C_COMPILER}") - set(HOST_CXX_COMPILER "${CMAKE_CXX_COMPILER}") - endif() - - SET(OPTIONAL_ARGS - "-DCMAKE_C_COMPILER=${HOST_C_COMPILER}" - "-DCMAKE_CXX_COMPILER=${HOST_CXX_COMPILER}" - "-Dprotobuf_WITH_ZLIB=OFF" - "-DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}") - SET(OPTIONAL_CACHE_ARGS "-DZLIB_ROOT:STRING=${ZLIB_ROOT}") - ELSE() - # protobuf have compile issue when use android stl c++_static - # https://github.com/tensor-tang/protobuf.git - SET(PROTOBUF_REPO "") - SET(PROTOBUF_TAG "mobile") - SET(SOURCE_DIR "${CMAKE_SOURCE_DIR}/third-party/protobuf-mobile") - SET(OPTIONAL_ARGS "-Dprotobuf_WITH_ZLIB=OFF" - ${CROSS_COMPILE_CMAKE_ARGS} - "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" - "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" - "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}" - "-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}" - "-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}" - "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" - "-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}" - "-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}") - ENDIF() - IF(WIN32) - SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} "-DCMAKE_GENERATOR_PLATFORM=x64") - ENDIF() - - if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) - ExternalProject_Add( - ${TARGET_NAME} - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${PROTOBUF_SOURCES_DIR} - SOURCE_SUBDIR cmake - UPDATE_COMMAND "" - GIT_REPOSITORY "" - GIT_TAG ${PROTOBUF_TAG} - SOURCE_DIR ${SOURCE_DIR} - CMAKE_ARGS - ${OPTIONAL_ARGS} - -Dprotobuf_BUILD_TESTS=OFF - -DCMAKE_SKIP_RPATH=ON - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR} - -DCMAKE_INSTALL_LIBDIR=lib - -DBUILD_SHARED_LIBS=OFF - CMAKE_CACHE_ARGS - -DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR} - -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} - -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF - -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - ${OPTIONAL_CACHE_ARGS} - ) - else() - ExternalProject_Add( - ${TARGET_NAME} - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${SOURCE_DIR} - UPDATE_COMMAND "" - GIT_REPOSITORY "" - GIT_TAG ${PROTOBUF_TAG} - SOURCE_DIR ${SOURCE_DIR} - CONFIGURE_COMMAND ${CMAKE_COMMAND} ${SOURCE_DIR}/cmake - ${OPTIONAL_ARGS} - -Dprotobuf_BUILD_TESTS=OFF - -DCMAKE_SKIP_RPATH=ON - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR} - -DCMAKE_INSTALL_LIBDIR=lib - -DBUILD_SHARED_LIBS=OFF - CMAKE_CACHE_ARGS - -DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR} - -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} - -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF - -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - ${OPTIONAL_CACHE_ARGS} - ) - endif() -ENDFUNCTION() - -SET(PROTOBUF_VERSION 3.1.0) - -IF(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) - build_protobuf(protobuf_host TRUE) - LIST(APPEND external_project_dependencies protobuf_host) - SET(PROTOBUF_PROTOC_EXECUTABLE ${protobuf_host_PROTOC_EXECUTABLE} - CACHE FILEPATH "protobuf executable." FORCE) -ENDIF() - -IF(NOT PROTOBUF_FOUND) - if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) - build_protobuf(extern_protobuf FALSE) - else() - build_protobuf(extern_protobuf TRUE) - endif() - - SET(PROTOBUF_INCLUDE_DIR ${extern_protobuf_INCLUDE_DIR} - CACHE PATH "protobuf include directory." 
-  SET(PROTOBUF_LITE_LIBRARY ${extern_protobuf_LITE_LIBRARY}
-      CACHE FILEPATH "protobuf lite library." FORCE)
-  SET(PROTOBUF_LIBRARY ${extern_protobuf_LIBRARY}
-      CACHE FILEPATH "protobuf library." FORCE)
-  SET(PROTOBUF_PROTOC_LIBRARY ${extern_protobuf_PROTOC_LIBRARY}
-      CACHE FILEPATH "protoc library." FORCE)
-
-  IF(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
-    PROMPT_PROTOBUF_LIB(protobuf_host extern_protobuf)
-  ELSE()
-    SET(PROTOBUF_PROTOC_EXECUTABLE ${extern_protobuf_PROTOC_EXECUTABLE}
-        CACHE FILEPATH "protobuf executable." FORCE)
-    PROMPT_PROTOBUF_LIB(extern_protobuf)
-  ENDIF()
-
-ENDIF(NOT PROTOBUF_FOUND)
diff --git a/cmake/external/xbyak.cmake b/cmake/external/xbyak.cmake
deleted file mode 100644
index 1d61154c0d..0000000000
--- a/cmake/external/xbyak.cmake
+++ /dev/null
@@ -1,57 +0,0 @@
-# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-set(WITH_XBYAK ON)
-if(WIN32 OR APPLE)
-  SET(WITH_XBYAK OFF CACHE STRING "Disable XBYAK in Windows and MacOS" FORCE)
-  return()
-endif()
-
-include(ExternalProject)
-
-set(XBYAK_PROJECT extern_xbyak)
-set(XBYAK_PREFIX_DIR ${THIRD_PARTY_PATH}/xbyak)
-set(XBYAK_INSTALL_ROOT ${THIRD_PARTY_PATH}/install/xbyak)
-set(XBYAK_INC_DIR ${XBYAK_INSTALL_ROOT}/include)
-
-include_directories(${XBYAK_INC_DIR})
-include_directories(${XBYAK_INC_DIR}/xbyak)
-
-add_definitions(-DPADDLE_WITH_XBYAK)
-
-# xbyak options
-add_definitions(-DXBYAK64)
-add_definitions(-DXBYAK_NO_OP_NAMES)
-
-ExternalProject_Add(
-    ${XBYAK_PROJECT}
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    DEPENDS ""
-    GIT_REPOSITORY "https://github.com/herumi/xbyak.git"
-    GIT_TAG "v5.661" # Jul 26th
-    PREFIX ${XBYAK_PREFIX_DIR}
-    UPDATE_COMMAND ""
-    CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${XBYAK_INSTALL_ROOT}
-    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${XBYAK_INSTALL_ROOT}
-)
-
-if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
-  set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/xbyak_dummy.c)
-  file(WRITE ${dummyfile} "const char *dummy_xbyak = \"${dummyfile}\";")
-  add_library(xbyak STATIC ${dummyfile})
-else()
-  add_library(xbyak INTERFACE)
-endif()
-
-add_dependencies(xbyak ${XBYAK_PROJECT})
diff --git a/cmake/external/xxhash.cmake b/cmake/external/xxhash.cmake
deleted file mode 100644
index 23b1e02108..0000000000
--- a/cmake/external/xxhash.cmake
+++ /dev/null
@@ -1,73 +0,0 @@
-INCLUDE(ExternalProject)
-
-set(XXHASH_SOURCE_DIR ${THIRD_PARTY_PATH}/xxhash)
-set(XXHASH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/xxhash)
-set(XXHASH_INCLUDE_DIR "${XXHASH_INSTALL_DIR}/include")
-
-IF(WITH_STATIC_LIB)
-  SET(BUILD_CMD make lib)
-ELSE()
-  IF(APPLE)
-    SET(BUILD_CMD sed -i \"\" "s/-Wstrict-prototypes -Wundef/-Wstrict-prototypes -Wundef -fPIC/g" ${XXHASH_SOURCE_DIR}/src/extern_xxhash/Makefile && make lib)
-  ELSE(APPLE)
-    SET(BUILD_CMD sed -i "s/-Wstrict-prototypes -Wundef/-Wstrict-prototypes -Wundef -fPIC/g" ${XXHASH_SOURCE_DIR}/src/extern_xxhash/Makefile && make lib)
-  ENDIF(APPLE)
-ENDIF()
-
-if(WIN32)
-  ExternalProject_Add(
-      extern_xxhash
-      ${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/Cyan4973/xxHash" - GIT_TAG "v0.6.5" - PREFIX ${XXHASH_SOURCE_DIR} - DOWNLOAD_NAME "xxhash" - UPDATE_COMMAND "" - BUILD_IN_SOURCE 1 - PATCH_COMMAND - CONFIGURE_COMMAND - ${CMAKE_COMMAND} ${XXHASH_SOURCE_DIR}/src/extern_xxhash/cmake_unofficial - -DCMAKE_INSTALL_PREFIX:PATH=${XXHASH_INSTALL_DIR} - -DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE} - -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF - -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DBUILD_XXHSUM=OFF - -DCMAKE_GENERATOR_PLATFORM=x64 - -DBUILD_SHARED_LIBS=OFF - ${OPTIONAL_CACHE_ARGS} - TEST_COMMAND "" - ) -else() - ExternalProject_Add( - extern_xxhash - ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/Cyan4973/xxHash" - GIT_TAG "v0.6.5" - PREFIX ${XXHASH_SOURCE_DIR} - DOWNLOAD_NAME "xxhash" - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_IN_SOURCE 1 - PATCH_COMMAND - BUILD_COMMAND ${BUILD_CMD} - INSTALL_COMMAND export PREFIX=${XXHASH_INSTALL_DIR}/ && make install - TEST_COMMAND "" - ) -endif() - -if (WIN32) - IF(NOT EXISTS "${XXHASH_INSTALL_DIR}/lib/libxxhash.lib") - add_custom_command(TARGET extern_xxhash POST_BUILD - COMMAND cmake -E copy ${XXHASH_INSTALL_DIR}/lib/xxhash.lib ${XXHASH_INSTALL_DIR}/lib/libxxhash.lib - ) - ENDIF() - set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/libxxhash.lib") -else() - set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/libxxhash.a") -endif () -INCLUDE_DIRECTORIES(${XXHASH_INCLUDE_DIR}) - -add_library(xxhash STATIC IMPORTED GLOBAL) -set_property(TARGET xxhash PROPERTY IMPORTED_LOCATION ${XXHASH_LIBRARIES}) -include_directories(${XXHASH_INCLUDE_DIR}) -add_dependencies(xxhash extern_xxhash) diff --git a/cmake/flags.cmake b/cmake/flags.cmake deleted file mode 100644 index 36b533aa4f..0000000000 --- a/cmake/flags.cmake +++ /dev/null @@ -1,194 +0,0 @@ -# Setting Paddle Compile Flags -include(CheckCXXCompilerFlag) -include(CheckCCompilerFlag) -include(CheckCXXSymbolExists) -include(CheckTypeSize) - -function(CheckCompilerCXX11Flag) - if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") - if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.8) - message(FATAL_ERROR "Unsupported GCC version. GCC >= 4.8 required.") - endif() - elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang") - # cmake >= 3.0 compiler id "AppleClang" on Mac OS X, otherwise "Clang" - # Apple Clang is a different compiler than upstream Clang which havs different version numbers. - # https://gist.github.com/yamaya/2924292 - if(APPLE) # cmake < 3.0 compiler id "Clang" on Mac OS X - if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 5.1) - message(FATAL_ERROR "Unsupported AppleClang version. AppleClang >= 5.1 required.") - endif() - else() - if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.3) - message(FATAL_ERROR "Unsupported Clang version. Clang >= 3.3 required.") - endif() - endif() - endif() -endfunction() - -CheckCompilerCXX11Flag() -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") -# safe_set_flag -# -# Set a compile flag only if compiler is support -# is_c: is C flag or C++ flag, bool type. -# src_list: The list name which the flag name will be append to. -# flag_name: the flag name for compiler, such as '-Werror' '-Wall' etc -# rest arguments: not used. 
-function(safe_set_flag is_c src_list flag_name)
-  string(REPLACE "-" "_" safe_name ${flag_name})
-  string(REPLACE "=" "_" safe_name ${safe_name})
-  if(is_c)
-    CHECK_C_COMPILER_FLAG(${flag_name} C_COMPILER_SUPPORT_FLAG_${safe_name})
-    set(safe_name C_COMPILER_SUPPORT_FLAG_${safe_name})
-  else()
-    CHECK_CXX_COMPILER_FLAG(${flag_name} CXX_COMPILER_SUPPORT_FLAG_${safe_name})
-    set(safe_name CXX_COMPILER_SUPPORT_FLAG_${safe_name})
-  endif()
-  if(${safe_name})
-    set(${src_list} "${${src_list}} ${flag_name}" PARENT_SCOPE)
-  endif()
-endfunction()
-
-# helper macro to set cflag
-macro(safe_set_cflag src_list flag_name)
-  safe_set_flag(ON ${src_list} ${flag_name})
-endmacro()
-
-# helper macro to set cxxflag
-macro(safe_set_cxxflag src_list flag_name)
-  safe_set_flag(OFF ${src_list} ${flag_name})
-endmacro()
-
-# helper macro to set nvcc flag
-macro(safe_set_nvflag flag_name)
-  string(REPLACE "-" "_" safe_name ${flag_name})
-  string(REPLACE "=" "_" safe_name ${safe_name})
-  CHECK_C_COMPILER_FLAG(${flag_name} C_COMPILER_SUPPORT_FLAG_${safe_name})
-  set(safe_name C_COMPILER_SUPPORT_FLAG_${safe_name})
-  if(${safe_name})
-    LIST(APPEND CUDA_NVCC_FLAGS -Xcompiler ${flag_name})
-  endif()
-endmacro()
-
-macro(safe_set_static_flag)  # set c_flags and cxx_flags to static or shared
-  if (BUILD_SHARED_LIBS)
-    return()  # if building shared libs, keep the flags consistent with '/MD'
-  endif(BUILD_SHARED_LIBS)
-  foreach(flag_var
-      CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
-      CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
-      CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
-      CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
-    if(${flag_var} MATCHES "/MD")
-      string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
-    endif(${flag_var} MATCHES "/MD")
-  endforeach(flag_var)
-endmacro()
-
-CHECK_CXX_SYMBOL_EXISTS(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS)
-if(NOT UINT64_MAX_EXISTS)
-  set(CMAKE_REQUIRED_DEFINITIONS -D__STDC_LIMIT_MACROS)
-  CHECK_CXX_SYMBOL_EXISTS(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS_HERE)
-  if(UINT64_MAX_EXISTS_HERE)
-    set(CMAKE_REQUIRED_DEFINITIONS)
-    add_definitions(-D__STDC_LIMIT_MACROS)
-  else()
-    message(FATAL_ERROR "Cannot find symbol UINT64_MAX")
-  endif()
-endif()
-
-SET(CMAKE_EXTRA_INCLUDE_FILES "pthread.h")
-CHECK_TYPE_SIZE(pthread_spinlock_t SPINLOCK_FOUND)
-CHECK_TYPE_SIZE(pthread_barrier_t BARRIER_FOUND)
-if(SPINLOCK_FOUND)
-  add_definitions(-DPADDLE_USE_PTHREAD_SPINLOCK)
-endif(SPINLOCK_FOUND)
-if(BARRIER_FOUND)
-  add_definitions(-DPADDLE_USE_PTHREAD_BARRIER)
-endif(BARRIER_FOUND)
-SET(CMAKE_EXTRA_INCLUDE_FILES "")
-
-# Common flags: the compiler flags used for C/C++ sources in both release and
-# debug builds. Whether gcc actually supports each flag is checked before the
-# flag is applied, so no need to worry about that here.
-
-# https://github.com/PaddlePaddle/Paddle/issues/12773
-if (NOT WIN32)
-set(COMMON_FLAGS
-    -fPIC
-    -fno-omit-frame-pointer
-    -Werror
-    -Wall
-    -Wextra
-    -Wnon-virtual-dtor
-    -Wdelete-non-virtual-dtor
-    -Wno-unused-parameter
-    -Wno-unused-function
-    -Wno-error=literal-suffix
-    -Wno-error=sign-compare
-    -Wno-error=unused-local-typedefs
-    -Wno-error=parentheses-equality # Warnings in pybind11
-    -Wno-error=ignored-attributes # Warnings in Eigen, gcc 6.3
-    -Wno-error=terminate # Warning in PADDLE_ENFORCE
-    -Wno-error=int-in-bool-context # Warning in Eigen gcc 7.2
-    -Wimplicit-fallthrough=0 # Warning in tinyformat.h
-    -Wno-error=maybe-uninitialized # Warning in boost gcc 7.2
-)
-
-set(GPU_COMMON_FLAGS
-    -fPIC
-    -fno-omit-frame-pointer
-    -Wnon-virtual-dtor
-    -Wdelete-non-virtual-dtor
-    -Wno-unused-parameter
-    -Wno-unused-function
-    -Wno-error=sign-compare
-    -Wno-error=literal-suffix
-    -Wno-error=unused-local-typedefs
-    -Wno-error=unused-function # Warnings in Numpy Header.
-    -Wno-error=array-bounds # Warnings in Eigen::array
-)
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64")
-endif(NOT WIN32)
-
-if (APPLE)
-  # On Mac OS X build fat binaries with x86_64 architectures by default.
-  set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE)
-  # On Mac OS X the register class specifier is deprecated and causes a warning error on the latest clang 10.0
-  set (COMMON_FLAGS -Wno-deprecated-register)
-endif(APPLE)
-
-if(UNIX AND NOT APPLE)
-  # treat every *nix-family OS except Apple as Linux
-  set(LINUX TRUE)
-endif(UNIX AND NOT APPLE)
-
-# LINUX must be set (above) before it is tested here.
-if(LINUX)
-  set(GPU_COMMON_FLAGS
-      -Wall
-      -Wextra
-      -Werror
-      ${GPU_COMMON_FLAGS})
-endif(LINUX)
-
-foreach(flag ${COMMON_FLAGS})
-  safe_set_cflag(CMAKE_C_FLAGS ${flag})
-  safe_set_cxxflag(CMAKE_CXX_FLAGS ${flag})
-
-endforeach()
-
-foreach(flag ${GPU_COMMON_FLAGS})
-  safe_set_nvflag(${flag})
-endforeach()
-
-if(WIN32)
-# windows build: turn off warnings.
-safe_set_static_flag()
-  foreach(flag_var
-      CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
-      CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
-      CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
-      CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
-    string(REGEX REPLACE "(^| )/W[0-9]( |$)" " " ${flag_var} "${${flag_var}}")
-    set(flag_var "${flag_var} /w")
-  endforeach(flag_var)
-endif(WIN32)
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
deleted file mode 100644
index a87c64cbe9..0000000000
--- a/cmake/generic.cmake
+++ /dev/null
@@ -1,567 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-
-# generic.cmake defines CMake functions that look like Bazel's
-# building rules (https://bazel.build/).
-#
-#
-# -------------------------------------------
-#     C++          CUDA C++       Go
-# -------------------------------------------
-# cc_library     nv_library     go_library
-# cc_binary      nv_binary      go_binary
-# cc_test        nv_test        go_test
-# -------------------------------------------
-#
-# To build a static library example.a from example.cc using the system
-# compiler (like GCC):
-#
-#   cc_library(example SRCS example.cc)
-#
-# To build a static library example.a from multiple source files
-# example{1,2,3}.cc:
-#
-#   cc_library(example SRCS example1.cc example2.cc example3.cc)
-#
-# To build a shared library example.so from example.cc:
-#
-#   cc_library(example SHARED SRCS example.cc)
-#
-# To build a library using Nvidia's NVCC from .cu file(s), use the nv_
-# prefixed version:
-#
-#   nv_library(example SRCS example.cu)
-#
-# To specify that a library new_example.a depends on other libraries:
-#
-#   cc_library(new_example SRCS new_example.cc DEPS example)
-#
-# Static libraries can be composed of other static libraries:
-#
-#   cc_library(composed DEPS dependent1 dependent2 dependent3)
-#
-# To build an executable binary file from some source files and
-# dependent libraries:
-#
-#   cc_binary(example SRCS main.cc something.cc DEPS example1 example2)
-#
-# To build an executable binary file using NVCC, use the nv_ prefixed
-# version:
-#
-#   nv_binary(example SRCS main.cc something.cu DEPS example1 example2)
-#
-# To build a unit test binary, which is an executable binary with
-# GoogleTest linked:
-#
-#   cc_test(example_test SRCS example_test.cc DEPS example)
-#
-# To build a unit test binary using NVCC, use the nv_ prefixed version:
-#
-#   nv_test(example_test SRCS example_test.cu DEPS example)
-#
-# It is pretty often that executable and test binaries depend on
-# pre-defined external libraries like glog and gflags defined in
-# /cmake/external/*.cmake:
-#
-#   cc_test(example_test SRCS example_test.cc DEPS example glog gflags)
-#
-# To build a go static library using Golang, use the go_ prefixed version:
-#
-#   go_library(example STATIC)
-#
-# To build a go shared library using Golang, use the go_ prefixed version:
-#
-#   go_library(example SHARED)
-#
-
-# including binary directory for generated headers.
-include_directories(${CMAKE_CURRENT_BINARY_DIR})
-
-if(NOT APPLE)
-  find_package(Threads REQUIRED)
-  link_libraries(${CMAKE_THREAD_LIBS_INIT})
-  set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl")
-  if (NOT ANDROID)
-    set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -lrt")
-  endif()
-endif(NOT APPLE)
-
-set_property(GLOBAL PROPERTY FLUID_MODULES "")
-# find_fluid_modules collects all fluid modules; the list is used when
-# building the paddle fluid static library for inference.
-function(find_fluid_modules TARGET_NAME)
-  get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
-  string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path})
-  string(FIND "${__target_path}" "fluid" pos)
-  if(pos GREATER 1)
-    get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
-    set(fluid_modules ${fluid_modules} ${TARGET_NAME})
-    set_property(GLOBAL PROPERTY FLUID_MODULES "${fluid_modules}")
-  endif()
-endfunction(find_fluid_modules)
-
-
-function(common_link TARGET_NAME)
-  if (WITH_PROFILER)
-    target_link_libraries(${TARGET_NAME} gperftools::profiler)
-  endif()
-
-  if (WITH_JEMALLOC)
-    target_link_libraries(${TARGET_NAME} jemalloc::jemalloc)
-  endif()
-endfunction()
-
-
-# find_fluid_thirdparties collects third_party modules for the paddle static
-# library, to reduce dependencies when building the inference libs.
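-# For example (illustrative target name), find_fluid_thirdparties(my_lib)
-# records my_lib in the FLUID_THIRD_PARTY global property when its source
-# path contains "third_party".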
-set_property(GLOBAL PROPERTY FLUID_THIRD_PARTY)
-function(find_fluid_thirdparties TARGET_NAME)
-  get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
-  string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path})
-  string(FIND "${__target_path}" "third_party" pos)
-  if(pos GREATER 1)
-    get_property(fluid_third_partys GLOBAL PROPERTY FLUID_THIRD_PARTY)
-    set(fluid_third_partys ${fluid_third_partys} ${TARGET_NAME})
-    set_property(GLOBAL PROPERTY FLUID_THIRD_PARTY "${fluid_third_partys}")
-  endif()
-endfunction(find_fluid_thirdparties)
-
-function(merge_static_libs TARGET_NAME)
-  set(libs ${ARGN})
-  list(REMOVE_DUPLICATES libs)
-
-  # Get all propagation dependencies from the merged libraries
-  foreach(lib ${libs})
-    list(APPEND libs_deps ${${lib}_LIB_DEPENDS})
-  endforeach()
-  if(libs_deps)
-    list(REMOVE_DUPLICATES libs_deps)
-  endif()
-
-  # To produce a library we need at least one source file.
-  # It is created by add_custom_command below and also helps
-  # to track dependencies.
-  set(target_SRCS ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c)
-
-  if(APPLE) # Use OSX's libtool to merge archives
-    # Make the generated dummy source file depend on all static input
-    # libs. If an input lib changes, the source file is touched,
-    # which causes the desired effect (relink).
-    add_custom_command(OUTPUT ${target_SRCS}
-      COMMAND ${CMAKE_COMMAND} -E touch ${target_SRCS}
-      DEPENDS ${libs})
-
-    # Generate dummy static lib
-    file(WRITE ${target_SRCS} "const char *dummy_${TARGET_NAME} = \"${target_SRCS}\";")
-    add_library(${TARGET_NAME} STATIC ${target_SRCS})
-    target_link_libraries(${TARGET_NAME} ${libs_deps})
-
-    foreach(lib ${libs})
-      # Get the file names of the libraries to be merged
-      set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
-    endforeach()
-    add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
-      COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a"
-      COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles}
-      )
-  endif(APPLE)
-  if(LINUX) # general UNIX: use "ar" to extract objects and re-add to a common lib
-    set(target_DIR ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}.dir)
-
-    foreach(lib ${libs})
-      set(objlistfile ${target_DIR}/${lib}.objlist) # list of objects in the input library
-      set(objdir ${target_DIR}/${lib}.objdir)
-
-      add_custom_command(OUTPUT ${objdir}
-        COMMAND ${CMAKE_COMMAND} -E make_directory ${objdir}
-        DEPENDS ${lib})
-
-      add_custom_command(OUTPUT ${objlistfile}
-        COMMAND ${CMAKE_AR} -x "$<TARGET_FILE:${lib}>"
-        COMMAND ${CMAKE_AR} -t "$<TARGET_FILE:${lib}>" > ${objlistfile}
-        DEPENDS ${lib} ${objdir}
-        WORKING_DIRECTORY ${objdir})
-
-      list(APPEND target_OBJS "${objlistfile}")
-    endforeach()
-
-    # Make the generated dummy source file depend on all static input
-    # libs. If an input lib changes, the source file is touched,
-    # which causes the desired effect (relink).
-    add_custom_command(OUTPUT ${target_SRCS}
-      COMMAND ${CMAKE_COMMAND} -E touch ${target_SRCS}
-      DEPENDS ${libs} ${target_OBJS})
-
-    # Generate dummy static lib
-    file(WRITE ${target_SRCS} "const char *dummy_${TARGET_NAME} = \"${target_SRCS}\";")
-    add_library(${TARGET_NAME} STATIC ${target_SRCS})
-    target_link_libraries(${TARGET_NAME} ${libs_deps})
-
-    # Get the file name of the generated library
-    set(target_LIBNAME "$<TARGET_FILE:${TARGET_NAME}>")
-
-    add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
-      COMMAND ${CMAKE_AR} crs ${target_LIBNAME} `find ${target_DIR} -name '*.o'`
-      COMMAND ${CMAKE_RANLIB} ${target_LIBNAME}
-      WORKING_DIRECTORY ${target_DIR})
-  endif(LINUX)
-  if(WIN32) # Windows does not support gcc/nvcc combined compiling. Use MSVC lib.exe to merge libs.
-    # Make the generated dummy source file depend on all static input
-    # libs. If an input lib changes, the source file is touched,
-    # which causes the desired effect (relink).
-    add_custom_command(OUTPUT ${target_SRCS}
-      COMMAND ${CMAKE_COMMAND} -E touch ${target_SRCS}
-      DEPENDS ${libs})
-
-    # Generate dummy static lib
-    file(WRITE ${target_SRCS} "const char *dummy_${TARGET_NAME} = \"${target_SRCS}\";")
-    add_library(${TARGET_NAME} STATIC ${target_SRCS})
-    target_link_libraries(${TARGET_NAME} ${libs_deps})
-
-    foreach(lib ${libs})
-      # Get the file names of the libraries to be merged
-      set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
-    endforeach()
-    # MSVC puts the library in a "/Release/xxxlib" directory by default
-    # COMMAND cmake -E remove "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/${TARGET_NAME}.lib"
-    add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
-      COMMAND cmake -E make_directory "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}"
-      COMMAND lib /OUT:${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/lib${TARGET_NAME}.lib ${libfiles}
-      )
-  endif(WIN32)
-endfunction(merge_static_libs)
-
-function(cc_library TARGET_NAME)
-  set(options STATIC static SHARED shared)
-  set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS)
-  cmake_parse_arguments(cc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-  if(WIN32)
-    # add the libxxx.lib prefix on Windows
-    set(${TARGET_NAME}_LIB_NAME "${CMAKE_STATIC_LIBRARY_PREFIX}${TARGET_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE STRING "output library name for target ${TARGET_NAME}")
-  endif(WIN32)
-  if(cc_library_SRCS)
-    if(cc_library_SHARED OR cc_library_shared) # build *.so
-      add_library(${TARGET_NAME} SHARED ${cc_library_SRCS})
-    else()
-      add_library(${TARGET_NAME} STATIC ${cc_library_SRCS})
-      find_fluid_modules(${TARGET_NAME})
-    endif()
-
-    if(cc_library_DEPS)
-      # No need to link libwarpctc.so
-      if("${cc_library_DEPS};" MATCHES "warpctc;")
-        list(REMOVE_ITEM cc_library_DEPS warpctc)
-        add_dependencies(${TARGET_NAME} warpctc)
-      endif()
-      # Only depend on libmklml.so; do not link it
-      if("${cc_library_DEPS};" MATCHES "mklml;")
-        list(REMOVE_ITEM cc_library_DEPS mklml)
-        if(NOT "${TARGET_NAME}" MATCHES "dynload_mklml")
-          list(APPEND cc_library_DEPS dynload_mklml)
-        endif()
-        add_dependencies(${TARGET_NAME} mklml)
-        if(WIN32)
-          target_link_libraries(${TARGET_NAME} ${MKLML_IOMP_LIB})
-        else(WIN32)
-          target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed")
-        endif(WIN32)
-      endif()
-      # remove link to python, see notes at:
-      # https://github.com/pybind/pybind11/blob/master/docs/compiling.rst#building-manually
-      if("${cc_library_DEPS};" MATCHES "python;")
-        list(REMOVE_ITEM cc_library_DEPS python)
-        add_dependencies(${TARGET_NAME} python)
-        if(WIN32)
-          target_link_libraries(${TARGET_NAME} ${PYTHON_LIBRARIES})
-        else()
-          target_link_libraries(${TARGET_NAME} "-Wl,-undefined,dynamic_lookup")
-        endif(WIN32)
-      endif()
-      target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
-      add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
-      common_link(${TARGET_NAME})
-    endif()
-
-    set(full_path_src "")
-    # cpplint code style
-    foreach(source_file ${cc_library_SRCS})
-      string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
-      if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
-        list(APPEND cc_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
-      endif()
-      if(${source_file} MATCHES "framework.pb.cc")
-        list(APPEND full_path_src ${source_file})
-      else()
-        list(APPEND full_path_src ${CMAKE_CURRENT_SOURCE_DIR}/${source_file})
-      endif()
-    endforeach()
-    set(__lite_cc_files ${__lite_cc_files} ${full_path_src} CACHE INTERNAL "")
-  else(cc_library_SRCS)
-    if(cc_library_DEPS)
-      merge_static_libs(${TARGET_NAME} ${cc_library_DEPS})
-    else()
-      message(FATAL_ERROR "Please specify source files or libraries in cc_library(${TARGET_NAME} ...).")
-    endif()
-  endif(cc_library_SRCS)
-endfunction(cc_library)
-
-# The link operation on Windows may exceed the maximum command-length limit;
-# breaking the link command into multiple link operations fixes that, e.g.
-# original:
-#   lib /out:target.lib a.lib b.lib c.lib d.lib
-# after:
-#   1. lib /out:dummy_lib_1.lib a.lib b.lib
-#   2. lib /out:dummy_lib_2.lib c.lib d.lib
-#   3. lib /out:target.lib dummy_lib_1.lib dummy_lib_2.lib
-function(sep_library TARGET_NAME)
-  set(options STATIC static SHARED shared)
-  set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS)
-  cmake_parse_arguments(sep_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-  set(dummy_index 1)
-  set(dummy_offset 1)
-  # each dummy target is composed of at most dummy_limit libraries
-  set(dummy_limit 50)
-  list(LENGTH sep_library_DEPS sep_all_len)
-  foreach(v ${sep_library_DEPS})
-    list(APPEND dummy_list ${v})
-    list(LENGTH dummy_list listlen )
-    if ((${listlen} GREATER ${dummy_limit}) OR (${dummy_offset} EQUAL ${sep_all_len}))
-      message("create dummy library ${TARGET_NAME}_dummy_lib_${dummy_index} for ${TARGET_NAME}")
-      cc_library(${TARGET_NAME}_dummy_lib_${dummy_index} STATIC DEPS ${dummy_list})
-      foreach(i ${dummy_list})
-        list(REMOVE_AT dummy_list 0)
-      endforeach()
-      list(APPEND ${TARGET_NAME}_dummy_list ${TARGET_NAME}_dummy_lib_${dummy_index})
-      MATH(EXPR dummy_index "${dummy_index}+1")
-    endif()
-    MATH(EXPR dummy_offset "${dummy_offset}+1")
-  endforeach()
-  if(${sep_library_SHARED})
-    cc_library(${TARGET_NAME} SHARED SRCS ${sep_library_SRCS} DEPS ${${TARGET_NAME}_dummy_list})
-  else(${sep_library_SHARED})
-    cc_library(${TARGET_NAME} STATIC SRCS ${sep_library_SRCS} DEPS ${${TARGET_NAME}_dummy_list})
-  endif(${sep_library_SHARED})
-endfunction(sep_library)
-
-function(cc_binary TARGET_NAME)
-  set(options "")
-  set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS)
-  cmake_parse_arguments(cc_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-  add_executable(${TARGET_NAME} ${cc_binary_SRCS})
-  if(cc_binary_DEPS)
-    target_link_libraries(${TARGET_NAME} ${cc_binary_DEPS})
-    add_dependencies(${TARGET_NAME} ${cc_binary_DEPS})
-    common_link(${TARGET_NAME})
-  endif()
-  get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
-  target_link_libraries(${TARGET_NAME} ${os_dependency_modules})
-endfunction(cc_binary)
-
-function(cc_test TARGET_NAME)
-  if(WITH_TESTING)
-    set(options SERIAL)
-    set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS ARGS)
-    cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
"${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - add_executable(${TARGET_NAME} ${cc_test_SRCS}) - if(WIN32) - if("${cc_test_DEPS};" MATCHES "python;") - list(REMOVE_ITEM cc_test_DEPS python) - target_link_libraries(${TARGET_NAME} ${PYTHON_LIBRARIES}) - endif() - endif(WIN32) - get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) - target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} ${os_dependency_modules} paddle_gtest_main memory gtest gflags glog) - add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog) - common_link(${TARGET_NAME}) - add_test(NAME ${TARGET_NAME} - COMMAND ${TARGET_NAME} ${cc_test_ARGS} - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) - if (${cc_test_SERIAL}) - set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1) - endif() - set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true) - set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) - set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_limit_of_tmp_allocation=4294967296) # 4G - set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true) - # No unit test should exceed 10 minutes. - set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600) - endif() -endfunction(cc_test) - -# cc_test without default dependencies -function(raw_cc_test TARGET_NAME) - if(WITH_TESTING) - set(options SERIAL) - set(oneValueArgs "") - set(multiValueArgs SRCS DEPS ARGS) - cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - add_executable(${TARGET_NAME} ${cc_test_SRCS}) - if(WIN32) - if("${cc_test_DEPS};" MATCHES "python;") - list(REMOVE_ITEM cc_test_DEPS python) - target_link_libraries(${TARGET_NAME} ${PYTHON_LIBRARIES}) - endif() - endif(WIN32) - get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) - - if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) - target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} ${os_dependency_modules} lite_gtest_main gtest gflags logging) - add_dependencies(${TARGET_NAME} ${cc_test_DEPS} lite_gtest_main gtest gflags logging) - else() - target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} ${os_dependency_modules} lite_gtest_main gtest gflags glog) - add_dependencies(${TARGET_NAME} ${cc_test_DEPS} lite_gtest_main gtest gflags glog) - endif(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) - - common_link(${TARGET_NAME}) - add_test(NAME ${TARGET_NAME} - COMMAND ${TARGET_NAME} ${cc_test_ARGS} - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) - if (${cc_test_SERIAL}) - set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1) - endif() - # No unit test should exceed 10 minutes. 
-    set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
-  endif()
-endfunction(raw_cc_test)
-
-function(_lite_cc_test args)
-  message(STATUS "building lite raw test: ${args}")
-  raw_cc_test(${args} ${ARGN})
-endfunction()
-
-function(nv_library TARGET_NAME)
-  if (LITE_WITH_CUDA)
-    set(options STATIC static SHARED shared)
-    set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS)
-    cmake_parse_arguments(nv_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-    if(nv_library_SRCS)
-      if (nv_library_SHARED OR nv_library_shared) # build *.so
-        cuda_add_library(${TARGET_NAME} SHARED ${nv_library_SRCS})
-      else()
-        cuda_add_library(${TARGET_NAME} STATIC ${nv_library_SRCS})
-        find_fluid_modules(${TARGET_NAME})
-      endif()
-      if (nv_library_DEPS)
-        add_dependencies(${TARGET_NAME} ${nv_library_DEPS})
-        target_link_libraries(${TARGET_NAME} ${nv_library_DEPS})
-      endif()
-      # cpplint code style
-      foreach(source_file ${nv_library_SRCS})
-        string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
-        if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
-          list(APPEND nv_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
-        endif()
-      endforeach()
-    else(nv_library_SRCS)
-      if (nv_library_DEPS)
-        merge_static_libs(${TARGET_NAME} ${nv_library_DEPS})
-      else()
-        message(FATAL_ERROR "Please specify source file or library in nv_library.")
-      endif()
-    endif(nv_library_SRCS)
-  endif()
-endfunction(nv_library)
-
-function(nv_binary TARGET_NAME)
-  if (LITE_WITH_CUDA)
-    set(options "")
-    set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS)
-    cmake_parse_arguments(nv_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-    cuda_add_executable(${TARGET_NAME} ${nv_binary_SRCS})
-    if(nv_binary_DEPS)
-      target_link_libraries(${TARGET_NAME} ${nv_binary_DEPS})
-      add_dependencies(${TARGET_NAME} ${nv_binary_DEPS})
-      common_link(${TARGET_NAME})
-    endif()
-  endif()
-endfunction(nv_binary)
-
-function(nv_test TARGET_NAME)
-  if (LITE_WITH_CUDA AND WITH_TESTING)
-    set(options SERIAL)
-    set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS)
-    cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-    cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
-    get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
-    target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} lite_gtest_main gtest
-                          gflags glog ${os_dependency_modules} ${CUDNN_LIBRARY})
-    add_dependencies(${TARGET_NAME} ${nv_test_DEPS} lite_gtest_main gtest gflags glog)
-    common_link(${TARGET_NAME})
-    add_test(${TARGET_NAME} ${TARGET_NAME})
-    if (nv_test_SERIAL)
-      set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
-    endif()
-  endif()
-endfunction(nv_test)
-
-
-# Modification of the standard 'protobuf_generate_cpp()' with protobuf-lite support
-# Usage:
-#   paddle_protobuf_generate_cpp(<SRCS> <HDRS> <proto_files...>)
-function(paddle_protobuf_generate_cpp SRCS HDRS)
-  if(NOT ARGN)
-    message(SEND_ERROR "Error: paddle_protobuf_generate_cpp() called without any proto files")
-    return()
-  endif()
-
-  set(${SRCS})
-  set(${HDRS})
-
-  foreach(FIL ${ARGN})
-    get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
-    get_filename_component(FIL_WE ${FIL} NAME_WE)
-
-    set(_protobuf_protoc_src "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.cc")
-    set(_protobuf_protoc_hdr "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.h")
-    list(APPEND ${SRCS} "${_protobuf_protoc_src}")
-    list(APPEND ${HDRS} "${_protobuf_protoc_hdr}")
-
-    add_custom_command(
-      OUTPUT "${_protobuf_protoc_src}"
-             "${_protobuf_protoc_hdr}"
-
-      COMMAND ${CMAKE_COMMAND} -E make_directory "${CMAKE_CURRENT_BINARY_DIR}"
"${CMAKE_CURRENT_BINARY_DIR}" - COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} - -I${CMAKE_CURRENT_SOURCE_DIR} - --cpp_out "${CMAKE_CURRENT_BINARY_DIR}" ${ABS_FIL} - DEPENDS ${ABS_FIL} protoc - COMMENT "Running C++ protocol buffer compiler on ${FIL}" - VERBATIM ) - endforeach() - - set_source_files_properties(${${SRCS}} ${${HDRS}} PROPERTIES GENERATED TRUE) - set(${SRCS} ${${SRCS}} PARENT_SCOPE) - set(${HDRS} ${${HDRS}} PARENT_SCOPE) -endfunction() - - -function(proto_library TARGET_NAME) - set(oneValueArgs "") - set(multiValueArgs SRCS DEPS) - cmake_parse_arguments(proto_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - set(proto_srcs) - set(proto_hdrs) - paddle_protobuf_generate_cpp(proto_srcs proto_hdrs ${proto_library_SRCS}) - cc_library(${TARGET_NAME} SRCS ${proto_srcs} DEPS ${proto_library_DEPS} protobuf) -endfunction() diff --git a/cmake/hip.cmake b/cmake/hip.cmake deleted file mode 100644 index c3a748db50..0000000000 --- a/cmake/hip.cmake +++ /dev/null @@ -1,53 +0,0 @@ -if(NOT WITH_AMD_GPU) - return() -endif() - -include_directories("/opt/rocm/include") -include_directories("/opt/rocm/hip/include") -include_directories("/opt/rocm/miopen/include") -include_directories("/opt/rocm/hipblas/include") -include_directories("/opt/rocm/hiprand/include") -include_directories("/opt/rocm/rocrand/include") -include_directories("/opt/rocm/rccl/include") -include_directories("/opt/rocm/thrust") - -set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++11" ) - -if(WITH_DSO) - set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_USE_DSO") -endif(WITH_DSO) - -if(WITH_TESTING) - set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_TESTING") -endif(WITH_TESTING) - -if(WITH_DISTRIBUTE) - set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_DISTRIBUTE") -endif(WITH_DISTRIBUTE) - -if(WITH_GRPC) - set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_GRPC") -endif(WITH_GRPC) - -if(WITH_MKLDNN) - set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_MKLDNN") -endif(WITH_MKLDNN) - -set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DANY_IMPL_ANY_CAST_MOVEABLE") - -if(CMAKE_BUILD_TYPE STREQUAL "Debug") - list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG}) -elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") - list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}) -elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") - list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_MINSIZEREL}) -endif() - -if("x${HCC_HOME}" STREQUAL "x") - set(HCC_HOME "/opt/rocm/hcc") -endif() - -set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} -o ") -set(CMAKE_HIP_CREATE_SHARED_LIBRARY "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} -o -shared") -set(CMAKE_HIP_CREATE_SHARED_MODULE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} -o -shared") - diff --git a/cmake/lite.cmake b/cmake/lite.cmake deleted file mode 100644 index 707982a3e7..0000000000 --- a/cmake/lite.cmake +++ /dev/null @@ -1,435 +0,0 @@ -set(LITE_URL "http://paddle-inference-dist.bj.bcebos.com" CACHE STRING "inference download url") - -function(lite_download_and_uncompress INSTALL_DIR URL FILENAME) - message(STATUS "Download inference test stuff from ${URL}/${FILENAME}") - string(REGEX REPLACE "[-%.]" "_" FILENAME_EX ${FILENAME}) - set(EXTERNAL_PROJECT_NAME "extern_lite_download_${FILENAME_EX}") - set(UNPACK_DIR "${INSTALL_DIR}/src/${EXTERNAL_PROJECT_NAME}") - ExternalProject_Add( - ${EXTERNAL_PROJECT_NAME} - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${INSTALL_DIR} - DOWNLOAD_COMMAND wget --no-check-certificate -q -O ${INSTALL_DIR}/${FILENAME} ${URL}/${FILENAME} && 
${CMAKE_COMMAND} -E tar xzf ${INSTALL_DIR}/${FILENAME} - DOWNLOAD_DIR ${INSTALL_DIR} - DOWNLOAD_NO_PROGRESS 1 - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - UPDATE_COMMAND "" - INSTALL_COMMAND "" - ) -endfunction() - -function (lite_deps TARGET) - set(options "") - set(oneValueArgs "") - set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS NPU_DEPS ARGS) - cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - - set(deps ${lite_deps_DEPS}) - - if(LITE_WITH_X86) - foreach(var ${lite_deps_X86_DEPS}) - set(deps ${deps} ${var}) - endforeach(var) - endif() - - if(LITE_WITH_CUDA) - foreach(var ${lite_deps_CUDA_DEPS}) - set(deps ${deps} ${var}) - endforeach(var) - endif() - - if(LITE_WITH_ARM) - foreach(var ${lite_deps_ARM_DEPS}) - set(deps ${deps} ${var}) - endforeach(var) - endif() - - if(LITE_WITH_PROFILE) - foreach(var ${lite_deps_PROFILE_DEPS}) - set(deps ${deps} ${var}) - endforeach(var) - endif() - - if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) - foreach(var ${lite_deps_LIGHT_DEPS}) - set(deps ${deps} ${var}) - endforeach(var) - endif() - - - - if (NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) - foreach(var ${lite_deps_HVY_DEPS}) - set(deps ${deps} ${var}) - endforeach(var) - endif() - - if (LITE_WITH_OPENCL) - foreach(var ${lite_deps_CL_DEPS}) - set(deps ${deps} ${var}) - endforeach(var) - endif() - - if (LITE_WITH_FPGA) - foreach(var ${lite_deps_FPGA_DEPS}) - set(deps ${deps} ${var}) - endforeach(var) - endif() - - if (LITE_WITH_NPU) - foreach(var ${lite_deps_NPU_DEPS}) - set(deps ${deps} ${var}) - endforeach(var) - endif() - - set(${TARGET} ${deps} PARENT_SCOPE) -endfunction() - - -# A fake target to include all the libraries and tests the lite module depends. -add_custom_target(lite_compile_deps COMMAND echo 1) - -# Add names for lite libraries for latter compile. We use this name list to avoid compiling -# the whole fluid project to accelerate the compile speed. -set(offline_lib_registry_file "${CMAKE_BINARY_DIR}/lite_libs.txt") -file(WRITE ${offline_lib_registry_file} "") # clean - -# cc_library with branch support. -# The branches: -# X86_DEPS: works only when LITE_WITH_X86 is ON. 
-# CUDA_DEPS: LITE_WITH_CUDA -# ARM_DEPS: LITE_WITH_ARM -# PROFILE_DEPS: LITE_WITH_PROFILE -# LIGHT_DEPS: LITE_WITH_LIGHT_WEIGHT_FRAMEWORK -# HVY_DEPS: NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK -# EXCLUDE_COMPILE_DEPS: TARGET will not be included in lite_compile_deps if this is not None -function(lite_cc_library TARGET) - set(options SHARED shared STATIC static MODULE module) - set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS NPU_DEPS ARM_DEPS FPGA_DEPS PROFILE_DEPS LIGHT_DEPS - HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) - cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - - set(deps "") - lite_deps(deps - DEPS ${args_DEPS} - X86_DEPS ${args_X86_DEPS} - CUDA_DEPS ${args_CUDA_DEPS} - CL_DEPS ${args_CL_DEPS} - NPU_DEPS ${args_NPU_DEPS} - ARM_DEPS ${args_ARM_DEPS} - FPGA_DEPS ${args_FPGA_DEPS} - PROFILE_DEPS ${args_PROFILE_DEPS} - LIGHT_DEPS ${args_LIGHT_DEPS} - HVY_DEPS ${args_HVY_DEPS} - ) - - if (args_SHARED OR ARGS_shared) - cc_library(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ${args_DEPS} SHARED) - elseif (args_MODULE OR ARGS_module) - add_library(${TARGET} MODULE ${args_SRCS}) - add_dependencies(${TARGET} ${deps} ${args_DEPS}) - else() - cc_library(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ${args_DEPS}) - endif() - target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers) - - # collect targets need to compile for lite - if (args_SRCS AND NOT args_EXCLUDE_COMPILE_DEPS) - add_dependencies(lite_compile_deps ${TARGET}) - endif() - - # register a library name. - file(APPEND ${offline_lib_registry_file} "${TARGET}\n") -endfunction() - -function(lite_cc_binary TARGET) - set(options "") - set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS PROFILE_DEPS - LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) - cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - - set(deps "") - lite_deps(deps - DEPS ${args_DEPS} - X86_DEPS ${args_X86_DEPS} - CUDA_DEPS ${args_CUDA_DEPS} - CL_DEPS ${args_CL_DEPS} - ARM_DEPS ${args_ARM_DEPS} - FPGA_DEPS ${args_FPGA_DEPS} - PROFILE_DEPS ${args_PROFILE_DEPS} - LIGHT_DEPS ${args_LIGHT_DEPS} - HVY_DEPS ${args_HVY_DEPS} - ) - cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ${args_DEPS}) - target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers) - # collect targets need to compile for lite - if (NOT args_EXCLUDE_COMPILE_DEPS) - add_dependencies(lite_compile_deps ${TARGET}) - endif() -endfunction() - -# Add a unit-test name to file for latter offline manual test. -set(offline_test_registry_file "${CMAKE_BINARY_DIR}/lite_tests.txt") -file(WRITE ${offline_test_registry_file} "") # clean -# Test lite modules. 
- -function(lite_cc_test TARGET) - if(NOT WITH_TESTING) - return() - endif() - set(options "") - set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS PROFILE_DEPS - LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS - ARGS - COMPILE_LEVEL # (basic|extra) - ) - cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - - if (args_COMPILE_LEVEL STREQUAL "extra" AND (NOT LITE_BUILD_EXTRA)) - MESSAGE(STATUS "Ignore test ${TARGET} due to compile level ${args_COMPILE_LEVEL}") - return() - endif() - - set(deps "") - lite_deps(deps - DEPS ${args_DEPS} - X86_DEPS ${args_X86_DEPS} - CUDA_DEPS ${args_CUDA_DEPS} - CL_DEPS ${args_CL_DEPS} - ARM_DEPS ${args_ARM_DEPS} - FPGA_DEPS ${args_FPGA_DEPS} - PROFILE_DEPS ${args_PROFILE_DEPS} - LIGHT_DEPS ${args_LIGHT_DEPS} - HVY_DEPS ${args_HVY_DEPS} - ) - _lite_cc_test(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ARGS ${args_ARGS}) - target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers) - file(APPEND ${offline_test_registry_file} "${TARGET}\n") - - # collect targets need to compile for lite - if (NOT args_EXCLUDE_COMPILE_DEPS) - add_dependencies(lite_compile_deps ${TARGET}) - endif() -endfunction() - -set(arm_kernels CACHE INTERNAL "arm kernels") -set(x86_kernels CACHE INTERNAL "x86 kernels") -set(fpga_kernels CACHE INTERNAL "fpga kernels") -set(npu_kernels CACHE INTERNAL "npu kernels") -set(opencl_kernels CACHE INTERNAL "opencl kernels") -set(host_kernels CACHE INTERNAL "host kernels") - -set(kernels_src_list "${CMAKE_BINARY_DIR}/kernels_src_list.txt") -file(WRITE ${kernels_src_list} "") # clean -# add a kernel for some specific device -# device: one of (Host, ARM, X86, NPU, FPGA, OPENCL, CUDA) -# level: one of (basic, extra) -function(add_kernel TARGET device level) - set(options "") - set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS PROFILE_DEPS - LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS - ARGS) - cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - - if ("${level}" STREQUAL "extra" AND (NOT LITE_BUILD_EXTRA)) - return() - endif() - - if (LITE_ON_MODEL_OPTIMIZE_TOOL) - # the source list will collect for model_optimize_tool to fake kernel generation. - foreach(src ${args_SRCS}) - file(APPEND ${kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") - endforeach() - return() - endif() - - # when compiling the model_optimize_tool, a source file with all the fake kernel definitions will be generated, - # no need to continue the compilation of the true kernel source. 
- if (LITE_ON_MODEL_OPTIMIZE_TOOL) - return() - endif(LITE_ON_MODEL_OPTIMIZE_TOOL) - - - if ("${device}" STREQUAL "Host") - set(host_kernels "${host_kernels};${TARGET}" CACHE INTERNAL "") - endif() - if ("${device}" STREQUAL "ARM") - if (NOT LITE_WITH_ARM) - return() - endif() - set(arm_kernels "${arm_kernels};${TARGET}" CACHE INTERNAL "") - endif() - if ("${device}" STREQUAL "X86") - if (NOT LITE_WITH_X86) - return() - endif() - set(x86_kernels "${x86_kernels};${TARGET}" CACHE INTERNAL "") - endif() - if ("${device}" STREQUAL "NPU") - if (NOT LITE_WITH_NPU) - return() - endif() - set(npu_kernels "${npu_kernels};${TARGET}" CACHE INTERNAL "") - endif() - if ("${device}" STREQUAL "FPGA") - if (NOT LITE_WITH_FPGA) - return() - endif() - set(fpga_kernels "${fpga_kernels};${TARGET}" CACHE INTERNAL "") - endif() - if ("${device}" STREQUAL "OPENCL") - if (NOT LITE_WITH_OPENCL) - return() - endif() - set(opencl_kernels "${opencl_kernels};${TARGET}" CACHE INTERNAL "") - endif() - - # the source list will collect for paddle_use_kernel.h code generation. - foreach(src ${args_SRCS}) - file(APPEND ${kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") - endforeach() - - lite_cc_library(${TARGET} SRCS ${args_SRCS} - DEPS ${args_DEPS} - X86_DEPS ${args_X86_DEPS} - CUDA_DEPS ${args_CUDA_DEPS} - CL_DEPS ${args_CL_DEPS} - ARM_DEPS ${args_ARM_DEPS} - FPGA_DEPS ${args_FPGA_DEPS} - PROFILE_DEPS ${args_PROFILE_DEPS} - LIGHT_DEPS ${args_LIGHT_DEPS} - HVY_DEPS ${args_HVY_DEPS} - ) -endfunction() - -set(ops CACHE INTERNAL "ops") -set(ops_src_list "${CMAKE_BINARY_DIR}/ops_src_list.txt") -file(WRITE ${ops_src_list} "") # clean -# add an operator -# level: one of (basic, extra) -function(add_operator TARGET level) - set(options "") - set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS PROFILE_DEPS - LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS - ARGS) - cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - - if ("${level}" STREQUAL "extra" AND (NOT LITE_BUILD_EXTRA)) - return() - endif() - - set(ops "${ops};${TARGET}" CACHE INTERNAL "source") - - foreach(src ${args_SRCS}) - file(APPEND ${ops_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") - endforeach() - - lite_cc_library(${TARGET} SRCS ${args_SRCS} - DEPS ${args_DEPS} - X86_DEPS ${args_X86_DEPS} - CUDA_DEPS ${args_CUDA_DEPS} - CL_DEPS ${args_CL_DEPS} - ARM_DEPS ${args_ARM_DEPS} - FPGA_DEPS ${args_FPGA_DEPS} - PROFILE_DEPS ${args_PROFILE_DEPS} - LIGHT_DEPS ${args_LIGHT_DEPS} - HVY_DEPS ${args_HVY_DEPS} - ) -endfunction() - - -# Bundle several static libraries into one. 
-function(bundle_static_library tgt_name bundled_tgt_name fake_target)
-  list(APPEND static_libs ${tgt_name})
-
-  function(_recursively_collect_dependencies input_target)
-    set(_input_link_libraries LINK_LIBRARIES)
-    get_target_property(_input_type ${input_target} TYPE)
-    if (${_input_type} STREQUAL "INTERFACE_LIBRARY")
-      set(_input_link_libraries INTERFACE_LINK_LIBRARIES)
-    endif()
-    get_target_property(public_dependencies ${input_target} ${_input_link_libraries})
-    foreach(dependency IN LISTS public_dependencies)
-      if(TARGET ${dependency})
-        get_target_property(alias ${dependency} ALIASED_TARGET)
-        if (TARGET ${alias})
-          set(dependency ${alias})
-        endif()
-        get_target_property(_type ${dependency} TYPE)
-        if (${_type} STREQUAL "STATIC_LIBRARY")
-          list(APPEND static_libs ${dependency})
-        endif()
-
-        get_property(library_already_added
-          GLOBAL PROPERTY _${tgt_name}_static_bundle_${dependency})
-        if (NOT library_already_added)
-          set_property(GLOBAL PROPERTY _${tgt_name}_static_bundle_${dependency} ON)
-          _recursively_collect_dependencies(${dependency})
-        endif()
-      endif()
-    endforeach()
-    set(static_libs ${static_libs} PARENT_SCOPE)
-  endfunction()
-
-  _recursively_collect_dependencies(${tgt_name})
-
-  list(REMOVE_DUPLICATES static_libs)
-
-  set(bundled_tgt_full_name
-    ${CMAKE_BINARY_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}${bundled_tgt_name}${CMAKE_STATIC_LIBRARY_SUFFIX})
-
-  #message(STATUS "bundled_tgt_full_name: ${bundled_tgt_full_name}")
-
-  if(NOT IOS)
-    file(WRITE ${CMAKE_BINARY_DIR}/${bundled_tgt_name}.ar.in
-      "CREATE ${bundled_tgt_full_name}\n" )
-
-    foreach(tgt IN LISTS static_libs)
-      file(APPEND ${CMAKE_BINARY_DIR}/${bundled_tgt_name}.ar.in
-        "ADDLIB $<TARGET_FILE:${tgt}>\n")
-    endforeach()
-
-    file(APPEND ${CMAKE_BINARY_DIR}/${bundled_tgt_name}.ar.in "SAVE\n")
-    file(APPEND ${CMAKE_BINARY_DIR}/${bundled_tgt_name}.ar.in "END\n")
-
-    file(GENERATE
-      OUTPUT ${CMAKE_BINARY_DIR}/${bundled_tgt_name}.ar
-      INPUT ${CMAKE_BINARY_DIR}/${bundled_tgt_name}.ar.in)
-
-    set(ar_tool ${CMAKE_AR})
-    if (CMAKE_INTERPROCEDURAL_OPTIMIZATION)
-      set(ar_tool ${CMAKE_CXX_COMPILER_AR})
-    endif()
-
-    add_custom_command(
-      COMMAND ${ar_tool} -M < ${CMAKE_BINARY_DIR}/${bundled_tgt_name}.ar
-      OUTPUT ${bundled_tgt_full_name}
-      COMMENT "Bundling ${bundled_tgt_name}"
-      VERBATIM)
-  else()
-    foreach(lib ${static_libs})
-      set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
-    endforeach()
-    add_custom_command(
-      COMMAND /usr/bin/libtool -static -o ${bundled_tgt_full_name} ${libfiles}
-      OUTPUT ${bundled_tgt_full_name}
-      )
-  endif()
-
-  add_custom_target(${fake_target} ALL DEPENDS ${bundled_tgt_full_name})
-  add_dependencies(${fake_target} ${tgt_name})
-
-  add_library(${bundled_tgt_name} STATIC IMPORTED)
-  set_target_properties(${bundled_tgt_name}
-    PROPERTIES
-      IMPORTED_LOCATION ${bundled_tgt_full_name}
-      INTERFACE_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:${tgt_name},INTERFACE_INCLUDE_DIRECTORIES>)
-  add_dependencies(${bundled_tgt_name} ${fake_target})
-
-endfunction()
diff --git a/cmake/lite_utils.cmake b/cmake/lite_utils.cmake
deleted file mode 100644
index f07ea85936..0000000000
--- a/cmake/lite_utils.cmake
+++ /dev/null
@@ -1,56 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# ---------------------------------------------------------------------------- -# section: Provides an paddle lite config option macro -# usage: lite_option(var "help string to describe the var" [if or IF (condition)]) -# ---------------------------------------------------------------------------- -macro(lite_option variable description value) - set(__value ${value}) - set(__condition "") - set(__varname "__value") - foreach(arg ${ARGN}) - if(arg STREQUAL "IF" OR arg STREQUAL "if") - set(__varname "__condition") - else() - list(APPEND ${__varname} ${arg}) - endif() - endforeach() - unset(__varname) - if(__condition STREQUAL "") - set(__condition 2 GREATER 1) - endif() - - if(${__condition}) - if(__value MATCHES ";") - if(${__value}) - option(${variable} "${description}" ON) - else() - option(${variable} "${description}" OFF) - endif() - elseif(DEFINED ${__value}) - if(${__value}) - option(${variable} "${description}" ON) - else() - option(${variable} "${description}" OFF) - endif() - else() - option(${variable} "${description}" ${__value}) - endif() - else() - unset(${variable} CACHE) - endif() - unset(__condition) - unset(__value) -endmacro() diff --git a/cmake/make_resource.py b/cmake/make_resource.py deleted file mode 100644 index 09a2ca877d..0000000000 --- a/cmake/make_resource.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import re -import sys - -res = sys.argv[1] -out = sys.argv[2] -var = re.sub(r'[ .-]', '_', os.path.basename(res)) - -open(out, "w").write("const unsigned char " + var + "[] = {" + ",".join([ - "0x%02x" % ord(c) for c in open(res).read() -]) + ",0};\n" + "const unsigned " + var + "_size = sizeof(" + var + ");\n") diff --git a/cmake/operators.cmake b/cmake/operators.cmake deleted file mode 100644 index c17e718f42..0000000000 --- a/cmake/operators.cmake +++ /dev/null @@ -1,227 +0,0 @@ -set(PART_CUDA_KERNEL_FILES) -function(op_library TARGET) - # op_library is a function to create op library. The interface is same as - # cc_library. But it handle split GPU/CPU code and link some common library - # for ops. 
- set(cc_srcs) - set(cu_srcs) - set(hip_cu_srcs) - set(miopen_hip_cc_srcs) - set(cu_cc_srcs) - set(cudnn_cu_cc_srcs) - set(CUDNN_FILE) - set(mkldnn_cc_srcs) - set(MKLDNN_FILE) - set(op_common_deps operator op_registry math_function) - set(options "") - set(oneValueArgs "") - set(multiValueArgs SRCS DEPS) - set(pybind_flag 0) - cmake_parse_arguments(op_library "${options}" "${oneValueArgs}" - "${multiValueArgs}" ${ARGN}) - - list(LENGTH op_library_SRCS op_library_SRCS_len) - if (${op_library_SRCS_len} EQUAL 0) - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc) - list(APPEND cc_srcs ${TARGET}.cc) - endif() - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc) - list(APPEND cu_cc_srcs ${TARGET}.cu.cc) - endif() - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu) - list(APPEND cu_srcs ${TARGET}.cu) - endif() - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu) - set(PART_CUDA_KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu - ${PART_CUDA_KERNEL_FILES} PARENT_SCOPE) - list(APPEND cu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu) - endif() - - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu) - list(APPEND hip_cu_srcs ${TARGET}.hip.cu) - endif() - string(REPLACE "_op" "_cudnn_op" CUDNN_FILE "${TARGET}") - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu.cc) - list(APPEND cudnn_cu_cc_srcs ${CUDNN_FILE}.cu.cc) - endif() - if(WITH_AMD_GPU) - string(REPLACE "_op" "_miopen_op" MIOPEN_FILE "${TARGET}") - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MIOPEN_FILE}.hip.cc) - list(APPEND miopen_hip_cc_srcs ${MIOPEN_FILE}.hip.cc) - endif() - endif() - if(WITH_MKLDNN) - string(REPLACE "_op" "_mkldnn_op" MKLDNN_FILE "${TARGET}") - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/mkldnn/${MKLDNN_FILE}.cc) - list(APPEND mkldnn_cc_srcs mkldnn/${MKLDNN_FILE}.cc) - endif() - endif() - else() - foreach(src ${op_library_SRCS}) - if (${src} MATCHES ".*\\.hip.cu$") - list(APPEND hip_cu_srcs ${src}) - elseif (${src} MATCHES ".*\\.cu$") - list(APPEND cu_srcs ${src}) - elseif(${src} MATCHES ".*_cudnn_op.cu.cc$") - list(APPEND cudnn_cu_cc_srcs ${src}) - elseif(WITH_AMD_GPU AND ${src} MATCHES ".*_miopen_op.hip.cc$") - list(APPEND miopen_hip_cc_srcs ${src}) - elseif(WITH_MKLDNN AND ${src} MATCHES ".*_mkldnn_op.cc$") - list(APPEND mkldnn_cc_srcs ${src}) - elseif(${src} MATCHES ".*\\.cu.cc$") - list(APPEND cu_cc_srcs ${src}) - elseif(${src} MATCHES ".*\\.cc$") - list(APPEND cc_srcs ${src}) - else() - message(FATAL_ERROR "${TARGET} Source file ${src} should only be .cc or .cu") - endif() - endforeach() - endif() - - list(LENGTH cc_srcs cc_srcs_len) - if (${cc_srcs_len} EQUAL 0) - message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file") - endif() - if (WIN32) - # remove windows unsupported op, because windows has no nccl, no warpctc such ops. 
- foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op") - if ("${TARGET}" STREQUAL "${windows_unsupport_op}") - return() - endif() - endforeach() - endif(WIN32) - set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} CACHE INTERNAL "op libs") - - list(LENGTH op_library_DEPS op_library_DEPS_len) - if (${op_library_DEPS_len} GREATER 0) - set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE) - endif() - if (WITH_GPU) - nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${mkldnn_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS} - ${op_common_deps}) - elseif (WITH_AMD_GPU) - hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cu_srcs} ${miopen_hip_cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS} - ${op_common_deps}) - else() - cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS} - ${op_common_deps}) - endif() - - # Define operators that don't need pybind here. - foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op" -"tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op" -"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op" "sync_batch_norm_op" "dgc_op") - if ("${TARGET}" STREQUAL "${manual_pybind_op}") - set(pybind_flag 1) - endif() - endforeach() - - # The registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h. - # Note that it's enough to just adding one operator to pybind in a *_op.cc file. - # And for detail pybind information, please see generated paddle/pybind/pybind.h. - file(READ ${TARGET}.cc TARGET_CONTENT) - string(REGEX MATCH "REGISTER_OPERATOR\\(.*REGISTER_OPERATOR\\(" multi_register "${TARGET_CONTENT}") - string(REGEX MATCH "REGISTER_OPERATOR\\([a-z0-9_]*," one_register "${multi_register}") - if (one_register STREQUAL "") - string(REPLACE "_op" "" TARGET "${TARGET}") - else () - string(REPLACE "REGISTER_OPERATOR(" "" TARGET "${one_register}") - string(REPLACE "," "" TARGET "${TARGET}") - endif() - - # pybind USE_NO_KERNEL_OP - # HACK: if REGISTER_OP_CPU_KERNEL presents the operator must have kernel - string(REGEX MATCH "REGISTER_OP_CPU_KERNEL" regex_result "${TARGET_CONTENT}") - string(REPLACE "_op" "" TARGET "${TARGET}") - if (${pybind_flag} EQUAL 0 AND regex_result STREQUAL "") - file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(${TARGET});\n") - set(pybind_flag 1) - endif() - - # pybind USE_CPU_ONLY_OP - list(LENGTH cu_srcs cu_srcs_len) - list(LENGTH cu_cc_srcs cu_cc_srcs_len) - list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len) - list(LENGTH hip_cu_srcs hip_cu_srcs_len) - list(LENGTH miopen_hip_cc_srcs miopen_hip_cc_srcs_len) - if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0 AND - ${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0) - file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n") - set(pybind_flag 1) - endif() - - # pybind USE_OP_DEVICE_KERNEL for CUDNN - list(LENGTH cudnn_cu_cc_srcs cudnn_cu_cc_srcs_len) - if (WITH_GPU AND ${cudnn_cu_cc_srcs_len} GREATER 0) - if(${TARGET} STREQUAL "activation") - file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, CUDNN);\n") - else() - file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n") - endif() - endif() - - # pybind USE_OP_DEVICE_KERNEL for MIOPEN - if (WITH_AMD_GPU AND ${miopen_hip_cc_srcs_len} GREATER 0) - file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MIOPEN);\n") - endif() - - # pybind USE_OP_DEVICE_KERNEL for MKLDNN - if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0) - # Append first implemented MKLDNN activation operator - if (${MKLDNN_FILE} 
STREQUAL "activation_mkldnn_op") - file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, MKLDNN);\n") - elseif(${MKLDNN_FILE} STREQUAL "conv_mkldnn_op") - file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32);\n") - file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, S8);\n") - file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, U8);\n") - - else() - file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n") - endif() - endif() - - # pybind USE_OP - if (${pybind_flag} EQUAL 0) - # NOTE(*): activation use macro to regist the kernels, set use_op manually. - if(${TARGET} STREQUAL "activation") - file(APPEND ${pybind_file} "USE_OP(relu);\n") - elseif(${TARGET} STREQUAL "fake_dequantize") - file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n") - elseif(${TARGET} STREQUAL "fake_quantize") - file(APPEND ${pybind_file} "USE_OP(fake_quantize_abs_max);\n") - elseif(${TARGET} STREQUAL "tensorrt_engine_op") - message(STATUS "Pybind skips [tensorrt_engine_op], for this OP is only used in inference") - elseif(${TARGET} STREQUAL "fc") - # HACK: fc only have mkldnn and cpu, which would mismatch the cpu only condition - file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n") - else() - file(APPEND ${pybind_file} "USE_OP(${TARGET});\n") - endif() - endif() -endfunction() - - -function(register_operators) - set(options "") - set(oneValueArgs "") - set(multiValueArgs EXCLUDES DEPS) - cmake_parse_arguments(register_operators "${options}" "${oneValueArgs}" - "${multiValueArgs}" ${ARGN}) - - file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc") - string(REPLACE "_mkldnn" "" OPS "${OPS}") - string(REPLACE ".cc" "" OPS "${OPS}") - list(REMOVE_DUPLICATES OPS) - list(LENGTH register_operators_DEPS register_operators_DEPS_len) - - foreach(src ${OPS}) - list(FIND register_operators_EXCLUDES ${src} _index) - if (${_index} EQUAL -1) - if (${register_operators_DEPS_len} GREATER 0) - op_library(${src} DEPS ${register_operators_DEPS}) - else() - op_library(${src}) - endif() - endif() - endforeach() -endfunction() diff --git a/cmake/package.cmake b/cmake/package.cmake deleted file mode 100644 index 79e02147f3..0000000000 --- a/cmake/package.cmake +++ /dev/null @@ -1,21 +0,0 @@ -set(CPACK_PACKAGE_NAME paddle) -set(CPACK_PACKAGE_VERSION_MAJOR ${PADDLE_MAJOR_VERSION}) -set(CPACK_PACKAGE_VERSION_MINOR ${PADDLE_MINOR_VERSION}) -set(CPACK_PACKAGE_VERSION_PATCH ${PADDLE_PATCH_VERSION}) -set(CPACK_PACKAGE_VERSION ${PADDLE_VERSION}) -## DEB Settings -set(CPACK_DEBIAN_PACKAGE_NAME paddle) -set(CPACK_DEBIAN_PACKAGE_ARCHITECTURE amd64) -set(CPACK_DEBIAN_PACKAGE_MAINTAINER PaddlePaddle Dev ) -set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Paddle") -set(CPACK_PACKAGE_DESCRIPTION "") -set(CPACK_DEBIAN_PACKAGE_DEPENDS "libpython2.7-dev, libstdc++6, python-pip, curl, libgfortran3, python-pip-whl") -set(CPACK_DEBIAN_PACKAGE_SECTION Devel) -set(CPACK_DEBIAN_PACKAGE_VERSION ${PADDLE_VERSION}) -set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PADDLE_SOURCE_DIR}/paddle/scripts/deb/postinst") -#set(CPACK_GENERATOR "DEB") -# Start cpack -include (CMakePackageConfigHelpers) -include (CPack) - - diff --git a/cmake/simd.cmake b/cmake/simd.cmake deleted file mode 100644 index 566dc75fda..0000000000 --- a/cmake/simd.cmake +++ /dev/null @@ -1,99 +0,0 @@ -# This file is use to check all support level of AVX on your machine -# so that PaddlePaddle can unleash the vectorization power of muticore. 
- -include(CheckCXXSourceRuns) -include(CheckCXXSourceCompiles) - -if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") - set(MMX_FLAG "-mmmx") - set(SSE2_FLAG "-msse2") - set(SSE3_FLAG "-msse3") - set(AVX_FLAG "-mavx") - set(AVX2_FLAG "-mavx2") - set(AVX512F_FLAG "-mavx512f") -elseif(MSVC) - set(MMX_FLAG "/arch:MMX") - set(SSE2_FLAG "/arch:SSE2") - set(SSE3_FLAG "/arch:SSE3") - SET(AVX_FLAG "/arch:AVX") - SET(AVX2_FLAG "/arch:AVX2") -endif() - -set(CMAKE_REQUIRED_FLAGS_RETAINED ${CMAKE_REQUIRED_FLAGS}) - -# Check MMX -set(CMAKE_REQUIRED_FLAGS ${MMX_FLAG}) -set(MMX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) -CHECK_CXX_SOURCE_RUNS(" -#include -int main() -{ - _mm_setzero_si64(); - return 0; -}" MMX_FOUND) - -# Check SSE2 -set(CMAKE_REQUIRED_FLAGS ${SSE2_FLAG}) -set(SSE2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) -CHECK_CXX_SOURCE_RUNS(" -#include -int main() -{ - _mm_setzero_si128(); - return 0; -}" SSE2_FOUND) - -# Check SSE3 -set(CMAKE_REQUIRED_FLAGS ${SSE3_FLAG}) -set(SSE3_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) -CHECK_CXX_SOURCE_RUNS(" -#include -int main() -{ - __m128d a = _mm_set1_pd(6.28); - __m128d b = _mm_set1_pd(3.14); - __m128d result = _mm_addsub_pd(a, b); - result = _mm_movedup_pd(result); - return 0; -}" SSE3_FOUND) - -# Check AVX -set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG}) -set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) -CHECK_CXX_SOURCE_RUNS(" -#include -int main() -{ - __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f); - __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f); - __m256 result = _mm256_add_ps (a, b); - return 0; -}" AVX_FOUND) - -# Check AVX 2 -set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG}) -set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) -CHECK_CXX_SOURCE_RUNS(" -#include -int main() -{ - __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4); - __m256i result = _mm256_abs_epi32 (a); - return 0; -}" AVX2_FOUND) - -# Check AVX512F -set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG}) -set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) -CHECK_CXX_SOURCE_RUNS(" -#include -int main() -{ - __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4, - 13, -5, 6, -7, 9, 2, -6, 3); - __m512i result = _mm512_abs_epi32 (a); - return 0; -}" AVX512F_FOUND) - -set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED}) -mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND) diff --git a/cmake/system.cmake b/cmake/system.cmake deleted file mode 100644 index ba00df928a..0000000000 --- a/cmake/system.cmake +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Detects the OS and sets appropriate variables. 
-# CMAKE_SYSTEM_NAME only give us a coarse-grained name of the OS CMake is -# building for, but the host processor name like centos is necessary -# in some scenes to distinguish system for customization. -# -# for instance, protobuf libs path is /lib64 -# on CentOS, but /lib on other systems. - -IF(WIN32) - SET(HOST_SYSTEM "win32") -ELSE(WIN32) - IF(APPLE) - SET(HOST_SYSTEM "macosx") - EXEC_PROGRAM(sw_vers ARGS -productVersion OUTPUT_VARIABLE HOST_SYSTEM_VERSION) - STRING(REGEX MATCH "[0-9]+.[0-9]+" MACOS_VERSION "${HOST_SYSTEM_VERSION}") - IF(NOT DEFINED $ENV{MACOSX_DEPLOYMENT_TARGET}) - # Set cache variable - end user may change this during ccmake or cmake-gui configure. - SET(CMAKE_OSX_DEPLOYMENT_TARGET ${MACOS_VERSION} CACHE STRING - "Minimum OS X version to target for deployment (at runtime); newer APIs weak linked. Set to empty string for default value.") - ENDIF() - IF(ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux" - OR ARM_TARGET_OS STREQUAL "ios" OR ARM_TARGET_OS STREQUAL "ios64") - ELSE() - set(CMAKE_EXE_LINKER_FLAGS "-framework CoreFoundation -framework Security") - ENDIF() - ELSE(APPLE) - - IF(EXISTS "/etc/issue") - FILE(READ "/etc/issue" LINUX_ISSUE) - IF(LINUX_ISSUE MATCHES "CentOS") - SET(HOST_SYSTEM "centos") - ELSEIF(LINUX_ISSUE MATCHES "Debian") - SET(HOST_SYSTEM "debian") - ELSEIF(LINUX_ISSUE MATCHES "Ubuntu") - SET(HOST_SYSTEM "ubuntu") - ELSEIF(LINUX_ISSUE MATCHES "Red Hat") - SET(HOST_SYSTEM "redhat") - ELSEIF(LINUX_ISSUE MATCHES "Fedora") - SET(HOST_SYSTEM "fedora") - ENDIF() - - STRING(REGEX MATCH "(([0-9]+)\\.)+([0-9]+)" HOST_SYSTEM_VERSION "${LINUX_ISSUE}") - ENDIF(EXISTS "/etc/issue") - - IF(EXISTS "/etc/redhat-release") - FILE(READ "/etc/redhat-release" LINUX_ISSUE) - IF(LINUX_ISSUE MATCHES "CentOS") - SET(HOST_SYSTEM "centos") - ENDIF() - ENDIF(EXISTS "/etc/redhat-release") - - IF(NOT HOST_SYSTEM) - SET(HOST_SYSTEM ${CMAKE_SYSTEM_NAME}) - ENDIF() - - ENDIF(APPLE) -ENDIF(WIN32) - -# query number of logical cores -CMAKE_HOST_SYSTEM_INFORMATION(RESULT CPU_CORES QUERY NUMBER_OF_LOGICAL_CORES) - -MARK_AS_ADVANCED(HOST_SYSTEM CPU_CORES) - -MESSAGE(STATUS "Found Paddle host system: ${HOST_SYSTEM}, version: ${HOST_SYSTEM_VERSION}") -MESSAGE(STATUS "Found Paddle host system's CPU: ${CPU_CORES} cores") - -# external dependencies log output -SET(EXTERNAL_PROJECT_LOG_ARGS - LOG_DOWNLOAD 0 # Wrap download in script to log output - LOG_UPDATE 1 # Wrap update in script to log output - LOG_CONFIGURE 1 # Wrap configure in script to log output - LOG_BUILD 0 # Wrap build in script to log output - LOG_TEST 1 # Wrap test in script to log output - LOG_INSTALL 0 # Wrap install in script to log output -) diff --git a/cmake/tensorrt.cmake b/cmake/tensorrt.cmake deleted file mode 100644 index 3bf12094e4..0000000000 --- a/cmake/tensorrt.cmake +++ /dev/null @@ -1,38 +0,0 @@ -if(NOT WITH_GPU) - return() -endif() - -set(TENSORRT_ROOT "/usr" CACHE PATH "TENSORRT ROOT") -find_path(TENSORRT_INCLUDE_DIR NvInfer.h - PATHS ${TENSORRT_ROOT} ${TENSORRT_ROOT}/include - $ENV{TENSORRT_ROOT} $ENV{TENSORRT_ROOT}/include - NO_DEFAULT_PATH -) - -find_library(TENSORRT_LIBRARY NAMES libnvinfer.so libnvinfer.a - PATHS ${TENSORRT_ROOT} ${TENSORRT_ROOT}/lib - $ENV{TENSORRT_ROOT} $ENV{TENSORRT_ROOT}/lib - NO_DEFAULT_PATH - DOC "Path to TensorRT library.") - -if(TENSORRT_INCLUDE_DIR AND TENSORRT_LIBRARY) - if(WITH_DSO) - set(TENSORRT_FOUND ON) - endif(WITH_DSO) -else() - set(TENSORRT_FOUND OFF) -endif() - -if(TENSORRT_FOUND) - file(READ ${TENSORRT_INCLUDE_DIR}/NvInfer.h 
TENSORRT_VERSION_FILE_CONTENTS) - string(REGEX MATCH "define NV_TENSORRT_MAJOR +([0-9]+)" TENSORRT_MAJOR_VERSION - "${TENSORRT_VERSION_FILE_CONTENTS}") - string(REGEX REPLACE "define NV_TENSORRT_MAJOR +([0-9]+)" "\\1" - TENSORRT_MAJOR_VERSION "${TENSORRT_MAJOR_VERSION}") - - message(STATUS "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. " - "Current TensorRT version is v${TENSORRT_MAJOR_VERSION}. ") - include_directories(${TENSORRT_INCLUDE_DIR}) - link_directories(${TENSORRT_LIBRARY}) - add_definitions(-DPADDLE_WITH_TENSORRT) -endif() diff --git a/cmake/util.cmake b/cmake/util.cmake deleted file mode 100644 index 02667dbce6..0000000000 --- a/cmake/util.cmake +++ /dev/null @@ -1,55 +0,0 @@ -# Some common routine for paddle compile. - -# target_circle_link_libraries -# Link libraries to target which has circle dependencies. -# -# First Argument: target name want to be linked with libraries -# Rest Arguments: libraries which link together. -function(target_circle_link_libraries TARGET_NAME) - if(APPLE) - set(LIBS) - set(inArchive OFF) - set(libsInArgn) - - foreach(arg ${ARGN}) - if(${arg} STREQUAL "ARCHIVE_START") - set(inArchive ON) - elseif(${arg} STREQUAL "ARCHIVE_END") - set(inArchive OFF) - else() - if(inArchive) - list(APPEND LIBS "-Wl,-force_load") - endif() - list(APPEND LIBS ${arg}) - list(APPEND libsInArgn ${arg}) - endif() - endforeach() - if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang") - if(NOT IOS_ENABLE_BITCODE) - list(APPEND LIBS "-undefined dynamic_lookup") - endif() - endif() - list(REVERSE libsInArgn) - target_link_libraries(${TARGET_NAME} - ${LIBS} - ${libsInArgn}) - - else() # LINUX - set(LIBS) - - foreach(arg ${ARGN}) - if(${arg} STREQUAL "ARCHIVE_START") - list(APPEND LIBS "-Wl,--whole-archive") - elseif(${arg} STREQUAL "ARCHIVE_END") - list(APPEND LIBS "-Wl,--no-whole-archive") - else() - list(APPEND LIBS ${arg}) - endif() - endforeach() - - target_link_libraries(${TARGET_NAME} - "-Wl,--start-group" - ${LIBS} - "-Wl,--end-group") - endif() -endfunction() diff --git a/cmake/version.cmake b/cmake/version.cmake deleted file mode 100644 index 8bcc4ffe72..0000000000 --- a/cmake/version.cmake +++ /dev/null @@ -1,66 +0,0 @@ -# Get the latest git tag. 
-set(PADDLE_VERSION $ENV{PADDLE_VERSION})
-set(tmp_version "HEAD")
-set(TAG_VERSION_REGEX "[0-9]+\\.[0-9]+\\.[0-9]+(\\.(a|b|rc)\\.[0-9]+)?")
-set(COMMIT_VERSION_REGEX "[0-9a-f]+[0-9a-f]+[0-9a-f]+[0-9a-f]+[0-9a-f]+")
-# set(LATEST_PADDLE_VERSION "latest")
-set(LATEST_PADDLE_VERSION "0.0.0")
-
-while ("${PADDLE_VERSION}" STREQUAL "")
-  # Check current branch name
-  execute_process(
-    COMMAND ${GIT_EXECUTABLE} rev-parse --abbrev-ref ${tmp_version}
-    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
-    OUTPUT_VARIABLE GIT_BRANCH_NAME
-    RESULT_VARIABLE GIT_BRANCH_RESULT
-    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
-  if (NOT ${GIT_BRANCH_RESULT})
-    execute_process(
-      COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0 --always ${tmp_version}
-      WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
-      OUTPUT_VARIABLE GIT_TAG_NAME
-      RESULT_VARIABLE GIT_RESULT
-      ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
-    if (NOT ${GIT_RESULT})
-      # Check if current branch is release branch
-      if (${GIT_BRANCH_NAME} MATCHES "release/${TAG_VERSION_REGEX}")
-        # Check the tag is a correct version
-        if (${GIT_TAG_NAME} MATCHES "${COMMIT_VERSION_REGEX}")
-          # if no tag was found, set PADDLE_VERSION to "latest"
-          set(PADDLE_VERSION "${LATEST_PADDLE_VERSION}")
-        elseif (${GIT_TAG_NAME} MATCHES "v${TAG_VERSION_REGEX}")
-          string(REPLACE "v" "" PADDLE_VERSION ${GIT_TAG_NAME})
-        else() # otherwise, get the previous git tag name.
-          set(tmp_version "${GIT_TAG_NAME}~1")
-        endif()
-      else()
-        execute_process(
-          COMMAND ${GIT_EXECUTABLE} describe --exact-match --tags ${tmp_version}
-          WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
-          OUTPUT_VARIABLE GIT_EXACT_TAG_NAME
-          RESULT_VARIABLE GIT_EXACT_TAG_RESULT
-          ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
-        if (NOT ${GIT_EXACT_TAG_NAME})
-          # Check if current branch is tag branch
-          if (${GIT_EXACT_TAG_NAME} MATCHES "v${TAG_VERSION_REGEX}")
-            string(REPLACE "v" "" PADDLE_VERSION ${GIT_EXACT_TAG_NAME})
-          else()
-            set(PADDLE_VERSION "${LATEST_PADDLE_VERSION}")
-          endif()
-        else()
-          # otherwise, we always set PADDLE_VERSION to "latest"
-          set(PADDLE_VERSION "${LATEST_PADDLE_VERSION}")
-        endif()
-      endif()
-    else()
-      set(PADDLE_VERSION "${LATEST_PADDLE_VERSION}")
-      message(WARNING "Cannot add paddle version from git tag")
-    endif()
-  else()
-    set(PADDLE_VERSION "${LATEST_PADDLE_VERSION}")
-    message(WARNING "Cannot add paddle version for wrong git branch result")
-  endif()
-endwhile()
-
-add_definitions(-DPADDLE_VERSION=${PADDLE_VERSION})
-message(STATUS "Paddle version is ${PADDLE_VERSION}")
diff --git a/cpp_demo.md b/cpp_demo.md
new file mode 100644
index 0000000000..bfb3439998
--- /dev/null
+++ b/cpp_demo.md
@@ -0,0 +1,271 @@
+
+* [C++ Demo](#c-demo)
+  * [Compile](#compile)
+  * [Prepare the runtime environment](#prepare-the-runtime-environment)
+    * [Use an Android phone](#use-an-android-phone)
+    * [Use an Android emulator](#use-an-android-emulator)
+  * [Download the model and run the demo](#download-the-model-and-run-the-demo)
+  * [Demo output](#demo-output)
+  * [How to use the API in code](#how-to-use-the-api-in-code)
+  * [CxxConfig example: running the OCR model](#cxxconfig-example-running-the-ocr-model)
+
+
+
+# `C++` Demo
+
+## Compile
+
+First prepare the cross-compilation environment following [PaddleLite source compilation](https://github.com/PaddlePaddle/Paddle-Lite/wiki/source_compile), then pull the latest [PaddleLite release code](https://github.com/PaddlePaddle/Paddle-Lite). The walkthrough below uses the Android-ARMv8 architecture as an example and ends with the MobileNetV1 model running on a phone.
+
+Enter the Paddle-Lite directory and build with the following command (**the option `--build_extra=ON` is required for a complete build**):
+
+```shell
+./lite/tools/build.sh \
+  --arm_os=android \
+  --arm_abi=armv8 \
+  --arm_lang=gcc \
+  --android_stl=c++_static \
+  --build_extra=ON \
+  full_publish
+```
+
+After the build finishes, the folder `./build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/` contains:
+
+```bash
+cxx/include/
+cxx/lib/libpaddle_api_full_bundled.a
+cxx/lib/libpaddle_api_light_bundled.a
+demo/cxx/  # contains {include Makefile.def mobile_light}
+third_party/gflags/
+```
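+
+Before moving on, it can be worth a quick sanity check that the bundled static libraries really target 64-bit ARM. A small sketch using standard binutils (the paths follow the layout above; `readelf` may live in your NDK toolchain instead of your PATH):
+
+```bash
+cd build.lite.android.armv8.gcc/inference_lite_lib.android.armv8
+# every object in the archive should report "Machine: AArch64"
+readelf -h cxx/lib/libpaddle_api_full_bundled.a | grep Machine | sort -u
+```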
+
+## Prepare the runtime environment
+
+There are two ways to run the demo: on an Android phone, or, if no phone is at hand, in an Android emulator.
+
+### Option 1: use an Android phone
+
+Connect the phone to your computer, then on the phone enable Settings -> Developer options -> USB debugging. Make sure the device shows up in `adb devices`.
+
+### Option 2: use an Android emulator
+
+The commands below create Android armv8 and armv7 emulators respectively. To test on a real device instead, replace the emulator with a real device of the matching architecture.
+
+```shell
+# android-armv8
+adb kill-server
+adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done
+echo n | avdmanager create avd -f -n paddle-armv8 -k "system-images;android-24;google_apis;arm64-v8a"
+echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv8 -noaudio -no-window -gpu off -port 5554 &
+sleep 1m
+```
+
+```shell
+# android-armv7
+adb kill-server
+adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done
+echo n | avdmanager create avd -f -n paddle-armv7 -k "system-images;android-24;google_apis;armeabi-v7a"
+echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv7 -noaudio -no-window -gpu off -port 5554 &
+sleep 1m
+```
+
+## Download the model and run the demo
+
+```bash
+cd inference_lite_lib.android.armv8/demo/cxx/mobile_full
+wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz
+tar zxvf mobilenet_v1.tar.gz
+
+make
+
+adb -s emulator-5554 push mobilenet_v1 /data/local/tmp/
+adb -s emulator-5554 push mobilenetv1_full_api /data/local/tmp/
+adb -s emulator-5554 shell chmod +x /data/local/tmp/mobilenetv1_full_api
+adb -s emulator-5554 shell "/data/local/tmp/mobilenetv1_full_api --model_dir=/data/local/tmp/mobilenet_v1 --optimized_model_dir=/data/local/tmp/mobilenet_v1.opt"
+```
+Note: a demo of the light-weight API is also provided; run it with the following commands.
+
+```bash
+cd ../mobile_light
+make
+adb -s emulator-5554 push mobilenetv1_light_api /data/local/tmp/
+adb -s emulator-5554 shell chmod +x /data/local/tmp/mobilenetv1_light_api
+adb -s emulator-5554 shell "/data/local/tmp/mobilenetv1_light_api --model_dir=/data/local/tmp/mobilenet_v1.opt --threads=1"
+```
+## Demo output
+On success the demo prints a sample of the predicted class probabilities (every 100th of the 1000 classes) to the console:
+
+```bash
+Output dim: 1000
+Output[0]: 0.000191
+Output[100]: 0.000160
+Output[200]: 0.000264
+Output[300]: 0.000211
+Output[400]: 0.001032
+Output[500]: 0.000110
+Output[600]: 0.004829
+Output[700]: 0.001845
+Output[800]: 0.000202
+Output[900]: 0.000586
+```
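+
+If you want the single most likely class instead of sampled scores, the `output_tensor` obtained in the API walkthrough below can be scanned for its maximum. A minimal sketch (assuming the 1000-float output shown above; mapping the class id to a label is up to you):
+
+```cpp
+// Scan the 1000 class scores for the most probable class.
+const float* scores = output_tensor->data<float>();
+int top1 = 0;
+for (int i = 1; i < 1000; ++i) {
+  if (scores[i] > scores[top1]) top1 = i;
+}
+printf("top-1 class id: %d, prob: %f\n", top1, scores[top1]);
+```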
+
+## How to use the API in code
+
+Using the PaddleLite API in C++ is simple and needs very little extra code:
+
+- Include the headers
+
+```cpp
+ #include <memory>
+ #include <utility>
+ #include "paddle_api.h"
+ #include "paddle_use_kernels.h"
+ #include "paddle_use_ops.h"
+ #include "paddle_use_passes.h"
+```
+
+- Configure a MobileConfig with the model location (model_dir), the number of threads, and the power mode; set the input data, create a PaddlePredictor from the MobileConfig, and run the prediction. (Note: Lite can also load the model straight from memory, via MobileConfig::set_model_buffer.)
+
+Example code:
+```cpp
+  // 1. Create MobileConfig
+  MobileConfig config;
+
+  // 2. Load model
+  config.set_model_dir("path to your model directory"); // model dir
+  /* load model: Lite supports loading the model from a file or from memory
+     (a naive buffer built from the optimized model)
+     Method one: load the model from memory:
+       void set_model_buffer(const char* model_buffer,
+                             size_t model_buffer_size,
+                             const char* param_buffer,
+                             size_t param_buffer_size)
+     Method two: load the model from a file:
+       void set_model_dir(const std::string& model_dir) */
+
+  // 3. Set MobileConfig (or you can skip this step to use default values):
+  config.set_power_mode(LITE_POWER_HIGH); // power mode
+  /* power modes: Lite supports the following power modes
+       LITE_POWER_HIGH
+       LITE_POWER_LOW
+       LITE_POWER_FULL
+       LITE_POWER_NO_BIND
+       LITE_POWER_RAND_HIGH
+       LITE_POWER_RAND_LOW */
+  config.set_threads(1); // number of threads
+
+  // 4. Create PaddlePredictor by MobileConfig
+  std::shared_ptr<PaddlePredictor> predictor =
+      CreatePaddlePredictor<MobileConfig>(config);
+
+  // 5. Prepare input data
+  std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(0)));
+  input_tensor->Resize({1, 3, 224, 224});
+  auto* data = input_tensor->mutable_data<float>();
+  for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
+    data[i] = 1;
+  }
+
+  // 6. Run predictor
+  predictor->Run();
+
+  // 7. Get output
+  std::unique_ptr<const Tensor> output_tensor(
+      std::move(predictor->GetOutput(0)));
+```
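+
+Step 5 above uses a small `ShapeProduction` helper that is defined in the OCR sample below; for reference, this is all it does (multiply the dimensions of a shape to get the element count):
+
+```cpp
+int64_t ShapeProduction(const shape_t& shape) {
+  int64_t res = 1;
+  for (auto i : shape) res *= i;
+  return res;
+}
+```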
+
+## CxxConfig example: running the OCR model
+
+1. OCR model files:
+   - a trained [ocr_attention_model](https://paddle-inference-dist.cdn.bcebos.com/ocr_attention.tar.gz) in Pb format is available for download
+   - alternatively, train the model yourself from the [Paddle/models project](https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/ocr_recognition)
+
+2. Example code:
+```c++
+#include <gflags/gflags.h>
+#include <stdio.h>
+#include <vector>
+#include "paddle_api.h"          // NOLINT
+#include "paddle_use_kernels.h"  // NOLINT
+#include "paddle_use_ops.h"      // NOLINT
+#include "paddle_use_passes.h"   // NOLINT
+using namespace paddle::lite_api;  // NOLINT
+
+DEFINE_string(model_dir, "", "Model dir path.");
+DEFINE_bool(prefer_int8_kernel, false, "Prefer to run model with int8 kernels");
+
+int64_t ShapeProduction(const shape_t& shape) {
+  int64_t res = 1;
+  for (auto i : shape) res *= i;
+  return res;
+}
+
+void RunModel() {
+  // 1. Set CxxConfig
+  CxxConfig config;
+  config.set_model_dir(FLAGS_model_dir);
+  std::vector<Place> valid_places{Place{TARGET(kARM), PRECISION(kFloat)},
+                                  Place{TARGET(kHost), PRECISION(kFloat)}};
+  config.set_preferred_place(Place{TARGET(kARM), PRECISION(kFloat)});
+  config.set_valid_places(valid_places);
+
+  // 2. Create PaddlePredictor by CxxConfig
+  std::shared_ptr<PaddlePredictor> predictor =
+      CreatePaddlePredictor<CxxConfig>(config);
+
+  // 3. Prepare input data
+  // input 0: the image
+  std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(0)));
+  input_tensor->Resize(shape_t({1, 1, 48, 512}));
+  auto* data = input_tensor->mutable_data<float>();
+  for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
+    data[i] = 1;
+  }
+  // input 1: the initial ids of the beam search
+  std::unique_ptr<Tensor> init_ids(std::move(predictor->GetInput(1)));
+  init_ids->Resize(shape_t({1, 1}));
+  auto* data_ids = init_ids->mutable_data<float>();
+  for (int i = 0; i < ShapeProduction(init_ids->shape()); ++i) {
+    data_ids[i] = 0;
+  }
+  lod_t lod_i{{0, 1}, {0, 1}};
+  init_ids->SetLoD(lod_i);
+  // input 2: the initial scores of the beam search
+  std::unique_ptr<Tensor> init_scores(std::move(predictor->GetInput(2)));
+  init_scores->Resize(shape_t({1, 1}));
+  auto* data_scores = init_scores->mutable_data<float>();
+  for (int i = 0; i < ShapeProduction(init_scores->shape()); ++i) {
+    data_scores[i] = 0;
+  }
+  lod_t lod_s{{0, 1}, {0, 1}};
+  init_scores->SetLoD(lod_s);
+
+  // 4. Run predictor
+  predictor->Run();
+
+  // 5. Get output
+  std::unique_ptr<const Tensor> output_tensor(
+      std::move(predictor->GetOutput(0)));
+  for (int i = 0; i < ShapeProduction(output_tensor->shape()); i++) {
+    printf("Output[%d]: %f\n", i, output_tensor->data<float>()[i]);
+  }
+}
+
+int main(int argc, char** argv) {
+  google::ParseCommandLineFlags(&argc, &argv, true);
+  RunModel();
+  return 0;
+}
+```
+3. How to run:
+Build the code above into an executable, here called `OCR_DEMO`; the model folder is `ocr_attention`. Connect the phone to the computer in USB-debugging / file-transfer mode,
+then run the OCR model test with the following commands in a terminal:
+```
+# OCR_DEMO is the executable built above; ocr_attention is the folder of the ocr_attention model
+adb push OCR_DEMO /data/local/tmp
+adb push ocr_attention /data/local/tmp
+adb shell 'cd /data/local/tmp && ./OCR_DEMO --model_dir=./ocr_attention'
+```
+4. Result:
+
+ 
\ No newline at end of file
diff --git a/cxx_api.md b/cxx_api.md
new file mode 100644
index 0000000000..a05b2d3d69
--- /dev/null
+++ b/cxx_api.md
@@ -0,0 +1,63 @@
+# C++ API usage guide
+
+First make sure Lite builds correctly by following [source compilation](./source_compile); the walkthrough below uses Lite's C++ interface to load and run the MobileNetV1 model.
+
+## Prepare the model
+
+Lite runs models trained with PaddlePaddle. A MobileNetV1 model can be obtained in three ways:
+
+- download the pre-trained [MobileNetV1 model](https://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz) directly
+- build and train the MobileNetV1 network with [PaddlePaddle](https://paddlepaddle.org.cn/)
+- convert a Caffe or TensorFlow MobileNetV1 model with [X2Paddle](./x2paddle)
+
+## Optimize the model
+
+Optimize the model with the Model Optimize Tool so that prediction performs well. See the [documentation](./model_optimize_tool) for the details of the tool:
+
+- prepare model_optimize_tool
+- optimize the model with model_optimize_tool
+- obtain the optimized model, consisting of a __model__.nb file and a param.nb file
+
+## Load the model
+
+Load the MobileNetV1 model and create the predictor; see ```paddlelite/lite/api/model_test.cc``` for a complete reference.
+```c++
+lite::DeviceInfo::Init();
+lite::DeviceInfo::Global().SetRunMode(lite::LITE_POWER_HIGH, thread_num);
+lite_api::MobileConfig config;
+config.set_model_dir(model_dir);
+
+auto predictor = lite_api::CreatePaddlePredictor<lite_api::MobileConfig>(config);
+```
+
+## Set the input
+
+Get the input tensor and fill in the input values; here every element is set to 1.
+
+```cpp
+// get a handle to the j-th input tensor
+auto input_tensor = predictor->GetInput(j);
+input_tensor->Resize(input_shapes[j]);
+
+// get the data pointer to fill in the data
+auto input_data = input_tensor->mutable_data<float>();
+int input_num = 1;
+for (int i = 0; i < input_shapes[j].size(); ++i) {
+  input_num *= input_shapes[j][i];
+}
+for (int i = 0; i < input_num; ++i) {
+  input_data[i] = 1.f;
+}
+```
+
+## Run and get the output
+
+```cpp
+predictor->Run();
+auto out = predictor->GetOutput(0);
+LOG(INFO) << "dims " << out->dims();
+LOG(INFO) << "out data size: " << out->data_size();
+```
+
+The output is ```dims dims{1000,}, out data size: 1000```.
+
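+
+To inspect the actual output values rather than just the shape, the data pointer can be read the same way as in the OCR sample above — a short sketch (assuming a float output tensor):
+
+```cpp
+// Print the first ten of the 1000 class scores.
+const float* out_data = out->data<float>();
+for (int i = 0; i < 10; ++i) {
+  LOG(INFO) << "out[" << i << "] = " << out_data[i];
+}
+```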
diff --git a/debug_tools.md b/debug_tools.md
new file mode 100644
index 0000000000..b904fdcd71
--- /dev/null
+++ b/debug_tools.md
@@ -0,0 +1,77 @@
+# Debug tools
+
+The **Lite Model Debug Tool** is the basic tool for checking diffs between the runtime tensors (both variables and weights) of the Paddle-Lite framework and the Paddle-Fluid framework.
+
+## Workflow
+
+1. Run `/bin/bash check_model.sh --model_dir=<model_dir> --build_root_dir=<build_root_dir> debug_cpp_stage` to collect the model's runtime topology, variables, and weights under the Paddle-Lite framework. The topology is stored in a file named `topo_file.txt` by default; the variable and weight data in a file named `tensor_cpp.txt` by default.
+2. Run `/bin/bash check_model.sh --model_dir=<model_dir> --build_root_dir=<build_root_dir> debug_py_stage` to run inference with the Fluid framework and collect the same model's variable and weight data under Fluid (note: the Fluid model is executed through Fluid's Python API, so make sure that API is installed correctly before this step). The debug tool then automatically compares the Paddle-Lite output with the Paddle-Fluid output to check for runtime diffs. The Fluid-side data is stored in a file named `tensor_py.txt` by default, and the diff information in a file named `diff.txt` by default (by default only the first variable in topological order that shows a diff is reported).
+
+## Notes
+
+1. The reported values are the final values of each variable/weight **after one full prediction**, so if optimizations such as variable reuse or subgraph fusion are applied during prediction, the corresponding output may deviate.
+2. By default the debug tool compares using all-ones input.
+3. By default, to keep the results comparable with Paddle-Fluid, the debug tool disables all of Paddle-Lite's optimization strategies.
+4. The execution environment of Paddle-Lite depends on your compile options; for example, if you enabled LITE_WITH_ARM, the tool's `debug_cpp_stage` also has to run on an ARM platform.
+
+## Diff output
+
+If the debug tool detects a diff, `diff.txt` contains information of the following form:
+
+```c++
+>>>>>>>>>>>>>>>>>>DIFF VARIABLE: dropout_0.tmp_0<<<<<<<<<<<<<<<<<<<
+dropout (X:pool2d_7.tmp_0) (Mask:dropout_0.tmp_1 Out:dropout_0.tmp_0)
+--------------- Tensor File info ---------------
+pool2d_7.tmp_0 {1,1536,1,1} 0.749892 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0150336 0.621641 0.147099 0.636727 0.0 0.0 0.00410917 0.784708 0.0 0.0704846 0.233599 0.840123 0.239201 0.112878 0.0 0.155352 0.306906 0.0 0.0 0.860938 0.221037 0.787316 0.256585 ...
+dropout_0.tmp_0 {1,1536,1,1} 0.749892 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0150336 0.621641 0.147099 0.636727 0.0 0.0 0.00410917 0.784708 0.0 0.0704846 0.233599 0.840123 0.239201 0.112878 0.0 0.155352 0.306906 0.0 0.0 0.860938 0.221037 0.787316 0.256585 ...
+--------------- Fluid Tensor info ---------------
+pool2d_7.tmp_0 {1,1536,1,1} 0.7498912 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.015033395 0.6216395 0.14709876 0.63672537 0.0 0.0 0.0041093696 0.7847073 0.0 0.07048465 0.23359808 0.8401219 0.23919891 0.1128789 0.0 0.1553514 0.3069055 0.0 0.0 0.8609365 0.22103554 ...
+dropout_0.tmp_0 {1,1536,1,1} 0.599913 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.012026716 0.4973116 0.117679015 0.5093803 0.0 0.0 0.0032874958 0.62776583 0.0 0.056387722 0.18687847 0.67209756 0.19135913 0.090303116 0.0 0.12428112 0.2455244 0.0 0.0 0.68874925 ...
+```
+
+The second line names the op where the diff occurred together with its input/output variables. "Tensor File info" is the Paddle-Lite output, and "Fluid Tensor info" is the corresponding Paddle-Fluid output.
+`dropout_0.tmp_1` in the example has no tensor info because the tool detected that it is not used later in the prediction; it therefore cannot affect the result and is filtered out automatically to keep the output compact.
+
+## Other options
+
+| Option | Description |
+| --------------------------- | ------------------------------------------------------------ |
+| --input_file | Input file name; fields are separated by commas and values within a field by spaces, and only the first line of the file is used. If input_file is not given, all inputs are set to 1. Note: `debug_py_stage` does not support multi-field input yet. |
+| --cpp_topo_file | Stores the runtime topology; written by `debug_cpp_stage` and read by `debug_py_stage`. Defaults to `topo_file.txt`. |
+| --cpp_tensor_file | Stores the output of `debug_cpp_stage` in topological order. Defaults to `tensor_cpp.txt`. |
+| --tensor_names | If non-empty, only the variables/weights whose names appear in this comma-separated list are dumped. |
+| --tensor_output_length | Length of the dumped data. Defaults to dumping everything. |
+| --py_threshold | Threshold for deciding that a diff occurred. Defaults to `1e-5`. |
+| --py_tensor_file | Stores the output of `debug_py_stage` in topological order. Defaults to `tensor_py.txt`. |
+| --py_output_file | File that stores the diff information. Defaults to `diff.txt`. |
+| --py_only_output_first_diff | Whether to report only the first var/op in topological order that has a diff. Defaults to true. |
+
+See the code of the `check_model.sh` script for more details.
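+
+Putting the options together, a typical invocation that dumps only two named tensors might look like this (all paths here are placeholders):
+
+```bash
+# stage 1: run the model under Paddle-Lite and dump the named tensors
+/bin/bash check_model.sh \
+  --model_dir=/path/to/model \
+  --build_root_dir=/path/to/build.lite.android.armv8.gcc \
+  --tensor_names=pool2d_7.tmp_0,dropout_0.tmp_0 \
+  --tensor_output_length=32 \
+  debug_cpp_stage
+
+# stage 2: run the same model under Fluid and diff the results against stage 1
+/bin/bash check_model.sh \
+  --model_dir=/path/to/model \
+  --build_root_dir=/path/to/build.lite.android.armv8.gcc \
+  debug_py_stage
+```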
+
+## Basic Profiler
+
+The basic profiler collects per-kernel timing statistics on the CPU; pass `-DLITE_WITH_PROFILER=ON` at cmake time to enable it.
+
+After the model finishes executing, a profiler log like the following is printed automatically:
+
+```
+ kernel                       average    min    max    count
+ feed/def/1/4/2                     0      0      0        1
+ conv2d/def/4/1/1                1175   1175   1175        1
+ conv2d/def/4/1/1                1253   1253   1253        1
+ depthwise_conv2d/def/4/1/1       519    519    519        1
+ conv2d/def/4/1/1                 721    721    721        1
+ elementwise_add/def/4/1/1         18     18     18        1
+ conv2d/def/4/1/1                2174   2174   2174        1
+ depthwise_conv2d/def/4/1/1       380    380    380        1
+ conv2d/def/4/1/1                 773    773    773        1
+ elementwise_add/def/4/1/1          2      2      2        1
+ conv2d/def/4/1/1                1248   1248   1248        1
+ depthwise_conv2d/def/4/1/1       492    492    492        1
+ conv2d/def/4/1/1                1150   1150   1150        1
+ elementwise_add/def/4/1/1         33     33     33        1
+ elementwise_add/def/4/1/1          3      3      3        1
+ conv2d/def/4/1/1                1254   1254   1254        1
+ depthwise_conv2d/def/4/1/1       126    126    126        1
+```
+
diff --git a/for-developer.md b/for-developer.md
new file mode 100644
index 0000000000..8c01f6e1e0
--- /dev/null
+++ b/for-developer.md
@@ -0,0 +1,15 @@
+# Basics
+
+See the [Paddle developer documentation](https://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/advanced_usage/development/contribute_to_paddle/local_dev_guide.html).
+
+# Submitting a PR
+
+The commit message must contain `test=develop` to trigger CI (see the commit sketch at the end of this page).
+
+# Release checklist
+
+1. Go through every feature and confirm its status
+2. Go through all QA test results and confirm the release is reliable
+3. Confirm the release note has passed review
+4. Confirm the binaries to be released have been built
+
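+For example, a hypothetical commit and push might look like:
+
+```bash
+git checkout -b my-fix
+git commit -am "Fix conv2d output shape check. test=develop"  # the marker triggers CI
+git push origin my-fix  # then open the pull request on GitHub
+```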
diff --git a/fpga.md b/fpga.md
new file mode 100644
index 0000000000..fdb48a26bf
--- /dev/null
+++ b/fpga.md
@@ -0,0 +1,107 @@
+# Lite model prediction on FPGA
+
+Paddle Lite supports model prediction on ARM-based FPGA parts (ZU3/ZU5/ZU9) and provides an armv8 cross-compilation build.
+
+Running models on the FPGA requires the matching FPGA driver; currently only the Baidu Edgeboard development boards are supported.
+
+**Overview of FPGA support in Lite**
+
+Lite can use the FPGA as a backend for model inference, with the following properties:
+
+- Lite's FPGA kernels (except feed and fetch) take FP16 NHWC tensors as input and output, while all weights and biases stay FP32 NCHW; the feed input and fetch output are FP32 NCHW data, so the speedup comes without the user having to care about data formats
+
+- kernels the FPGA does not support yet fall back to the ARM side, giving mixed ARM+FPGA deployment
+
+- FPGA cost and power consumption are currently low, and Lite's FPGA model performance is far better than the ARM side, making it a good first choice for edge devices
+# Compile
+
+You need an FPGA board with fpgadrv.ko available (e.g. an Edgeboard) and the Lite source code.
+
+CMake build options:
+
+- set `LITE_WITH_FPGA=ON` and `LITE_WITH_ARM=ON`
+
+The other build options are the same as for an ARM build; see ["ARM compilation of Paddle Lite under Docker"](./source_compile).
+An example:
+```shell
+    cmake .. \
+        -DWITH_GPU=OFF \
+        -DWITH_MKL=OFF \
+        -DWITH_LITE=ON \
+        -DLITE_WITH_CUDA=OFF \
+        -DLITE_WITH_X86=OFF \
+        -DLITE_WITH_ARM=ON \
+        -DLITE_WITH_OPENMP=ON \
+        -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \
+        -DWITH_TESTING=ON \
+        -DLITE_WITH_FPGA=ON \
+        -DARM_TARGET_OS=armlinux
+    make -j2
+```
+Lite also ships an FPGA build script at lite/tools/build_fpga.sh; running it from the Lite root performs the build.
+
+# Run the example
+
+- **Prepare the files**
+
+The steps below use the ResNet50 model as an example of running a model on an Edgeboard.
+
+```bash
+# connect to the board and start a session with screen [run on the host]
+screen /dev/cu.SLAB_USBtoUART 115200
+# find the board's IP and log in via ssh; assume the IP is 192.0.1.1 [run on the host]
+ssh root@192.0.1.1
+
+# on the board, create a workspace directory and copy the FPGA driver fpgadrv.ko into it [run on the board]
+mkdir workspace && scp $DRIVER_PATH/fpgadrv.ko workspace
+
+# copy the test binary built by Lite to the board's workspace directory [run on the host]
+scp $LITE_ROOT/build_fpga/lite/api/test_resnet50_fpga root@$EDGEBOARD_IP:workspace/
+# copy the ResNet50 model and parameters to the board's workspace directory [run on the host]
+scp -r $LITE_ROOT/build_fpga/lite/third_party/install/resnet50/ root@$EDGEBOARD_IP:workspace/
+
+# load the FPGA driver before running the model [run on the board]
+insmod fpgadrv.ko
+# make the test binary executable [run on the board]
+chmod +x test_resnet50_fpga
+```
+
+- **Run model prediction on the FPGA**
+
+```bash
+# all commands below run on the board
+# run the single test directly
+./test_resnet50_fpga --model_dir=resnet50
+# to measure performance, set the number of runs with repeats (e.g. 1000)
+# and use warmup runs (e.g. 10) to let the hardware reach a steady state first
+./test_resnet50_fpga --model_dir=resnet50 --repeats=1000 --warmup=10
+```
+
+# How to use it in code
+
+Using the FPGA in Lite is similar to ARM, with the following differences:
+
+- FPGA kernels run in FP16 precision with NHWC layout, so `valid_place` and `preferred_place` must be set accordingly
+- the FPGA needs no device initialization or run-mode setup
+
+Example code:
+```cpp
+lite::Predictor predictor;
+std::vector<Place> valid_places(
+    {Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)},
+     Place{TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNHWC)}});
+Place preferred_place = Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)};
+
+predictor.Build(model_dir, preferred_place, valid_places);
+
+auto* input_tensor = predictor.GetInput(0);
+input_tensor->Resize(DDim(std::vector<int64_t>({1, 3, 224, 224})));
+auto* data = input_tensor->mutable_data<float>();
+auto item_size = input_tensor->dims().production();
+// fill the input with all ones
+for (int i = 0; i < item_size; i++) {
+  data[i] = 1;
+}
+
+predictor.Run();
+auto* out = predictor.GetOutput(0);
+```
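+
+The snippet above uses the internal `lite::Predictor` class. If you prefer the stable `CxxConfig` API from the C++ demo, the same FPGA placement can presumably be expressed as follows (a sketch under that assumption):
+
+```cpp
+CxxConfig config;
+config.set_model_dir("path to your fpga model");
+// prefer FP16/NHWC kernels on the FPGA and fall back to the host elsewhere
+config.set_valid_places({Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)},
+                         Place{TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNHWC)}});
+config.set_preferred_place(Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)});
+auto predictor = CreatePaddlePredictor<CxxConfig>(config);
+```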
diff --git a/images/architecture.jpg b/images/architecture.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..0e6caa88a932553e212cbd899515ef4f5366839a
GIT binary patch
[binary image data omitted]

diff --git a/images/benchmark_result.png b/images/benchmark_result.png
new file mode 100644
index 0000000000000000000000000000000000000000..d991fefc7ec3436381eeacc515aace2d688ff13f
GIT binary patch
[binary image data omitted]
zU9F5bXGG6Gbqp2)oYvkVttWJ`eE*Q@y-f&({lTPanMq6PyU8L1 z7U|+VsehjfeF37-)g3{i9|Wa?kxx8cI9LYT z`aH&P@n9Gq=|2dvRlL_qWOkKSs;ir$X_17(X#3QH6BnTc>a8I*U)K*Y!0D#|kk>T5 zPd{KiPj7bRU2pm}`4Pi!Hg=g5KsMAekVL}9sM0V3S0@Un=1mf+)s{@%*3P+U2Od`N z<{hjDvu1?0R=sBkXB1o)Dg<~AJ_^X^)y7M2%ev;pPOQ#j@bJ|bzc6gFEc95+wxlDM z$4$pGixmrXx;a!yq8nop?02uCx+8kJR#ME_FEgG85^OZ4y6x`#X4?98&Dvq$@yRA; zzL$rq&$py*PHW`Hj|`0TR_8>eGy4ymPFCnr9)D9`>xx?lp8@0)3f)MIndJ zdJVH=#q!yF7qO|6(T`fEmT-UZ$MZsr?r+es5?9i`8SVN zPEch+zxN(Fr*||>)10*Bd$}OaSerPEV(dTqZi%wzLSW+iK`fG+Bf)_~$vxZb3dT3C zMH89EE15PuIuPOYCho^=i4CK9=%ea95IoC3AGS{K5G7P8UqB;c##J09fRV&biQf)i zexDE-go2q4YH5^XQpy*6Te2c(G^{PRFwMu}Ld^9-Bj3Ex$Uh0PQ{xW6f$etA$E8}2 z>!Sh6gNM1aTDyHbHM(*-aJ)gAB&BFE3wA&2%VrLY>tco2k1XS&mO6AoEV@Gd*luG^S zN#(7`J88a_UjP_vpUfikj-6k=gy-(jf4mrI>o{0W4QfJ^7O9Qp&<>mUXa}%pVgy_& z%bVq+9zd6s$F1`Q(u5kUX)(2iESPy-N3K^JT)%c|xdhm@Z}e~$ihzA-m(#3>0RdIL z^y9Yt6c4dln_2pdfK2@*$h0zgVMZ(92?%?I1jh?Omz(aC!h zsz=b9O-$H(vkN`c6z$4yA5sTW;8?{S&OF$HVf#RrG3xze=<@=TP~VX0t0>yujz9=i zhJxQjui@+QV!E8JkuPl?Cr8T_>cia)anV5ymbhK-tenTW4DotbBzDQF6St;(Z7~>4 z;poh;q%-&?1`LSqo`*d!or9dor7l=ph|*_^zI{i_(W?WhB4 z+Cew94-pmbdf?UiHnW!Ety!sWUHx?qJLZnnGEUhz=-0P-pxMsK1h(ID+l1tUTyc)@ zz+>j)ua`cFqdcI*?#72+%u(AE3t5oVZcH-zE(cL+mH~(G@rA(s?gpapOBHk$=wmth zX!KDlr&e-kpm_>!r(@F$NWfxB>32!(Q-S`rgYd`Rz3(WOxo8o6BI|eJy^2+zz=0VD~M#(;IK$ENvCyh~9p@PMSXI z`V*SU>fPb-6xG)2%~?IaiPcw8w~Q}5m58`@e+9%2PNrq$))?`9kR)^4UD_0#Xule9hi%3}LlYRlLiZ zG5JlE^NHQ&Jkk>Pby@Ja@`Jae(P1mK)Ku>v#zH2#`)uEnuFt?lU;N=W`dS|IbQ7_w zWB-9SS_eX~t(Q{2TZw;qz`o@68cAu&79GIW$R9VEaJ9Z5kiQXFRdaAhZltyn4QL%N z$F@6e!-RdT(0kgdm~!i4&4&7DPjYUehN#|PO%U=@>aJvIrMI#q9o7vLilhStw4Ert z53qx1UZSw}5SdMOq8+~AL^9$(;wMY<1@;LLR4tA z+v%?)>H{a3v5M=l*5^mZq`F*xcc8_-dM2bGX!mCFdSqp!T#++CW9}9G$8t!bEXzZG zOVhnoDwBENeV9?Qr}k+LOkGCl0gBWnV+0pk$3fhYd4oPU$8c3kn{WV&Oz~&~OYL$< zYpX$B`R5}rvoko{`be@sL@vI+A%!_<&y6$F(G#s7YP>$2_g2=j$d{0Q>Y(>2*kptA z-1Dd{N-?lfHbGZw+NI0e)0@UyxFZi}rJMCHT_$7^PL$5{ec}pKsR|`L5uct0k^N4W zEU3|+GU=(!pB`Al;q|Z}X>at$jdSWd8+73GGFCjz8ZwVMiH`aBl78)l zg1#h=yv-M!U6XJ&HXQuvrmpky6GP% zNhKYFXhEVziV@HLH2bwjO8P?u?t&$Omz?L&NfQDTzqJ}o@KQPLKpq8z9A=c59v$wm zL(La1k7iP|i6}+9R&ilLJa)4hoyj{e(E8|(&PsyDPMVfz?iO|-0t4_`uY?hJd6m2) zUc>Ts;dAJ%04dOh5McXUSxy0{@N%bYg1qy9jM{n?~8czfjG6+?0N>0 zYSy!RxTYRCB+;w<857pg5&k0_zu#pYhHukx0>gabYJy?D`qer77EHLS0@BEFAinLV zLJ1t7xdTpC0!DHTA0k9v!=`qOj78ss+!3}}OAp);27XTtmya_|)$>mald82_wtM

8i+YQe9-dEt~84^MB>30nN@ySCuj*E8<)U`<)J3 z^RgRiCwNrhaSprH%K4sjlsPG+Q{@sv%%Fh~tb>>IvFG!ef!%J8NCP3Ke#a!IXz)FP zm%lMG(%cv{khY1vPv880d=nbfo3VvLB&ifWkT1lbtBEv9b_3)5I`4Afde+eBvq*UW zk6k72bVn!2&CR<<(|-0G5Icy8f53&J53~4fJ~qKA8-lJ@pasc|lCZuuDlEZkb^}Mt zDFu#;`+@G{U5WZ*u9OF_>(&oqh`Mnj8;U$F!8i^%#Axo|Z&x0(fjm!3)%M}dN>N!G zvGT$=@X^BI+42p7E}Ju#SYFa^(ELUTdm^=o9U(b>atnUV;(#pWaoplAS*%`S#Tx|3 zB}YF_qdb{|?xOiRt7(;QK6BQHauP3#^w1G2b{c|*uf$&ZIppr!I#LK21UGEvx@|rB zjuQUH&mlU{`gs1#107(fUh3 zt;Y9>Xzr;vx3_Ra?-2{~GRk9kl+cxdkVNBbllcXmK*Yf~R(U8J}5G=4#UAdY4X>jUrn83(e)Do~>8Z*H$)f z4wR1U_kws5d^N;6olOVWFcS$_>W(ydHXc&X0pcgV-J*7+F~{N`P%NIGf7@3g`PuDW zL7wlE@l9?*jAS-zLA>8mg&KLE*5ZIqv4DuHD~UbaN((}v?EKO?{Tnae;vDS~nuOT{ z?%)x1>Xp2Um#z;A1|BU1=sFMW-E0+8jj@cj<{iCo3Y^Yr5wMTUsKO)P)p7?T*NV7xvO97hiC9Q??_M3X6)C(~dm%C%yS>965vJFNhr&{JbbQB4WU9bA)Em zS>!i{%MeV+<-1K4y62Abou|*8Xe2kQ8RP~~uiYSVr-~%A0Ky1n6~*!aVTPmL7xR zp85M-=xsGL55=!$_EHY2a090UAfypuT!Bz&gmg?mPO>G#eNdhgs8n-wcQM}dom(|I zw&D`~8Vx4mvJ2m`tHh+l8Hn_}=)@j8TH)qngXJxKj;D^UoRBv1zx=d-Ioya1xiWC2 zvOq`+VoS|K6ZO*SS95^tTCf@`Z#^^cPBp4er1A%=AIAW~=PGFi%h3&N=h5x-VN-2v z0xv%Z){}XJ%e?$KsU)=}E>Zy4RmXVY?E!OES62vlyPJwL{JRP=@xXC1{vJBIsRE}R z1*J+W)<6#T+@#HkY$6_L#zkJ{Dhn_y;9IbOdY`PjO+h+NK*l`c{*3%4o}fone90E5 z%8Sa@gMi8O8#7$Jy@%s~3YzmZk)FQcTp}@GFwgo=eu8UV=(F#yM@zXWOiAx;XbqnjNAQVoz9gzFVJC|jETX!e ze;1A#ol*?VJLVU376NvgBD@&FBuK-k`t*Ezd+k}pWZ0my|C9h@NNadp6U_YTbZv2!aExJeg)wCPF!=ApT^_5EWQzpwMMWEfZ zRQ8tVzH1&(88`*jn%2ovS$KrAj#VU8RvVB1qtOr!iXiNus#WAiZI^6 zt_=R0prIbgePm6#q#RWD&-Ca%RuptLwuK#GjKR}oDfz}FqmR!y&Qk{S!o zrf;`qZJLo~GRYNK|3}32j(OF0|CH*DzTgSN)1T8rE2*>KK*&T)bjSzP_DN3WEn<2Q z^gKIq6>JxEU_IJ{ahlV)$GU!~CIzE+Cx7^*h-NSrC~z$g){K-iGLM^~0^tEvUk$rwj^gL7`|FltKkYe$E$=!%xJ$==mDQ=Y&_rh3$ooN% z$+`rcbBAw-Q-I`|eS^6z(MMFtLGKQ2sc$&0%(R_6bXsB-3YUQ8(a(XPixx(a2gl3F z=4!2Ns?wDEDZaxzbmIFGjBvsSz?)M31(uij95%CHecX%9NtXXD>+Gx6;Usf*6`jcl z#K384tq-6TyN!1xV1_Iu#ypLZNY7$}Aq#rIz~|;GPe;1;3f~XSYwqy1?+`w(D|rZ( ze64$*+aO{Fv{EXr%+*{A{X0>pJ~>+IOT2tpWz(E=nhWiCS# zWt-73Ffe6(o7>oIszYR;qepgA)|P*Cy@p`_!4yOq92p1Phnikm>Tg}1I7EN>3BU>m_>Y-bZ51zFcQr-}+R?c&Q$5g@iR(? z=To*20PPn=1SiusI7N*{e)|SO5}{ z1bIL~b(U5@j-s)`vz95-`IW_d<29;*ZND+E=I={9oY$AtTf21va}!puiqJ4McGz^c zykD$wKWLYJWZ8Uobwez`R2jr{lDLKi5>3^r?ls4ojvf%w`=nu_1h#Go)|=8cG@^@! 
zB~GFKB?g5-fR2GILlqkMd{V&l+Ie>Bb;oFUmSKWi6+G9aeqLmt$?TlC@=#~B-LZhd z%5k0iixs*Z<@GrM3fEi=lDfMXuCZcMwZ-pDb38P3gl~leYoAYKDU!B5U`02L=cooI z1=~W)G{p0|#^EM^*?7cqJaT$Ppf2Ocjl%K@`YC;-lfI8g)h#)pZFD;3B|k-URzz-_ zEj1y1~+BH#C=8x%4H z1l{AccZTc|6{OW#=q?+4g7iH`UJ1wVFbizD>$b$Do3Q#o9m>I{5=ZNJCq%6cMQX5((&U{w-1;!GvO_wR{NU}MyoFKmbVESBjA*`k%+hs1XAL_ zLPH`Ok|@K6x#M-N`2JfOw=_8BKHloBWXFTcu*Jc36*jmhL#dLM>ruwqfxRqzg`vuu zpuuf24J8bsDv6J9t*uhx)8Cm5EIfhG^JG3bE+DepJI+mJWLP0K3waktiKr8OK0wcC zc;cJXmHpDE!yR(a%*>Bmo<`*TN;tfg6Pb6ALl@0Q+JP12snZ-t!AKM|HkIdpXLVM$ z_3ED4VjHn6lbw0Z?eevmu6TTZf5sh4V9yU`!v<_fqDKK950|)K_#>nGV=G>l=xBy= z>^*JCaEEQfC)&U+if+Luf_W9cfI9+{ z8&JHlo89Pm{)M#xO1r%njU@IPugy-j-Is3nd5_%}+lxNHkZXIw|FuW(rx1THZaDFs z_c|a;p%JD53DEMGjzYT>*-5`ywU-mW5^W&=axyR7-h^al9WqS+nY%a zPWyvwvqy`hHvUv~mMYr1(1q*$W#8IfeWN2%nHY@a7`jycmrX+@q}s};eArKacaM@vo8(q8wp#c#3bm16qiXoZ}P78M=*94Z6-5wK{4$vGNSfeiYc zhV8Vhg@#4KrJ81+2+N3%1#%*YIq?a;q1Dvr%8H1ux8>+3 zl$4Y-8>6-stTaLQEuy`(7lJMGp8t=tw+gFk>$-FkAi>>&y9W>M4nc!UaA)D}?hpvU z-GaLZ*Wm8%t_vsVnPl&;>ietCxjA?6AY-mEZH(UD-rCvhlFQlT4rmT2{kJ-jkxjX$ z>@9_r)|#ISLl)~4k*e(6dNHf6CtchPpmTeX!~^1_GzY6qatOriS_T-6Pxr3T_sN;Z zJFOw^RKQiwMe-zqg8wytHjM1O(s_>9Ox>t&h?xUcLy`#Ji`JLue zwqM84PM)_$4FbT@(P|7ai`**)XtEu+tUk79`keVNLD##xn9HYYE{@eFCHh=k(Ui?U z(h=~wcP-N&{qtFQpt5#O7mDb|L^+Tb8%CNnit4*{id3tFxp3JkIVdP7xo6M6V^TXP z&O#B8b7tW8`ib*)d|=CJIKmn8_7MkZylrq;Bk8MtIUg;#;srg6#K`B5<~Q>$=~Gqp znhHjb0v=R`YkADR$u*AgHd?C70L63`^+iUyCuKdL^!8{`=z3#*;nMM{C%N)LiJ=Ig zmB%}dTOOuOz!UH_gT6VIu}U#&g&L~~-ckg@632c?xwzqS7s;4g%=~(*QkGPT`?OSJ z`pK!2r70!hbPaX^?KSX$IFnc*^nuyppGE)SL~dU+>>9t>GAoOX65zB@1mQEqbOc+Y z4utI_LBmu;6UYJ0WlBz8f7!>gpYeAr-ltmPD*ZT|D+ihc>&fG~EjvE7s5L&Thf&_m zv9f9?`Gm1puc-Fmr9AWmLb>~y+8oUlXEVG!oV8Dt8kg+jC)S|Lg3p$lB?=f96DRh{ zmYQOZP4W%D$LquJ>)sRzJx&bd%r<)J)|SA|OrsH1x_;?e`j7{8Y(@1B}n za?pN9Ow{v1(>#fb3yPfE1E_Y5`<4^TY@{~WOkoQ=8E7e^dtQNNH{$tjM#No9k!F?{hSWF#8QTCg~%O&XJJT`ip*zV)dXA0rD|@C?kQ8(#1+xGaT8t9 zCiA=63g_j#yEMx=p6h+aSINJuUNmYFrvb`Lv8Grnc6%C^JXdzGN0<~Dw9*J|{ET_h zwPW?v+>Rc^pMmI_{fRBA4V}TMi!*s#S>qbn@aN|z1f*&(b(S|HOxr03?JaKR=!vFc z#f=qA4du~-%kfN%h%ymCWi5Z6Blrg)$+egj@c{2jS#&8mBD&SSgx2aP4E@IoprBDM zqg9YKy%}P1*aHdpTEW;;bw3=*mH9#W%TOE1|@O{+b#nlJPN!m+5Y06jANKFA& zgrgbq6f(J7VFaD>44I;J#cDf=%EozYzZ~h)3H-~)>#p$RX6|khrDU>bk3$WqY~NQ; zT`b14-MVFWR>XbvD^`_LRm#{IqAaEn4;uSDB$V#IZU!kq)6dp>3749P!KfN(p;{ut zhLaf{IfIzt+}aVPC?TK5o=V`z_v6M_hQsRXTE(jK2(Hp^dyT1IL+sYqJoy}YSjNxq z!8}D>QHtLkmS40Q&h~TarOj*(XN$|D^3H9De-y2&>EH^p(W)sr4S#K#w7Z|Je_O5e z>e8;4Q0hE#sQfLRO@H(4$eES1rhFey{0KXPPSfD_=;{E!>KG3M_4LM>GMjfI#*WQQ zP|4hWS03FVL2}QTddn6u_T3Wd@D~<=mBsLZ9dlrXS1b)JpOH^d!XZB0imZ!QkY*vY z{IYjs9_5szd6|b-Kki4?c(nD&!j{NFxjDh&tTt$f`r1AVvI-kYK=qE?E-t4*hcFvL z&6C3g6;q#TZ%(hc8x7=7MCTovoC72YR|Yt~qwRiK9CE|Fvos~5Xjhihe3kkpNPj%m@7k+Up zrIHWTFk|FB2z?DIrOL2m5+dNB`T_!Y?a}gR6^0j+CjQkJSe0 z06{X;v*{KxX{df?T|fGRI7LAm;doYxZKvS(5avKbJ)MB{?cwt1p87*~3x!1`y0rlF z)}J$*)*QL+TfCY#a*FLhyBAMk1$WEKos?$1bB)pj{=&;*5_XY=eZs{)ymf=NXu27! 
zcPZE}?x@>QQ5;QA#k)U(}(%SxgRX`yE|zxb=a~RE zp^q~u!&NB)rDC-Wt@EB3B~L*K_`M<`$UotXhl+1H2rqa+7}j>7RrWH=Ahi-<3fE8Y z3-xEjWoz})WvMSR{>VHMW+k5!X~l|U6gJG1qX?vcUpvG@DO+snwnM?N@&GIX;)qn0 zQ0iquba}iPK=1o`qpstev>-Pp+Ij3S8d?!nwor^RpE*;(-+2Z+cu&G5kzCH!3dZtB zVKryVk<3t@S&V<&BW^U@y|g_I%#`+(6E0A}V2DDkrgw}1$O!oO>sImt!YE^m+g=ph znwnqTzH79Qt!F1-R?gxT2^-M^*{x=QvUhJq(>|6x$ysSF8_cbpFUP z;dYjHlG_8Z1&-f293kL_xc9NW zH6yQIs@X1}Ew!eQ@mAvrlx7)VHV_2M@@ts>*~^>RP{()e4Q?^)bE9;k zIf2Bolt~HXd~3JBk$5lt+(L%>#?)g_@RdPD`C%#$Oe?p(oT>%*>)Y2pqIhn}-cXD` zzFBk!V+EGW86iQQDyEyxbWY^c%WNzp2zu||b;F+v0`3E#hO-L_OJUA3hO!#Tb1`oX zxy*K%PE!k=-JhlTQ+o{eo!l8jmQQH!gjPDBW`3G4CLmDx0axu1TKi>{KANZH*6x^Q z_2d`=wG8H}es@K+U{!aAv6H7Tx&DxESCr z6E^+69rU0&ovRH}nURlBs|+5|q0csVSnv($&Uz`a7Uw`xK=zWP2L) zo{O&?!$+j+h<`FKLi~;cDu9nwA`f4lQZ!<;uIhxfQ`Qy~#$fZ(*u{EW)@#fiiVTl- zr>l&Whz2H|zDl*rMa)|Mj7+^@-pcL_$;Zvdc)9b==cm^zlA=M%8YS}9!B?Q18|DIRm*BO$}Cg_c7IWcqfKPMwm&@b$UZTYVaX2DEoD!fs?y?MR2^=hj>s> zxl-#vWVLIAiekq9a(`pQ))eFxiKB&;YkczP2M%P#Vo=*te@SMytKo8-!KokM_zg74 z5H)Cwdd6@9bjXD}-(NaJm*_gjq8CWh+sCHdafZ-RHS1r016MXREcVQ)3Hyvd_?XrJ zdt$u#$mOF~Hu1j2BDxMa=9La<;~*KjGFz)ckAmgBw@rJ9TCh89e=ugw3&IIUOu>;^ zQl-_^VdIssUhoadl`jH&&JD#Wq0dgY)Km^!twX7kHdyE*>FuCRns;7azSx*V>? zYQEVwocMT2+WE2`&uqE(Rda#;Y_If$8x6^11GPp6q%>i}wpAT&`js;h4Um1lS<>%krmVh*NI?g~j)( zFqK&2Utu0(&T%t0O7}sU?d452Q(TP+s&TtmozV$I5OgDmV4jEl)PwrZ66jOvrP%8J zFF2sim`xmme}R#ggX{CdJUuj@G($@Lq_L^$KDoud#ER}HjjbA@UXkOgn)%HuLo`p$ z2k$|~*gjq_b4)YzaV*|pfEkICWfOj7MsnX*)ySuRG$rEcvZ^P@CR}3k5K7K?{6++& zGz6F~L~2!dRmwzGT>O*QSm69n?+{Uk=&qca<}@lBz;lK%+E{YV1(xx zDC0-eOQfy^9ZSxkrEtmXu!2v#{SNO-ja4-R%r-XE@OUIms=Ag7wSv~(@5|@=Bz@(e zbW`T7u;j1JDA7z(ej1rk`M@Jk*AEiEL!SxFZ5U}l2lVm04azM@5z>e!TNxIZ{J@$_ zf3UxqkkWFi2%YY}SQ4N20368?hni39cmutRV+`yvEsQt#`6u(CJ0}pg^-xkYYmi}+ z>iN6$@qR=`b1=n)iW) z5^qu$kv6M`B_IsAL0(Ub0PEzqTFu^`WUryu+SO#j`j+vG?+a;sJYIbf_II^w(M6;q zxBQb$03si#H$hfGjWq&tZ(^eG1x|QWKv1<)A^c#L*c*45RT2~J`(HYn#+U7DUb0q; zveGGY;yoD}mgO7d_t20mun)p?B(*$@4KQu>T)(r72bxCO@;9~&G4DS3u8R!Eu+{^4 zBbJnJNAkf_1W(aEs;>=*$ioVsbzGY>z!&6skh-Y)&2n~a^lq#q)*&di$ z>nai+eDzE7dxJqwECTx&a^SQZ_j#~-_7^t5n9M#krPiNuWzCNG=-b9e+lu)=yFIuyjJ8s~W8FSr8#C?Kpe8Q@hfxYfNafemj@)^q%B{UUL2NJhX1* zZ(i1ot8!>LXUbAe?7r{3U-o2$c%N)c8DR>C=sdx^t$f};KhM5nU5&RKH24f4H?dR6 z`1H!&q5Mgo^=uO{mmBojh)mjBH5(}Iw3ix89@alDWlJ(#=`L(fmAI3d2dr>WcJ{@E zA!(}zpI{2Chv4{2a`(w+DjnV3vM&X7N=;jcpJTmH$HxAMYhCj3irE?7;u)yU(S7Ai zF8j8yX5lCBb@ZS~`vrXN7uZ@z2mj$rD$Zn7NBPY}IIEMOAwyg(R3fCnE(F|zsHXKt zBHq%hDSm8UlM_$Qo2hL?MZo7j9=A%ilpUi)cp+<(O#Nk;Yo?n<{FCZzgq3N=|ArsFr|t4NB;nNA;$9^K$%q$mshsJC^jY8@G_y7u|d#F&DebJm%a1 z3=IG1=B)NQbphK)40`Qm*pj#Y3n2}=)|412-r4Ko#55<7a4N12UDpKB6J!wYG|xIXP@ZHdz=Q7JQdeF?Yn zMPURVOw~a;SxzYHM=ZDvo235Gf)daK&VjF5+X$xxI_;hK*Bz!gn1gSngSN#^EKe-V zp(ZN{>mpO**>16+M`G$UktnUjRGY!-a)8iauwet~4gu}M<~udK=5zpM!!z?u|WuDSG&nGcuJectkIg$_LFQ~r_5v6?^~K%o5L3_%x|jlb}_ zc140~*@gNkp) z(&_EaRHoULFJ^1d09^QwW`D9zp%(>8owqrg zE$ShOP`XRI0fHofCQN?mggLk!;%7cd>!x04)uT+XZ7Ue@8fWXP+FnYgYowbpFkuz; z0qY{E;DWi{l6)R9WR?d^%N6oH;BCveWM~Dd1#;ZAm~3X;xxEidn*#qgCnADM4wpR4 z=;ryINj<5PQrDV2IeVJU7;q#F70h+7GoSWdEj|JV2vxp=`aIsFV`#vA4o!A9!F#sr zaf|?sn>6wkIt}oLKn2#m7Z6s?e$31Zp+S`K_BY28qao*;ALCQV`uiLYbES+8 zD5e>Gl(?S|aGnw$F5ZX^j#F zKi3C(+*!4@kD0b;ZuvBaL+{4YoX2s6I6WOv-{FoWJ#BeicN zw2ocSjo`gw2{K7GL>nARI3~C94^TIk$G&bKPhlV9(x-g75+km@Cwk8_=Kk>mf6BVn z7_9M<#TV2;E^A{0F>u*j5Uer*(?Ok8r3|maAJmS)52e&*%G(iEV2L0iucj0egOqg>^En8`s+8pE3j1L zq>UDa_{o1F$@vRBKzp%@h3xf|4*PgNx2_#WYB0<9n+B+gurmI9;wU6PnUFq_OL{EwQW=;_j}EBHvLFQNIYI0%f%#E^fDHh^H3IZ%26> z;3@v4=kB10?t(LkvBp3N8{cUK~Sko_X2AefLR1sCK1DMloW1;Cl?3oKV8;` zcyG|-=ePzGxNd%hK(n#nYZkbc0rAz=qiOba;`bACHCpaM;4f!*Y4$iV5bMN|cSF`2 
z8J{a1sQKOdWuTY1jEZLTrA`*ZY;}`#Y8Q>&#lWq=Zsdf3Q^{aW?qu4!8JsqI;k-l) zLv-;GxzgW?9>yE6JDVva#PuRL894nabNg*8a&WQAf@|sM3~@qdt(+gI9Kb-l2)_}9 zw(HygUB8o%bOS7g+mTEvoHpk-S^<{iTh-A22TNkW^U9K#fBz3lLV9VNemH|+e(!uL z{J9${{TxE~eS@>c+Q1AMuz%UoD=`OR*h*ff zV$H02I5~LkoIcg|PS}|9+n3QDOQ}G1tH zLDK{u8zZ0Z9Mf21(slOPamJpmo3Xgs84^!aIkD?mz@QBxj>jx|@f z9W8z%9N8&!vK%DchK!chJD5l(Ez1?m$1fR**&MocoATdui3W&22wLE-(jUrlQ~f$| z0d|NSqJh(^RcTpYI}<4k_7XSlcehWH?!GFoqgyL}uTtB{2LTxVJ7~`2l+>>+X;@6( z?Rg~rTFUS_SR_#tvq6R0bza*Fkcw_5sNNR=Up%;p6 z-A@-f)5lL>Y^0OC`?k)p4iypl1-X-lD@aym^uh%?98GV!RRpxN7Pxhn;mr#2qxTal zoS?;zv3#jgatv52yFFc@f?_Tl6m25t%MPhP-?n$>9xG&Y@ah9fpD$w%HseY+kX&68 zv7@?8#lrTaU-m9}VWhT`bFmL+auO_06@MutT;?mqQHVai4U>TR;#t75P_WG)5lH(R z5{nwLnQ6`!2M9pOA>wq`(!6)7vwGTNY|lJSL{(gqTx0(@~*Xp4V6 zaAN=bppRBZoHmyh$L1JMB*Vi@Mm0~+u;lVDb8;H=%AA<9|7A{&{xT=t0awnu$a);l z0kfas4Oad?n3Ko2sl;Yk3AGZ%FdgHQ2pZcTeBI7*o!gks#bsX_6)Q>K_6d^^zoi1i zo2)mazBtwY!Y6I30mA|02QI9`th3f@VnA4;qOq`B+v|CJi|{OTAxqy{Vb|gh{$@VM zYNpp9XW=45cDMiOxEcC(KPP8d4{yyVkTh5~7VEoD;P1Bnzy-PCuMp0CRcF-Dz3NRS zu_;zgw7Itu(%^GMB(X{Lg_|jVjM*FLQTBuH8P?aneR?K_FZ!lS9u?80xnik)r>c?7xSOcuIXz5a=0}+A#32*J0vp)_E^+BoEq%kuJ_bBTDXkHBkcTaDV zjo*1L+|)n%D~6=M>Niu*tKskk2?QqnvZ}p~i7b;}yIdRP*wlb?3+F~0^hBTF+fg3Z z~Ku<7wEpNM>Q9Pab9M|q?w`B2v(w^WYRBZNz_&O0@0rWFHrc@ zCpWopS>)}_4Ku=#)@?+J1}8(`md3?6CqsORVl;^Xq*J~cod~WvAJqQCVF>b)j{nr> z!<}ddvEWVv+t|+=1S-oJ5HZcGbVDPw>B2}`Vz2QLK76!#Ef#`b4L$fx>|f*K{z4qz z>0p#AG?N`l9lyY0oZ$MVeUzTH${tdySRhF70#-^209fI#D`;KosPwPN$qTWs89!ih z5>5uN=^GZfEN_Sjyo) z^w7Tp{ogbG?;&g=04^$a&fPy7fzSG1Blrt40fR$}Q1w?$!seMxXRQ|ZWTTs!-TILU zecn6Gwl^M9xa>vbyd)`L+1sWLf+I`jn%Ian?fz^`XwCN@a^>W)T%>P*t`*uUdSG-} zB8!@8lt!D|FC2?TI5|ylud_R(agEyC@9)4QqzZ)!B(wuk9iQVr3XSpI;;*zhB%ZGL zplmMuX-`^LdUI;``=-^0qxlc8w{=jOB|xFpqKsM12m9yi_9xv3fN7SZT<(GTdTJc%kLv@hE-8iwxHlbCt?*uxAYF;-V<6 zwvk_F$3!ZV6&Tg}d`kp{WYW?_rQRKZNiU2pfsnr`7Ex@l$Do@H$XEFC=Q+Aj@z`L< zoX%)4Gzv0QyNrRkJJo~Ob*Qdoh+JC!FptarLQN^N_Tn|3NXREO z8*9H_hkjorvc9w~ezFjp9d@tm<$ZXx4JeJEwA5Pm6YJy_ zMqeSR!1h2_;@7fX4&jMr3E^|CE0?xpE7}*E*5ZP%ia72rkJK~oN$#E-NE=|EJ2KLX zWMfMcFWhE-ef``lcYwojz#wlH$VR`?j$ZvYxI~bF zGz<3ldRS(5S~n+Bs{wN&XqTUv#7}2^Vc@>bq7du(`QhvnXn~fyZ>QA%Hap`>9U*H& z$@xbid&M7%b*?KWuSAyH{FIYy;6H@P`}&2J4xE`tf&jlGAAQtWO<4Gl*ogb9NTT|e=L?BCgp$~Wpw?mum^RjX zfV%-!p(uJ*EoUpLUl8`5{7<1lsOdFo){P9-)24kB@l+R7wA_HtIKq zTFv~FGDt~Ttor!$fP*i({$-xtzy6-htVH96NI+_6)*7if%2uvliWDD~KYgk0r_RaJ zco3bMIp2dtE4O{)r&D- zlxwtxb36YzWOGKjm{aJe%Cie-^4=r$mqlyu(c@^fI6`VSoR@63i&aRZ(U~!vEt+d3 zzKj1sr!X@(pj_n1MXSsHMYc0vdn~I4n-SV6h2_EN+mNpQbW*hDeu;&N$u#Spmqp4y!rq8Kcidu$4=%MK6K zHSZCd4#8SAd|V!Tbc~4ExbbvFWy#WNi*h#5Jh}juc^>-4q`_jYRBnb=RUxJyC3%OP zC$#|ZMbEbc8@pf4uW|vbh`HM={#=p9H$6(lxRc+E;O56yR%9ii=S;jMm6&NH&hOVt zh4u70Kd?QSb#YnjGLCcS8fh1V(Gt0L%uV=)Tlgbp7PZURQ_TLydPsykn#7^U(GKch z=|OX2L?I0HxAA~XSCYGRDz>C%Atb=oMwWA3w0n^a<~f3DOeYK7bhN}!7H_7&{c*Wd zV$1T-c+mq~c#*CIZS^eH&1zK)4^ji9#}8i7dgxeu;SC~)`SePAj4F&WM|MCD$Oy-u zx-TM-InnDaxu&BNlWeIN6V4@lG`R9l+-&B@S#5_`{{Sq{a#1FLdS$yeDCj)$N^iI`PJZT(LtG+$acpZ9eFEZO zt;{%Vs0(fgGU%%uaVL<{hGnWj^NmS$Q6D4SENUFIYS?cFIOEaE)fLNhhI21pG<{$n zQA{&&r>pVsL$_WR?~)W)sw+l$GCSxf?0nbGWRCc=0ouYC2uL1FwA>AN16viB26URQ zLrOouF4R1xC?(b4oeP7t*v&;n+?~HPpx}4nIvT7FIvqzRNVf9aOT3@E8(!E7X@3V~uj|X>8daooe@T#n_eS+=m4$-4Oh| zUUDw7-xWaLGdsmTjI8HNlKn6kAgzQlEBitiojEj#k6@BNnrQTR;rC*9k{hp{1sUR{gtp5h|IC0#D12UP5NJ%fo}VFmfbw${ zUI%K!kyXKH?SDYWnLj$+NKn2Z=)ckT`F4v8)rVcWDo9ns=4C?E>M18n3PGWKIP|{*g6GV4p_>zU<3zvBM_Ti?nGAl zf9))Ml>K%0$tRnnb-twJ{=L`l01D1qpuA$Z!pJc^vIjSIkcG0dYhXQy+n%Tj#6y^MjX#HPubHsib46d2F{0IE08(S! 
z(|pD!dnUha2~fb93SSHw@b|)#KcnUeZhLY2(!s}S z*Xst{S7}FL9tzPkF^lUD5}cZyQ|C9De?FI<_^@g%wQqIkYt~SV8x2`Tuei%nA^Q{q_Bx3g+txSd&?p|E52pE>Z<(D)cIoJ7P@E+vwhtDWnZHy*P zj!1@3n!*hWh1}>L`~td`7z4mBHnS;Ip?r?Vy_MTd5WSKs6R_PBSiGLAD@ViXnSSVW z9z)b5DR}gBz${ITm)4JRI;Y@V{tlb+cdFb1F%qDJmv z++>{vVzt1(huuEY zyjCg!<4t@~$?W4b9dX$47oQ*$U4S(q%rXZO32W6@vFUH=Og1ASfr+kI+?wND9dDN! z_qW|#@1DJBh1Gza0|LM{k`#U-_cfj+5$ruT>tgZTyQ?qdHDuFt)mr6U?97yBV(}nT zUU5}pK*7O5@Nw5?ms?d2uK`{gYGXw>Z%ww!y=sm3R^Nn{)DWB-$$k3yyPWS{{nJp3 z9C+h_DGJjP#S?q@*Uew_0XH3kw9~dI>1#%QBu;s_DEb8DJlk;D3fZ~RloP57!#%ip~ZYh{~n z|5W~tWWS(TU;5>~H0qlMcb`g#IOb>pDtv$^@0`Om_r01q{=?fEk3*}|W>2X<{L3k# zZQGom$9%4u=Gt`0E;PL{S_ANw6NHjLG%zjXuA6?_{Pu_2dtttM(2YL}-yb~WWf}VR zFo#dKLnSv!C%sMHIDyh&yHil^LPnh7w15$MO*B_EVm^YbQ0S0Aec!63Xc$`CU?cqt{+}aaRxQTP7KSc7axYo#fe+9$noz_>7{Xa7S~XFjZ~Q z;S)1Bh`59?a8aBgxwPS!@yAz~uEl@E@wmE;D?R#&QXe5-BbYUAbhSxuM z5vo5?=&yS1AgTv6Ir5x$M9d|`?x?TM7=DIJJzOad08TPasgy%9Hb zO?CD$h-}w}^Q0Fsr8gyoO5O*)0FH(ZW*Z&~>zV3t#FvcpN7VEBipM<2|nuPEg(*)#qVwS{E{zQ@W%D`gL&MnwIR6a zoHIW!M$~T|VuA-tIF|=yLSoQnR<5Ccs^uaX|DYx7dF^C}!0*Z}4~y>uB&-pet>tt= zk995q-iq7$OsU!f_=9n_Fl}>BEK8`PKv4~Vr)6%;T?^$ZIv@*QyCRN&6!)KGeew^* zPXJAH{Gga}jBnMtp4DlIZak{+s;{pRvW4_gD!>VpRN>qf^OIS zH|3%HVjrD^exnqsV9_J3lQk(R&~7d}7r)b}*~*S%PrZf^*zuqk93H(Q2uKXi63Y#x z8t-J&$Xa!E>iZT0fD-?3?DO3rC+5x;3MdwuWrH zA=KI_{|4dJ(QHi0bV)8!rhdr4g0$)aX5C5%N?5lW7^;e_zMg7ld^86?<$`{fjX3T5 zL4>06-K;GG5iP%0lXF%SeM)iVW~I(+NCYsGZW$|0*iY3Zb4_F9cT6j@ay9D_L^8O! z;bcEX&!_f03))7Wp<$l&O$m?Qja_L|cVTMzBg=|}?8=gs{)mHpBc1|Gpq+?+S}4&v zQSgQ9^nN&&i5iG)m2*oh&zqXoO;Be%`Gu%8 zr8b|?@Bst@7#%3swC~tnNo!xrtucx3V?}4HTAl` zao#!C`%^AN#4&Ah>&)aXSu$*;{|5|U&^EEZH~g?Y-ShTHXZ_gNqx2*qVV%QVIa>Vc zAiFrd!Ge!Kz)7`iZeyEwtA=1i)pAwNgHu<8+P{Y)_77R;c)BZ+Nu`q2Pxg}RukwR1 z5N@b`;@1{ag>PF`rqa>)zOji7HeRgPPTQpXG31ErU zFAdS2OBVBx#x=|E=t%A)W*=ql{A;hDS+mpRx3vLMWtQcAu&mxi%KrCwGquWOiw0-t zZV4@u4rC~jg0eA)L%H>{_pX>wup6U4P2LeVy6)f-a62oWAn#?WJ;6DTr~@gujJO#% zrLEccbmLHKa47lyhv4XH>{M!!UPh?oK@;Z2s&MQ;RHtA~`I=rtWdrmy!^C~KhcYz4 zDgH@l*-2E;*R9`lYFD%Q!-aDpXLGVL0B3=^0h5h!;;s#76Vpoi^u%gaZHV(5txu&D zo(6i1xSz$P=S|uMaX{$kXmLd=$HpaAWTF}m#6Y5~w}>EUnYsTd1(IdCDSgM)^rS#4 z)c>`SU+IqE={j9ag>w{ygJWsfI=+aob|WY^KNZ}~D*`C>m+lw@L?77cY*D-)c0M*+ z^yp?bQiJhq_(swO>;^%Gd{hMuTA$1%fm}M3#JU=I$f%ECQbH znk(f+5yP4E@=l3BHbr~Q!Wm@f%$lKh>*iRxH?5=nU z05nMlb;_PH)r!9txbGNdfBU#Rp66Evif7h3qlsKJuveV?s%ij*J~fTX?~(b)NkwiP zreO-8=H^E28M_Qk15(Td_Tvy0J@|nimpV{&qwu2(5B2VREgKAi44G&KpGldt{W{WA zQk^rlTZEUKgEBFOOT6EK9OO_vSArE>^BM3C6k*yR?Q@}T^MdO@#Rmn^ZGL>SHr^Di zq>+UfRm$0Ji1|`UN$GI7J6nKD8g60i9^?4f&}<%HO*e>SuhI-GcU&c2&Yxa4P;LYHHwK$ z?g(r6iwH6o#iq$>S4J|}t$&v!|LR#f2cCStHsr0bWNy6&6k+OCT=N&&BxiwjnEdWR z;}za0QtFrk_a?AQGeHkh3`}YSlwu;^GPOWdOpv^#+52a%r#)Y6B1MUIpq^8Kn2n}g z_IrR%u71A*&oD%32W}_+0D6n zOyDPtGH0`uw0mcZg4vUbj1zpUx1OGHVZPjH2b%Ffz>rQthd0Ts zmhl%Qw!D7G#Hn_GVy_(2I5c!Kq(Kx)cQ6O6Ds?LyF6aZr0^~c^ z3DXZ?W7(1rgf^4cIsVny?_&n4+#62$M3Y@k!7^csVsl(8_3<4I*0V|Poz6<(mVNFN zxSGNth z+iNxvW&Z+7T88*W|7z@y`ONBk!$iDvgiS3vNLxi)CXqx?qk+(J@8@oEmwykLzlEeZ z#zuv@fQQI8fAd(S$ZmPQu+goSC%u#~IQw+^^w4%&gVzbo9>ZV2=>j9+0|M+DM1|W$ zDr`~w|Fv%bT9~j%l<2x{h2YRSUj)wDh2$4>H(?}jxP1<2jW2l6VIwkpp1FDpKD8=T zVAUC2q9J@Iww?%8=uJayyj_3$T26RlC(gwY7QF7$A2Tkvi42ZJY=ciehPO{7r+hcmEtD)C;ZBU$#+0Xhp^O@4V*CNKhq zfJlLwC^bY=LBHok-!9727n#qQ_4;Ehg)B&S!DyA+i4}RVrek;>k}<2Jk0d9(KAK1V5TUK`!U#a%#W74- z;w}6fP)jlKxNhQlD!)x0k}nv6O-#bP^(7 zMd=uxO=XPE`q?9@J0-L$RFC401(7$rc+zY(St(?}6Sqe?xrc$>;qvy?kf8+L^hHGG zwDt(+Q>7MFT|pLLk(Xj*V9=1!JNPs6?%_GqswY6oMNTi+2AN#Fr$x_cUA-1cm+sDA z8+vrz_6>VYS;~$OxV5*hRJS5T1l+ox*k=7{NF6XDa5MSz)Y6SVUUFZ9eEI?i4mxsSkEm!}yUn%OQiNAni(I@|Uh1x?0;A@nW4iv~L8{*v(s- 
z0TtXGFPiqjo%y9B75yn72L)pZ@HN-CJ@Vl~ak{6M!_fmrFj0?hFott{&ADuhyAD#QT z!2x?Q*nQS%r+`9+;7GM%GqGxRQ9#t_Rj)QB=gZ59(1=c5x*PTEV13S$j#q4M-yKhq zYqT4_Jf6)Zxl5ArSCmjtU9Ngk{Xf+u^Vl&j|D{p9h1$nfCKZ9Ej6R+=MEInkw z!2TrWs++e!5X}?ujc`J|;Z)S5<-KB#D6{c}-$9Gajb#J(%%OupV~;ehC=vVit-7@1 zjzv@Mi1x>&n7O0i_jE;;q^UdPG>Fdx-ui284vhN|^a);rpp!6T-8^fdtSr(uu|Q04 zh=Vyds9#^aNS4O|9evp1RmMI`&13zRqy1OLPQ6;T^e8PaTN|Kovkm@A81#WCiIa5r zKs%#stRCju%#NWd<8aXDnoa1M=R1%7KTs?zHguy9L3)Kh=D_F0y4g1ZR6{$IV;{$% z1gLDPze@i{HSLFe&D|JYe+ppL0@mvOf}O8QY-jV{q~YI|T__-J{>?Bd5WBHU5i+mg zeSY}uPi?^@J@7-AU_hqP+wcG1eg()RuU-kd|H~^u_g}9B(&7%It)b*g`w>TGWFbmf zR46!dKy!uPwn~)y8-qj)(?bR!i#wZ(>g7?G-|hBH8Gk?VU_6bXMzVEPwg3~kiO1R%L=o3#sAQM0Y(rCkcNdiVnqH=Ww_;4uUnn!}tT! zNW9)p@}H0LuR#OwgaPzw8yA_y*W{W1{&!HXc_%2g+u#4sv29+(@&+EtlmAOB2c}B^ zhSUC^Hs-$rZ4&tb8a0g1?WsVP$L;CV)S7q1k=%dV9(aYQ(UM3V0PP-%7Y-p;SzlD8(+afnO$)SH` zUtwB2a}rM~cSlQU#_1ASSPJR;;zqAq2JJTIboSF_X$}?^uZI;?d%Vn+@ESbpbNjEr zNm-^bH*FKw8~)xlouw)Z^kMt~v|Iku2hCp!Bo@67<^ zL2Jz1BKpzu=O=$bA@MAfb{}_V3u(Yb0vWE`#2eXa^EO818R+Vwy?DHwT@h*+%h$=# zh){b^k3P~FjyX6{=+{a^1Nb$j0LE7n0H#$RfOYw^#a~{hXQ78?buMVV?|~rhW)B|f z0=?5_;wOod)k@kRw`Zt*%KBd2&2|J)siQDTe=g9-49H4t)0NJ$GhGsZ5qDM1UVJbQ z*(G~CgUSzD^7!ms#G{F4!vf2{nywdXac7f8nIa1Xl`#zyR>cy2O|ld+G5lj(tl`^|B(E&<}zgHun@_zP4d_wSSIo@mj__n-p5f1#BbTiumlFMB;nl zA>8o&0Edb%WyLCSln=mDXrG-YyH~gv2;rz`{iLkY`cH?j!$*eP!!)3JKU1oxQ6_JE z(vrUR2I*MvQ3Kh=FCSqn<&w?+FZYh?Te>AEmAglr;1DtZcqrW{w%L|xf zrS;ZVajJX*pmDC&H^}Dj5cfJYXG&1=j}*r8KnIQ+_PE-z(glB`5DryT2<9ej6vTJw z+wZduEEOse7G2WQ4JYyQXVy1#Vk_P@dS2tMY!~(FDbh4TsFbCZ=3fma%KikTc$yP; zT;z7C|9AnsPj7Wi9=o;?DLpLGD&eMxx2wml)w#VpaEN^JI*B6TW8Q%dN5~< z*KkiTg$Fe8hlK^h)TgW5HNbQAwMV};=}Rv%oNM*82K_v@S>(oC2HMuSyKsL}$-iLc zkpec$I-qF!!fIA|GKCyh=3?y{ZWTjQ*(6uNOOI5x_JhY!IaAiKh{ST1+H^C0*xDag4k3o0I@u4%g!@w$*VP90ZYeA#zYe!L zV++g&4!cGJ?RKE|fh|Lyi56QDyx?Cgc0RC#Wb8fPrbcqCDdTt;>)uRN{Qk+amM-(y}k7JT)MPsb(^H(F=s8fy+fC zfN&Ju^zh;*z;o@(O~!mumC?KOJt4o;ZOYCDkQQ(j9>raBXi{R0VaH*1ySvnUyYfGSjDvw}c2bo6V}5Qgc2RaR7`@cW-Ehe@`~R@_ z)?Zb9d*8Q|Y`Q_ZK|s2@Q@TT1y1PNT1eB0&>29REySux)<6ihZ=Zxz*_doDFW86Qf zIFRkybIrZx=k<;k1{=vS8yA*mde!QWzK;G|%Ud3=+j6at7qgGn0%X4RS@-Bwqd-yk$Z6G*88m&@Y#|$2$?u7^iYTM*PkK_aY z#bxJj28-OefNURD>|i~=Gb8fEGoch9E$tWxDnCMFTvSXtWj*0GX15&$Jotr{-Q&Mh zv|PsM*cdBfOc(v<12TE)S#khbRLkn-xr0U$_$9}T(u~zMc24lja+>9v8{beq=%ga| zbdu1dsJ%p~`&0&}goJac*(aSZnTyg?Lt9*jbjq!S7l%?~hxf1l{(UcQu>ICT1iZKI zQYN5yntH^)nRP7_`j`D)YtrPVYjAgvj6_~2=db-ffmclHY?-q+>4|Hexfd9QwEh0j zYY-)p_7A-6w7_onub{X-7_+fVe_7Ri$(LXcGU`vrDNZPE2?YH2$yCl3YJ<7b?>)Z$ zY*&Xy_=eH;=EX;r`nMO~tsFCf#WI7-`$33$@u2L22G{0pi)>`aA<)DPDLG!~aV6-~ zXF=lI{yHZ*72=J<4V7?hw5EqCT;^03^-zz$2+emaxM>SVJhB^Xy} zmt2WfyAeb+kI9q8J%|J&&2{wksT_>&qUEo#nbEXH57q^o-?U#Wq)i>|4b844v|cNt zIbTMhRe|MV;_%m_JbO@U@p*m#(CkmE)uuvm!}aQ7AWAOARlqM8n!eQR^4EQhTKcZ2 zhqPamJM%oe5nV7EGe1*}xpFp)@FPuhRDDN!m1@sHUw}iiDbfKWd7af8*%0Mjr~#fu z%+6tF?&dAFY0keL4O?2hMnVLe4gbP7X2+Yo#K|17IJSe7Mcm>7cO6aFXO0JSWphcE zLZ8_~F?h$4Hk&WH;b(R(9;eaE(&$aU^C$4AQi%!(?UZTC<;{5td?tzUtcL0m{#*=t znah06_a4T1VA7|hzsUId@MNT)iYB%GH$O$aAmH{E7{)4!NA67jeD!X?VQsmpIM{PCQR*^@vy7mMaBHzqBKiql^K6gJ^v0DJO(d5 zWaCq8xO(W^e18Gb4H=4>cf*`_J^o6RoKUUbO$NzFl3t;aM|B<`C%b8h>N(j|WL<(Y z0FTRIz0e%RcG{=7CDa(j@iKZo5xKZ;-(1++l;(t=L}!Yc0oE?vAx*I;i(?23Wj9jn!jC_g%q=n|b&Zbj05n*gah%>C zFwvWiT4d|~F64=bKw9s`yFefmRYE?vKq}@9FpEj1%?C)E%ohi)u*e!(w+dcot5~md z|8Lmt3`S_9unx7xMXu+bW)T!C$*+!Mc*2>G%v`C#BtV4riwPHZO--5g;u{7wGhRCW zlDFx~|J=)6M%@8rpDbS(kYuOPF#zkSD^M*}DFmpGRHgl@Jg4-G@5pKF399<#^O*oa zWqQ=2CtE*IHzu8*C-z0wr%Ivy7eL!L%RImj{3hUGMUiDe509(5GS_k8?4I^GCNXP< zs$$~0Osv;- ziqR>Zr_{Wm6w7SxS~Eef%2Xx;$+MaWN}iT@+YSU;_ZtBl!_a(2qB~@V-(gl|2ZGwN 
z2JYR9`UYvcD!?v7Xc+`CVnKg>SddiyFOYn3Q^;#fSdoSW7~b%!lkHhp{c-2;tZ0&R z0;%6xAEgGijUd;~KqPT(J`9^v-gQPA0J}>a1<3F=50VEWq+@6J?%}A7tg^Rqdn|hW z*%;q)0hIXTH^3oTG;le_4`kHdHkRER4>5usk}zD~4DE^A52+~{92Gu6Wn~F`nj{n1Co51XxmR0T7bj5 z@qE6x;;B!C1DyAle=%W*N$e$trw}3_*sVtvja@mUYY;CNRklV}Nv~mr%F|o4<(%WT zwWZH~XS{dF=nYC$LmX9+R#BXPSN4xWdDLA^EVi5BXL3nOuidU;b`&^B+$t=RcLq>aOO|DGnaBBbZAV@QV~b6x`@TK^!kymS zgmFb!zzy=j{awH{1W@`nQIlMbLzLj<3ZSV8;{_FH^y(vlkgjzeSHHGVD{>lW2dN>K zkEJk^Gl29#MLvoQ@y9zO0(ANajt!kR9>5Qj^*tZWHo3O{z0Q2nbxspZXWdAyworm( z_yLzQ7dMcyjy?X=+*PXduDLC;zd!#A7;EfM{#5?XAZsgo1&A3I)|?UXvV-jKv_udq zVDU`Oz_t6=!DT3=tCnF0I()=5jM`f!SoM0<}`Ct ziJGh_QY-n5*S3^meti$n_hOm}-&hTC7LLQ0#K9(kv+*=Ldc=%|Bg^kY-w7{=V%b(Z z*4`)4i0}xd1sAk_VoW}5u=Y;r66wXDqxunyyf^GkY23#%6ur@mgplr>8)!^O5scip z@4lRB3Va%xt>W`bsW@F-X=s+)H#Ncw@0`Wz1!GI#9ZUq&l-a1PbE8T9QpW4y0)5Bw zw1)t;0@B%UFE@#hukjcbUgu}Nk6N4JiuQ`A^MPdD=caBH2O(mXFy1T}J(GVzEWQMW zdV4u%<&Q{a7TV1o z#DOe3|6j4gG$58jwPZ1by^poU;*}_0y&p3Z9)Ch6I_2JsMKKp<=~zWc_q0BqNMIEaS+YlT*FsTz-dp*iiTc@Q)-Y zWe1F92(&OdpKRA00cFr=XM@`Dx!>P@2L4Of_FxLsNMQzDVV)N^Dhm}Ql)6^Y&WD4S z_7j$+p#-6`5|=Nvn_yricEOdfJ@m|+TVq;Tu~m}6cSA3kmKcPdPVBE zPZN}ZTGWompC>;9O#=Vn#B}j!gX<|YnP$%o2Xz4jhbE)m*+`t@u&52h%$G;xFa1hF zW=brOt*Q})EKG~%8oGxR!StzIHU%s29@sJ-9W4nd$aSE-kJ#31r#p|FAKIaj$9Z(Q z_A!tA`HjWfDCLgvd1Pd)1WE2BO6MAdRddHh59dU76BoC$dN_ARRhqXrxgr}HK6;Qep}jW_bSjoPBN#) z%=S18ujcOMJ^6)y>$?3**9>sW)UqbB*#NS1O5$JCEV;XM+B@}EzEw~kneDoW3RPHX z472dtPiIR=YesL9%ef=Dgn-pDK5M>Q%!qKXqG8p*3C!e&a)9;nw|t>0N}Zke#J`)f zHVrG>ig}ALrCg9g%p{rW)7BP)xa<>6m_Yt)RFHh~@%XGmI7!wZCvCP_lu~LXzYp=Z z0w>cUK`uW8OFO(Bf{<23JJP$IQ_dyeUXtUOPXJO9*K9)#y~e)+yWmfb3ec##IEkE_ zo_(;ehC3|-)N?1zl8aH6eG&weWp%TN=!hk^(GkdL$m%ReD7Qeh{G(+p*5Ha6VECq6 zxJ@X9{*2tiFYT>i2h&KV_Aw^@T=21}4f|Szgj42hT#P*pTatgs0XX%@886m2%Bmfc zF;Ka=Y?vCYzNfz~GZ{wu7zQocnx=!UDotK5@Jkbar{5*3!tbs-giNo-|EiGS3+%C& zs;K8Xf1^UyB#iO{6_0<)7Z%15Q|u~i@|?}I4m?q`%7B#LrGMLpQ+gt4@ts55{JxXmI|(Erl^u>+d`{IT@J0nX08AfC*kx3N z80*HJvdZ3S0-$n1d8gwoy|HUEe~=^DGs7s=4@km(cLF|dA;9C2SA|7a5b%B>h@kfN84zc=<2}DOFzL+&!&_>k zsWDGs#Si-K{CUP&_53RHzCgPQr+KOst9?f?9Ic%_Z|j_? zDs0eU0zTW7!9Iok0Xe}Lue&ogNo6jPm%D^FEv_oQG%47OG7qY^2ACHiWzQa0SY#g! 
zrh^i0+Rl2V=F7WHf|1DO;%jMDcI3&VCr(Y+{g8fVxwS~g^a8^kv%?i($3TA?vR|?p zm=T9X(n~IJ^3w0JPw+Kqd`R$u-#;g7z)i8k zE;<{Ggb;0vnBcJ@tybHTFWi0*O8;9KFn_f7-iPKCh+vi+QY*B_d~}w@L+4sQ)y~ghcWqC@=}uYOZ^KLuZbU1A>Ns6~_;(0MG`Vo$6}&vvzck-Qu<((r zM^_AnQ#>c#FL+4&7E|9Z*S27fIYM|*^@t3@*|zOS?h-VGQQ^q)#{3)v3yEwc7P=Er zoWLn4{gCYE68SlR$o}7AaH#fcADT{O>_{wiru+{(llk%Pf0ANqx46@;q0=vHk^|u~ zykMzY_&ZmP%jbG61kT&p^CqdRLfUVQGawQy?V>=m@zz$>`I;0Bf!z^?+x=@({_i?< z;&q>Y-4lX6Ij?xSK?3Q;4_t6eQNIO4vfleD8Xw~szlbPoS%MU6JwM4gi~7(m@m?W9yd~kF z8kWG5r_Tcs&xuw1-Ot|Gk4M)sUeAON?fSyEJU==`UQq& zy*Vjy(9@it*EBIDkk0SV<_e_qQ=22O?5q0DY|JHIIwr(mYawQXiV*#nWkcRdCuR&{!ls7h(}3M=?x_Zn7pX)m{(~1d0Wy6&%XQY zE9Rgp>N=Q8=Phwq%Zoh{SSYt5Uh6BBS3YSmLtUn^-zR&2^UAJu5Yu^dlfV34eBBff z70&^E!&1%rw$Hd3oAQD_sSv@m1c7Ze;UyQx8c)Wfl7~?#{Q@%e6)D8`%HL>m$}jTI z=y?Np$+hruU?g-_To&>ftoaHh_7aeU*Bj&rSxAd_9>wGh_N=^ZrK|f7&X1YVXzpf=mCy6{+BoJ^$gLjoHAq&QN{ex zvkGR)v0WGPUM=S1A=;nID&RV{S(VKbw~W@5hsrI*a6=AAI6u?s(`#*T?SW`*ujI*A z{+rwHxeXYu_?1R%PaV8FDVJV)8=B&In3rk+I6_FL{xcRScuicfHHGxw_%Z>s_u<+) z^uw^j*EhH11r`8T4L$h*JoaG?T+_>ljXqp8!zp z^-A6SEuh{RV4QWr-JM=SXS|R~g%`wUug$MPHoOxUcS%prjo3fa=yrkpeF3_++cF*R)4m2Z z;Ah;zN*0JChtU5g!apKt@=*pcE>ylx^2k#o3IB3k9`=7bhfq3wa#isli7NZ(eV zzishHXDwx)EP!YG;@?Q_xs~YmN9s80!lEU;Q!g?aRFr>vptA(uje8;!m!ON=(sZ0Z z*sL^UlXTSwdVk+ZvM&Xwa2FDAX*YLuBtQR`r5Q{OFeo<)kwN`8(7t9&L{P_21_2 zQsC9P1WLgEZ;}oI2c}QRPj`RRCKUe*zqerkueKCf670XePUQWzR`_qFG^zi+uK)FK z)&m~(B#y_j*|GG0?d~RAgTf;~{-~Na}juwD~5fR%;abV(1 z)MFypJr7u(Po2MANiR1urgWVl@aO@PW?*Xl^*H#G@xw75}d#P6^z1UP5Fr|GgRffAu>A=3u;s z^Kn?*rjP^s_Tkj_`ueNLp;JNBX7=H^Q;iPXDp`ciCwaw3S}4j~tD*&83P`4lu|?x( zRI-{HZ-Geu-WK|3bU%RrQe-8Xq=7gZahHn|F;~ePSHEw(;I7_9YDH44g=*Qb7}WAF z&5v&g|FO%PAnvw9E`uzQ2w4m=4A?B@Oo3!^8;QE5%2Q=x1r(?yMOx8pICj^_(b z3vO#4f%cKM5|eP@Qa6lL}P}vS3~B9AQ&s|LnQwZ(8&!zMiBODEPZ8XKS)LEd%u-A9-n# zZ}2!ZKKY-zDfK8s6sm;B-tS-TI^M4EzBJ0QtHb=TKc{Z?yswnt;zY&+msgIZ*VWN} zy6c}@V+#FNl%+90Q6cl$T)m8r+?HQE5Q9FK$Lr=hU0%*)q<`-dL6caAhyILUEtGF% zqyUOvx;lGgZUhb{DqT_SNfz@ieE|p+#P5<4qF>OPAX;$j`D4je;AO|+0wE=9U66yWM68%GdIPQyY&&p67Z@#T1xc?NaT1GfKvR?G*%CW8ZOxpV-u6K~2KUHams-ORQO6arx8i?|X>5m&)5>J+% z@v_!<#?sut=xF9Ngd2SmrlKq?xDmWB31q8r&A?PdO}8`ZC}B@m1JqY4ZuWR4 zo`ld84wvz3Uo=0P;IC`FQaJ5|ocTyjR*RY{SDHOC4L_a2?d?y;ywV;bN>pOF>Z~PU z0!Js{W*A0v|3orX*@Hv(^kQrRQSXSf$wkWPbQ*$nr(5^+S!<5U5eHX?Zj;j^pLR^+1Mpg^O&IhLg#qO|!xhS5?YTN7g4c?>| zCeO;gq<6Yzq<3`}7wSqr{&64f9bempbKZqUhGnrm@zj|v^<6)G(x?knw&xQIxQ`E04V`B|)T-(a@3 zpE_}wc|3mD`Tg}?)=2sv$3Ru9g>{<2jL~GsS06szzH57StoCXyAqgqMQ{pNDo3_Ntmfser6`dOkg zp2bi1CqIE^(H@Ke$I^@W3a)aX)fTbOgFOuIS1n^osFD5{UEJczFO7yssXT}2R6jTa zo2_?sxa@YCV!ET7!cf4FJrn%!IqZTWt7;$nCrzfBmAKtYY~)m6eL&thVHkU}HY4(* z=6sLeJuA<+JBDJYK2*1G8=_8!hAND{=(cDo2Dm2g3(wyG11gtk>bU*q#29UFa9?;^ zVJa=NmfC7$PoKrT&UvLT%(^~L1}MMk#+j*T%>8*S&yc1)vVC2Hm*LH5z1SVzP~RaIKM&L#QYsSDI3@Ax|QEurCd z|KyFP6;(UG9(2!U20WZAmpS82M~12@#c9;p?;l*lx2A+=|DGIn^$mk%Gzm0QyqR5(B&OX1Z-F2U{A$32*ZZrSD+iKvld zGuLzi>iyXw?~ev6YhN3oA9*(2%(_mIx6qTjYm9<-RAq1~gwL59Tv>;WT$btnKv~$M z#r5|9sst|QgA{j@sldQjhB@>3riz8e?$Oe`k1}IVveL^$G~(^i6) zzM{1pr7SAf$>On^#R^2=PO0tCu;01KZrd+4c=a~1ST3fjA9~+(kW<`48Gou*$bf8> zDKdVW0jTtBQel}s*YehGvwF$RoGys>&ik_CZ6LMxdi5d^iX3aPDT&SvwIg}XD?%?4 z1{E#)(?<31#NMJyg*lgK?G;bar&JyGbQf5`KOJx3iHBeA94bXW?Dl5|E855S-MY2k z9)9E{9V{96d81R0A?pn3%M>Y4f%ZfyO$S7}t6C^yRCoTNDX4ZHq|33ooS{<^Loy_Z z`5pb#OQ}mO#Hc?fAQWsF$CIY{G-qs%k`zo_+r}N`?J+yx&E!UQvtt%BkJ6Z^F}!sT}k*dFA9S3OIVHh znx0DwN7QfYGO!)l{ZL9Z|! 
z+_7-dl4_MlpTyUQO1WvuR7;mF?4wKg4X&VJF`^K8BPVm67CYQWZjs3!I?Kv9&q%eK z$@3NS=sq(0nzoQQ#{o9%q<(#pHW*24#~hBa7Z<0K0hti7lHc9x^NErKJ;iJ=P8sXuT9X4RI@n2U~?1XuBENJq;N&q!Q_ zi|wjDnlgc=ch(>R>#Eu$0GNZuvi5d7}si93;z7J}TZ z(AX5qKV&rWJ{-ZHk>yuea9x5OzGqx z)MCGT0U+|mD|oqdmYeYG%H@kREh~Qdu=CCzyj9*P;VHpKonZWnA4rh?fpN($W8EEN znGf#Ii-|Qt+0a)CjrqcwViZOz4Z}|Pq7j@(Kc;Z;{E&!;iPAd0X6e{Cs7v5CAn8AT zV-`{*St2dB?pv_uSf#!5=Cv?nQt^HuJWveGyY~NS!t>caJk=Be#kBI}I(5E~r)|&Y zrD7~kLyqYFusR7oXL;y%;L%h5_Axs}VIuC47U}k$2|Qwq&z+@ja?fG`P`*BP+u=<# zS|xeryQ)B8+g5GS_?5u%m)gRG6wh9>CvPf~gMAzYJ4Hh-6Yc%xZunC5_jRH>S7Y3A z(rY+6T)Yy-B9$nUqi9!wJm^j5Qx7|yU|-ZH>`ig`$)M1}{-cmc>(4`EFPEdn`OCQT zoGd=`I9>zey3f+2O5C`fUd&4Vt=DZRT(;HuWxi<7nUZJzRxaAclmq@vLSAx3%B3QP z8X5+$4|>b3X>!fRY4%oStF^PvsZr50{FL?|<~WPL3F`Wd6{XvfFNa=?w*+Q_G?4de znzpGx-N?bjb#Z}zbA<|n(eh8$RyEF^3MWcs+3aZ z2?AA|TSQ$_ElxO;-q)YN_!M6suU)7=uf5Qp4%sDp-4jwY+jayHdGif5GmJfKEUFwN zyzGiJ9yXof9$i|ZK9xKoP5K(IRScorz+^y;62g+iU+Q2x{vl*Niw$9-*?v2f#EKX9;>RX&k!MXk-nqwB^_x+ zxp!!HnD!?G_Y1OT5yH!C(M(lA;$sGZs9hF6kkozwpSx^_-xYd6M3}d#v<0+mcuVf5 z>sv!Pw7x8JQHf$1FSb1D7isx`eIf#GU-Ve17r^Q84*8?AU#sb`7)7~gr&VIDHz701 z&Caq7Dv5H(h!{@qao6|A4J%mu+N-g5?O-f%orAQ-+(&v}^~y@8$`~|*@fmM}O<63K ze&j~E4Lf->dfF=bcxDWX4!Jm#Z#K*girFF4czGZbOTW$(X=~y8?QlgLS9TI}4L}%+ zzZ_29oBx<*$Rrp}zVz{B zc8OnDH_x0Wd*RjU=A5=Tj(57viN;P|Pn!Qw*S1a*aDRdHfg1gi^Q9^4#O8(BORDyW zUj#bp8Qr9jBTd+6g}bWhwM)~jtD?gxK~3+w22)e8>q-Z4 z3y2Ss31>LdJek!z-`a=u`8};hbA^OL$^lXdB!7al_<9Pp+|Mv~4&*+MuuqPJaI7*J zc4yZ$KWF$iInzd9*+4BWTuszBC}Iwsr4R&@ z@EPQ?!i?vzL@Km6SGg@5-fSrxu}oE0NNrjW`N~HvFV-n&KJhc?v(-H5*chZ(j>tyY ze5WJQa+Y=BPv%O)DbJ>7dGvXH(9&9znY(*6IAJkb2q@-7@hol%(}3>FTeTf>VG9L` z9F*S&=qxk;IQ(%#WnmygTT*ryd?BRX42#vlW%oh?o;=*@1_VIgNeRlgE9iMmz`^I45ae|= z&0cG)mLMslj01mFIbVH9xLWKesE9Y3c`tnzmgT?4F*dNtD?)g-YFP>8<}djZQP^xG zmWT;`2!hy0ud2#6AQxE9=onmCNYa(a-`*L^wqe=_Vo(>kW*D&#?kW42qCV*@;u|>? z^7IW7!ff%1dMEvc7Cs}b@nL&}L#xtiGVWVA_TYIC72WXA|E2rS39tgy(+1~q25*1e zDe~@}#vdbS@hPgp>&XZsRFZ`1FB>7mF9Xn3bJl4x?#O0ehilU%a#E8&Wjz6H9?}Q} z%guU}Q6N% zcmz&6^*^N=K|bO4mUTW+onb*r1{%rt4s3OPd!Ua=dwyr6zn491r0qGsc2Yt{F8-kZ zRgx>+GeQmgCt1t%(g8(HC)wxVolb|d@Ohjtu1c*h_KMxGl|>iJ11j`>`xL7c*FDMv_eZ84498Uzt_F{e})@PTXb0WTVx z@6o2$pFOC^L6O?};?FiSmJ@l}t#41ouz&kj<(t^WNZ z<1tGP8M0CJ!w?+|zHH`&51}Xg#%1?+MnZ90`c_%32nd{&cNCYLTQNZG;@zm z(_WY?UxBil2R4o`aL_&H!jDB7uogNW(cYut&gV`z+Slg7&GH#cj3#nua;h2N_<+Sc zF7hDYSu=)095u`3eNp`>s|Od}$4exXASrm2{kjeH9mgT%w!;6Rz+hhVBTUV5$u%p? zKMz6rY*{jW(tfR$ovLa)o=aG8jNQFG(bJy>JlTcZ^gE*Zw&PQ) zB^`GCi5U@SGQh5Qy6$XL6~jabRi_mYygl zw3#kypT!ucZO3`WklPjZl(D!yUI~4F{gL7NT`N{)?ND~fi3@0G&u)S0jn{=D8@=K zqnK1a7sQ8yYd4ER^=^k4G>TO$ZGskQY&if{`GCz`y|NAEb<}l4R07%BCTW(&ggwx= z4%X{paa1YaEBz>=3%QyEBP7ITRLNg2je5pR}2oqze)Pd0q36bd5WMJi}96RyL zr^>p@ij2JKldO&0W>eZPs`z5#^C1L z^VyY+B+1>eR^9J&;fJEEN6ozh7V%MyVF(Jjtj_q2Cm$cSRU?vBtxw|lETve42rGGtAqfZrF z1Z?t(-lq`V_4UT8E%PTERakef6~2&A)-I_tJLyF~VXY^u8B8tyHVFvE<2c}S2gUir{>3Za>Iy}P z5CV;ZE$J2xx}u|l-$9qKL^_CJveIDExj}C(;IK0AQBC|9>`yQu_;g4WH@A!B=1!}#9US@*)n9*HckBXXhG|8Q-hJaObR~VIo1v!8UZWOuJX_qe^ zTw54%3<~3qh@zTVZRVFtY<9Kh6XIhC_ca4${Sr6=wY=!9HRYXUB;52uYbVb`0x@4m zG8Fn!SC6fo){jt#H0|KJaq5eN-+6XDi$^MnBC6(gT6B&%u8F@iTVbV2{y~?%A=aG? 
z%@+BNJvUqP10zfHDHK{6$xRQ+x2&u6>vErebiWCnXkzYb?^+LSxB$t2w4X-+b%Jx+h(2VE{|r*&?PP`0uFx>s-XTh}$w>EY?tu0tcO!?B@ou+g4j&kGS(^(P!!JKy-q^|g!;1zlCodte*;#U--YhViaodY$N8_;w<)Mh8ApRnlym^77S2>F_z`1YIm2;CG8rv0q6Fe1hzWwkI&k zz^K>w+2V(Xs}1&(os#ERXqWR9+KtJYu|#1hp@GTMcLCtCnjX>}YXKQZUdmO5l1^Ph zax#tU6FqJ7Mb2N~Yc}VeY{3JTZ>S?7EW@xf!ETO#z(_kF^J1rop4-fMTlVUuGkAdP z{VWjEkC)EvRvZrMtxom44B!sMGM znm1U-4Y->8$twNmkXjbgnrlUo@4^F95L-j^y&@m|1v6N6M{64Ua@@mK;ODoY z*eoC0+sJ8__hZ6xd&{zczHBY`30Nq)-)bW+l7t5)yo7#L^Dp7fm5uI+Lgmumo*ZJA zTyW~wSQI+QJ|Mn{V@~I2#B`1#s%sGp{8r5@(8`wLt{_Y)lu4xgTCJT@{yT4NuPmVUm zda*S)tzpv|?{B|6)0K{+Ne zVcQ)N21Kj{I>G6f5Z$HSz}u%~nKed;x|BI!XgK%E7jS zbsOG-s~D0H=b&yiL84-Oy!1EGC5Se6Q{JD<%7_{1cyA|kZFVQ0cCr`ICGmnt7@o(G zFTqSgpJcqsklTsg29hwcSW`fVK|a4@2P&ar44nH?lxDV3wRNgD^@+76=w;^+hR~<% z_7`F}`tzZOLjjXGman>+Yj?#Kri|Hzg?~kX1g?-=hgv5(<)Zd3E*0Q7YUAI%y9Z$v zLS6R`Awkwo2g~KvWB$GrJwtKI#x6rcQM!vv7-hNWd;|}kw$v#iPVr-VJZ^M=Xz2E_ ze{yf)R~6(~?I#VYe?a)*1~03J3ErNehe1!bW0lDl$I#^ioDpJ=j`z{DQJjjyA-Klh zHu0%{y%A0^iN;(H3HNZ5pAL_(M*P>rrC#CvbkDky$m>>v%w~mNbX>K5aw@LCJ>v8! z2xeH4$Wy?ySeu&~iXj{b!&ovG@sB;=c~>-84#ZhF24(v$Rcc7XE%fOZW6Wm0T-Dg> z1p=wQGsxnXS;D8%sTs0vq~*3^z-$&613Bc)pYJUF5gweg%%kECj0DK#_GakXtsbor z6WR}vG|lNGENzOXjucR)R8$(zt|odv&*_%)Zxqj|W7?Uf=`tlPl}lzFqTz*H5g3~uQZVDd@`TvelX>Cvfi;R0uAc;~opTX{xHzfakZ$r55^a9O4CZ$-~s@VKP% z+%T34vV*Dkzjqu>A*p(6x8n$MlYGr}FjxfE&jpK&xkTrKa~@Yy+PC}C4mKa|(H4wg zG&dsPpJ&T<1E!8Ly55JS-Qr2|55SWY_w-3-EcCVA1FvYBCIihxs)?UrRyMlXY&hIT z6Qv)aPLO4T(J$>;q}C9Yy3H{=9hgVOpYxxXQ&%DbZiV7Sy#WEuH*l@J z!fNpM2u7CDEXmtv(8;WJxAlgDt+xU)8V^b)$ikmAp_qT9X#E)|7EJEBb7)R{FZ=o0 zH512F0q~VVZ~ul49_L_VEbPxkpDt8U$Q4iU9&SuN-BGq!QLBXzZ|iqPdOBHe1h0}O zdN|{CYty_*PUI9dX5v|1mb$tj;LD6Qco*K@US%@sX0(Rq{MCDdDjd}T?}ane^1z=D z!h*~7a+``VZo}{WBwyq{D27M4X-m9Q4Eu{64oGBT%B4wH{OkGS9W zznFQ^51B{-pJdVB1O#xNqUDLHvH`716Bi^8O0$|8$sY$hBBEm*ypMytf7JnJgn%68 z#gLo|xE^iL;6hiP@06wFJ~zfs1jU)Z>8(NmNsf1@6&F=K5waTyG#=i>Miq%F*@y)_ zV2|`yiSQ&2h<0ApC8dBro%w!4<|TX=H!(SJl2fgpqX|qm&7{U1t#8E2_Cn*F#ElZ@& zjJwA#M|Q~}bAj5T`%^dIlm*0a@A)9kkg6t2xx3V?_~v*gRupgbu!(UZ3-gxao@DE9xL`Nvl6hn%0ih+k^3HH9k*UX4s;Fg|z=D zo{&M{D%5IO8ffB5(*tqSMg`dHW&offsLI!+PvfdKmn}(V&CNfUUY_7}zX+hyX_Vzg z!lRv;Za9C&oh!nvF>k-on@RT8_QU2xm`uskyCKvPtgYVTR;Hc57{(!#1zq9{tv z>&X=;oA>3)8C&#HFV8Uf*HpQVwv=8<;aOX6EPxcpx2qi&?9M(c_cvOkT{J8=7%?BT z`2g_|nK;vhe7NF&3*!A13OcuTQ2{c)m71AKg1Fls~>p<#K7iX^Me9 zoC(eDCk>Xjbgb^RB)n`h+<&0uleBf4X`rmLl=5nH{M}$zVSR7K?;3l9&$;eA)DSZS z@g4YtP)EX=_W?8=3k|Ozd@k>XKF-T`#$W)=67%A6VF3PeWq5m`8!)XRz1?{0WCbK| z$ucMt$cpfP?hm3`ue4AT%F|E(2l?4rD1Tpz;{DctD-77MLjSEW zaN1Piqt|KqeNQv1V5}OsfYi&6e31?)POi;+3yvnOjVDe>DK)fwf@KT~mBJ|^@_p)L z102Vk=>^=>vu!Bx7i+V;wYlWK2m1;ULB6W$5Nx{o+n?@Cda-5u?(lI@hb^2Hi--3_ zOdb1t9f~hr6e&mIB)E|@|&?Rb_L`u zN>SF5!nJPuSmb6SA19Rj!teDbfPfD+s_O7f5mX@PrD8pV=~^i~yRXM}6<_viEGBO> ztQrL38zFoX3#3abm^HYVie^W1RA(VNKfMznv|9tli_iATw~YVHFuYYn#BN#GUQ z_HgEDu@Z6As=tel=IoB9IE^l?NZ^CDs|J_It$FW38-4LEV%Hl$8+@6vryPzrR{efvxY;;W#1aZa3tB`gmU^>B1AphN z!sDJRvxlttRNxuyECVpI4L;qcF^sw*<{+72S|6B2J(oKU^zcbX%~yvk0HF z^ULjfKkqnhE5jMQtfteR#eSDAxnIEd*xQ=VY5_}KpqSK{IpgEyB|ySNu?)yr}8BMnyub@UE-`pwfRK@uX}BnWz8 z$Ue%Y&~1oRd2X`RmWAgMALUpNWGb1MF>_BOpmy%_rLcI;=-KSgg znz%g8rBbJ_QL!7AC4dzo*CEDD&Qy{OI7d>Bq_86u9h<8ZP1vj?X>8S(K@ifaQ%}0K z2t;23x4;?}c;?OMwT9W{OzioAPOw~hj!ffHEBTBN{NB9Eywc`f7Ax)^4iaoJ`1Zsd z@iUARa?!WB^}D{8=IMgKA|H9O)uhtu)6QBC<4k9DpU!hHM<8F|n7RBZthE4}r_WxK zLFruyH(5OCguvjjq^7M%idlA0!6YTnRO+PV;vse5%<~2Eb+};jBXNtK?ITS5B94NI zZTbma?*hQTDz?e2p%6ABW1i%(n3ef*FJW%#hEP3`2eWy21IJZno%^Lr2Pc&%*5c|7 zoBkoo1-vd=*(SRi~n!PV=fw}dySw1ZhO-wO0;e!+5+^qH{%9A$Y zK1MGD*;J9HzG>;dG5-J%JW|1<&IqOd$MJb24|AwKVYi~$dvPTsM9Gpi{s=&p0{y~m 
zp)dFT8cV&ig`!Av0h`%nEyN!+c=|`@i@|;>p7480_@uU0OK8S~5T#!YecJvJGOu}6 z0D?#yNP5aeK%1iN5Is8|F6^k71K9p(+9YjSB4VDS>10?B!bu*(YivJx2Q|0kSv&q4 z?LY7cv$=eIGR%~#O4BjzBfKkxHh0PwuPG8?DS<{GH6lD zybz9^hT+3BPaXW!zS{80fWMYdh#X)dRR6Kh>Eu(ZPKB&ha#DQGA^YU^)ZqcgOQOh! zM>n412io8~Chn4+2voD3xmomn@sx$A`Tnkh4***HPAI76ii43^dt^B^Mm3zqAZ2D` zn-1{vO}%%b{K(H9&Y57#T%?~hTOHCX%aphQryr}!R^s;}JQDa^QB}K29YF85l~6l8 zO&fWpXAUSyt*bt`f>E@vNz7wz2w!)*6j=^_{iR@eCd2Yo3;1a4i1>!~?9fV(YNmHf zSC|i{W}_G3u28}wJ7Z||x)`|d0!(Jde(^#t7ea3j(eKD+AMiB7p#B|NcJ1V7BB$f6 z!hX!H4hUb9!{tuVBx9YZgV~#X8hP)SnI2l?0DpaJ(o%{pQB-@J#f;|#wL#X--p_LM zh*kRK)MOuAE*IZW-~2I;9@Aq;0LlEvn(+wj!$spX!DJF8(7p2EN81y&f>%bHaQ@J# z_uN5G`c-pHDB9h)E|%x#W{w6q`WxsmDV|hBCyg2-%I%eN{@j+9nN{7Gwcr6HJ2}d` zgjS>5*iztLbKuIy4t7Z5Gp_G2o&C94os4DjO)uc3@pcEb>5To9dkb@LS}FjmWWGj8X}%GWTZMiu<;C)X zBknYCibb=>pOD+P42`@{@e1Cf>}( zv=4|gs>cp~xU-W{e){ab2!d1-@a38fzvw@31%SA^0eKM^_KNm#O1EDNu0B`8yp-#C zAD+)}uwU6Mu^o53>>IOAiGr}aeXOw7mzpea`@o7$-&?uThrb_{RTHy{N|&zoEV$}g zj?ymp6ZZ)|fy9OA7WSJ{i)H~8{TsNZy`guu9gb|EvEDi-FZ-n(8e6dy=~IrEX~5dy z`xrf7({-6fxCINgaR(kp%^$t#-X0^gg{A0y7oht{X})W4us<+!Z>-X%0_~LE^LzK@ zoO77Lh*XYHeY5oLIBR+-@SJndT$Vqw^%A=|%6Nx!}Lw>1B-8Q&lf>y2@lK zJIFm7fxHeCh)b7S`*QT4(r>)oz0+wK;mDQQi)fI4vNty}!NGhI1bv6QJUT+MM5?ZB ztIWQnNpaz(r|LkDBgwW+4 zb?E~ekENSD#V?Di*2~hyV|q?HMUsgN(vlfWzbZIU*;BK0Nd%;H>g2z`ThzQR zu0Q{{UXw2jFz-bp5+dTpC#3iWqxXZ@;ALFkbR_Z5du>0>J&9+yM5=7m)c!h5WjU4Kq#dngnwLBpOuX?WqLCrwWnC=E ztur?$1iM?()(m}TI+9o7*KQ6c0*_&lEgDL3Z>wK2jz0Sa4HbGkHslMM4#Nc?BCG58 zPkndq(Sr;@3sa(9}$EweOdk!Tq`O>6qFmJCB~X zx5kD38k^#@D;C*pdSuilaIC>#fJxoU5T5j6w{8rHce>xe(HeP+Eyt{(SetSsHty@~ znNOM=!kA*s$|J4$@mBQV>W?nbVqwL2qFSHDFB*V_CqW434hZalTDuQ~VOD9!pF>;~ z^XufX!ubp$(CJ;@32IaxwWDVL_BCLUoGjGKuRBM6bXj-yT3#9ERYGkA4DKc9r>nBu z;~S50hx?Z!$@1dS2XE;VQu(}p%~&m~QJ|vf*X^}%BE?p$pRd|PlVlGoK$y3%L*}qN zOz_>DOb8rW5jJ6rzvQ1IHj>68kHCq2*0eK|lK_K+Kz7EID+YmF zj*3v~xSPaJFyNTM9d|+JBf3corY#&R%0klnWJI|Uk&+#e1O0VD2mQ^UYYj|-;mtV- z-CHILqZBSM-kjX7Sj#Y9ro0fLxA%{iGq9C{A&{}Zri~fp`Fd2RhElj51NVhxf(Sj{ zVlF-pk4E1Uz~uIqbIow48e1Ffjz9h|4^c(bT0jTMsrQZPw93$NS}{gTknBw~@S~oB zMUUfg{LtaI>*r39^GM9+*kFX|O0bYRB!{oLj~0~z+j@fK>U{>DEUD&tXevTJ3`P0M zEjlcbtU7%n^PX!>a2WQ`^B1aKQo#1uLPg~tR>inO`;L3AMyF)!g%k6@DtFEgM2lcN z<3ttcZ|u|n$8pDbk!?es41I4)M6Ze@@q0%Fj!fzAqGY!NBPrusn4<3-Dw+aVPjPgE zWw71D3ioRc!q&(=T~F5V|7q7U5^FlnzjK5c2_DApzobVni z?+14bbBD0@$^l#tX3XX5koyd!a5H8ZlLGF}f>yJQw_Z@#Wz>5`@2%p&UAtN zV^P}6B%b`80B8q$xDG%!AFG)`Y9-*);Eu*lc2?^xB`?I!#37}4syovU$#(f1iSp$b zO&|Pl+!Ac4f8bCf2%L*~=sSqkr2up8RW#*_D z_1e^M5Dz^aET%>7%ANu@w*-yobJg_y&YDE%K~7c-Pn#*HrPALXKx}cq6-+8LI|s29 zIp7UP3G3xfZUu8jAKi?Eq&N+D(|X3TE-cYl_oAq-((P>VC>XU;Tf0}K0XXr>gOgu1 zXv(oP`#rAPPYH~E<+DLT3#%`lzMJ87>mP4FmDBh_m@gXj>jKwU&B~+K02_8|9{9dW zv73}?FH#7xwK4dm!}La{ZX%*cSI3mhYYZe4EMXV|L}`@k?>Yp3B651S{Tva|_UVgP zHAB_E;+)VYgn%6H88X5N&d#Z|?IGMtbKQP~Jd$*aI7gz(Y;6e111o$;1rNi4LmsH6 z1miK>Qd@a5j(;T`@iamYw56oXEa!s_{2f9%RK z&tzo!@)VhU3lwYE$_y0`liqVMu8+M|Na)(3-9uNO!-Bpy@DS|EL)Hur;PCMh$6OBQ zY>B}t!MMnP*Jg8pCMQU_m`d909xF%x5WbTJn93={fnJ0wsiDewLfw~gH}BE8BKasw z|240w!*osyrjW5?s=hJul*vlkxoJ)un@1KDx2=vJIC|lm|2!KKzG3qV{B1%S_@*kA z3@Foy&?M3Yy_GFS3qf~Kl^x}S#h0VsQ{1~Vf<#PfEA``(^4k_cO^4e*sT&wt|B=RzNT@21$SyvA-0|=X^rVDPIEJig7#cI9$W%lVSrApq zl%mOMZRL5qS+8YE+Z|%4rY?8jEOB;@3lubvJU*f9`>&)A|Cai}*i-P6h1Up(XOF-@ z7Ts^PQ4^cJ7%8kZ{3H*Y(nfkJxGT`?(Qrr45)*U!DVJYQEAmns$lhIz0B(uP)ks0LT8j zDc^SEnRAex^2$SiLqL`~(3V1eON-bFh`>%~|B1kc4vPs1?!AK!U)SE|v12I0kc9r> z+rd5Ao4;^iKmMuJk38|4w4#M%l&m|sniaP@#xor#mfY-3wz7xekHe?X{=p>-R%!Ug zqe`9HYu4AJ#H&m9hJ>Q~M6c(}u0@*9TxBJRLpbb~`O`vAWj5>FKtJJ!<&ierDcaVR z6S`gCv1T5Ttr@*oWak4J97oWeY_W{SiA;9~FOhv8!lP$s8rt7cG74r~6Z6V*awiHq 
zjC?wnYlNlbG?n`*{BAvsOay|1a*)vmv@{-{XEjP2VZpYv0;~mWHnP_PH4U>1j6Uh^1TCQ_^F-Ye3ie0#ofLzIj2Yd~E zLr0{(UMOJ_TJy*F2RJ+K&{7YJ8d0D|+SA2nGUH{u&_vkcd#XQn8O^iT+OVnFgaD*|%u}^%TQ}U539&h+yzj*hPMyj%trRV{3;Y1t;l?;w z@`oGspun}4fNUwELlRZKJoPrp|5 zuCJ$}NUxdSH%M(^wh(g{BO|@6!wO4{E}oxp}X$Z{r1uFp8q)Cgo(PJ6RZp2R4v~3Y5^5Fl>svIfi_6Azj=mI$mmD zYxc;))1h;J3;G>4Ihy6)yO%^j1!9O`>B+ZAKg`cG(3Stl*<9;7>R_O#61<_YG?iiL zP_&we{aICLg<}6!WsSrps4a_n-q}#F9Abu=YLM4chYdrLS2cpqvBtD?wba_O(}E#v z-IulL5O26>m^YZ9|Ay_iG0ld{&cGlRL#|H#72)QKz?fEmY%105zuS{m9)eeae&B;! z;+8%$_8fS-0&Yj$D!tE?!}^Etr#dbU&%yyJ{#OB&iaAkW$S}gQSf#YyBUJ!?MCG3_ z+f9r5%zyIc{FGlw#a&@l_8j{)FQHu;CiNgA?KAdy=aI?X`S#MiQ%g48=&OO#y*5q) zKPuY)4VMVY*zIhAZrsVaGk=OkF{iw0Xf@S+if&q5>{s$kLct)WDDl6i4;K7uDKohA z^y(I6L;mXBw?VF|3=ha|)mMzkqDDlnaUODjaKVrK1s^!H{);~#rTB+eKzaP7F^mJm zH~3Zs0nOM|vpZk@7q9RIz$<)y#VbVS&vQ?y>~(z6@+^!8_OW+BoPZuN?En6y%m7j! zwUNb2{_9_{p$=DCjVa3lA{X*&Oi7NL8wkjzPnKtP%)Xz=j%j<|!K6w6QY$!E#_Qkg zc$GpEF-#VVxaA2sl(y0Gmv{+jv9p2<=kKbh;0b;FKW0agezxV>2n;IJihkWgxqbs$ zS-J#+kAN!ibgr}T2y1AwsHvWM z6=z>p!gpjoCg#CX>N#3!MctJnr%NsQAHxG!WA$nL6pKD0#JT}l4GH&D@c>Tqe?R7H zTtY16>(6s%;v4@zKmY&Z>)SS{%6f)E74ey+;x@zvDS5njdM2=@e-{ZGx=f4qC&?B% z@N8#~JvKR+h2B2y%j3&) zgCd~F`~IIYF#i`;D}brw{@AEj@hjV0Lb;EFzR;khF;H_!(FLtc4BprF^x8Gl3Qe z*&pdVd)G;~9$;mE|ukM}7wk20fCuy*dow3YTRoE?u(JBWtF(@#=V?CjDS2!VJ7aACBfDNW;Rrrj0G9QO^&|AFt2k6P`S$z>=wEfckU#i5r z6_|BkVUN{CXf=KZIA342RhTC6{!MDdz43kfgH3RoIklt3Vh4H6OTTn2{mZ|l=FxfyDVt4dS~o)<1ye`zmW|J<6KS?l0D+_>at#A7&Q#w|Ej`yDLtX8fxQD zlqc|bQVV7XX#O2m`BJ~RUV6OGwLyrp-9OU;_3yhr1R(!`O%E#hbG|S z*%;)lwcy1RsLR(48EgJ^gU-!0F0j1YB?5%@l{3T#O|n~qAg&V5;CFs6zj?=@sYl1p zSZR)8FMl^y({Gx4B7>}eP=hPX$YEgpBlW{T!0k8%w_u~5OvGi1uZ*_ZT~LK7*D|2_ zoia89g~yHLfu~K}P#}haT7VH%qlJkMLIvXes*u_yby#=8@a6rmKm^g0B9WlY1{)Ck zj>9|gqq_m9&cXihGfJ>F6;uZd-)s|MSO>)%{EjCdYb}8&33gL4MHf}qGc)|xB#Jci zgeAXZYD!vy>SDexd_aeCRps?1lWNgd-{iB)ugg5Ps%=iGlj)$9(cBX424EL)i&8F5 ziXiHUC?}E)BcM_;WH=m3VBD8*Ld(rt{R6Zt@8s4k)@J7vyye}P8#rF#8(g=8rByJ2 zFn>`jAW^*fkg>o#VHTpiZqMIUtb#%4tyOQ47m_;SvNAlVJeni@>x$;rS>e#v<=b;?KMfBC-5a|H8MN1SP( z?ZwA#Jikh_1Fl1Mkcsy} z?9!yUCfxOb%oCDvviDqVM7V44$GJQ;)?_w|$iYJdx=H|7vsm75)+lWgh(BIYsLuX(en8etS`Hay5qh4@O#}!7dr;CPAK(# zL4XO_vt}NX*rH_sjr%pPWO6UeUgL}DalDS4L&l+~xR)r6B9Pgz&uKmKIXeHJtSfVq zh+0}GD+Ps=^ZpilF9vq*Y@lQc}YnAP3!IrEXXg~c)Hn@|xEPTw>d;X@GV?V=O zXq2cy+s_IIaSVR$fUo2mBp3c&)>tRIGuqT#gCk_seSsbu|b9Q#rG6hWM~m#FWD-ww-({`u%td@Q zZjHV8flWuKBz@g2(%+$r)ISW`uAZsXc&-{~G+h0ZhA39X<1f$#Ovo5p;t+;yaOqmb zbi*Wcf%976`O*AD=+Qygz2`$_NS*Q1z(PpGlcB`~ZIs@34eaEuMVFO}>*yePb!BLM z=i3R5RToG6dv3+1-YV$hry%tG@{0sF`C{vjFnZ)Y+Gw03{1f)rs=-{+DhPghvB4QSePnMu z4=0RZDMF6mOp4<67wW+TA#t$J(9-tqMU41R9IOxZ?=H=9h2g?pl+Q|eZpi082%au& zOuSzHwe!eaGU3ZY`n|W9k9^d$R#)?raX^AhdJ)d0?L~{7ABmrJ^N0XJGKr8j`pS9m z(^hs;mroO0iJqcu6YOgF?tJ`k=FrK0=L`CqH!m;c0rG19i{v?K3G&8~tm62lA0sT~ zBR0ytn`)0n^??b_Vzc?M<>C0m)B7~TW>wyK*1R0d6KRk35qJTXun)pVgeX>ztD31k z&z<2^F3GoQXQIX+9=S1AR(?ed4YgfaljSEIymKHw*YNae!D1nk0d;RdSZ4}=FPYBz zlAAS|H^on7xnkFP?rYJ?YyTCoOpyH~dK&Seq;8j@EVp_fVXE$#6=b7Vq+95>uxtxK zuw1B2kYie`H9)>%pDXt8VDtiC zEqcuOmnL)kSs_{|=mJXRIcX1Q+SUE7z>NgPw>`1LbMM=r8jsxu8^7CQcfMGl9U_~R zl%U&17$?FLxSn+fOm{LSXM5<$2-!)wyt`Fat+6qD-B`BYE%noauEPLyxub4N97K5g09ee9-#-cVu^i0xy zoRRnd-(OzQFvBxk8TvWFo?|qr)6z~*+5%htD1AyHE49^%FnkiUX0Ft_$W(yq{#_`I zz*1i_XX=xOYuwsPl|G)1d`lKnv2ra-81cBQrS-gXWZj8CX5Gc^n4^r!@5NMh6)JW3#d_S) zj(pn)%ZXbXTk9ALluhs~VzGVxDOj=ZeB!okTg<5_-1cBd4a?7Pn~yiTkMxG-waFNVSf?GQx>_YpHCBlgXy9l4=5UgSLAr+VWmnM z`T$&6YWBF(0&lvQ{@Xgvl|7C&HV5eGbVr-7?Th))Az=k{%ow4x*PRG%-OccJFa_W! 
zp_)%;#!elwS$5b(Kr!+ndWj@`VN1r*!tmf^-UHjQ@~5d=&TkhmP2Od@>L8xwnojnU z9jNZAFn~*Pi?2+Hyw6$=Iw{(ee4#G4=2;sZd3Bb_AqkxxVWf{r3|6b|{gvTT*Z@C* zMo>(fjnn7Lwe?F9sMz6m&4};dt|ox2+Fo|YtH#O8^Xl0-cG#PvLAPpiw?@I)RBo1j zjG6-oC}#5&>4Wt)$5cE?4x=i)O9i^QSYZPmco9@-xw+24!uj#DZh)5o_tbH%;Pf%w}WIQ0-j@<6^_FxV9~84+U}h%nllt$;$-4#P1wGz_s*qnnCXmuSVB)SZK(QCA<<&nJfBJ=0 zUr~2bP+m70cB5l1!Myn^=^O8ei9l8OHI{^OiU9mL(&IReo*s6tIoOJDW`eY}{#tw#|d_)#xfW%IiNSE0uj~vz5io zIik3w(mP`d=Cop=(H&*Q)+p2AM1Qc5d}#FBmH>4@hG$B21L3N$u&Y(__|}bo+~pm| zI&f&(!my5vSjMbL4x&$#*>>I29l}_63%Zjn%gFYD;sF+hq_JAcOAuW#=dluJXQ^AK zWl(LZEA-3VkepjTqd@mmHluD^Qg{Z>AmnDA4G4Tt$0?kmrkfWgg_$y#7 zWr2$qn@I(`{ujIBqNX8k31Gv2PcuM`02A@%9X)Ai&z)f2tSd?xMSL6P@`6#Kq19Xu zmF$Exi_uo?1Hei~i2?s2H}qxYlafaiL}l zPtkV$=}In%G2{2#N#t_64$o(kVWtoo-Bf+u#$@@=asvu}5xeECU5!QDOpEktRWLxo zVkT!mFwVm_zHENWUj7rtF#rQTug8|8E9HHs@siA^ofMrkr&({ypEEaAYq38`iwLrs zork%>d68Y~`1)-}2uFs&gW5XZeQZ?w5ZQTE-Yb|R5t_&*LG2M`*&D*Nv=gW{8o%be zRT#}+fs0f2$Q|R+sHXV$xFVj_nlw)?y{eb+OY||0APMIeOUAD)z6xhj%0rSC-WpUK z@g<;lBJL)R^2aQO6RM9#bcWZOYWNmn=%nlYFpKXm0#IJ{~xUy$&y z-Q0UWYjLq_SYHTcL1(Kvkv3bN84evxN%w#Q)#Oc!QU}y!ko0NMgF=>6HkvCCUQCY&ymk94(&IDa?T(_w)$ueh>` zLi!REzpGp^&-GyXj-b3O`AxX>sr!}D*Sn*R2>;@X8dV8 z`~39D{UkhbVstYvMZcamgHDSkDu_&He_WN9K|6VS-+6^3>?~8<0?16Y@PfwFD~*(@ zwZCb2A?$+J)&6TKe;%0?%a@r5zaKA{5HAjRq2+-SH4_9q9x*+t!=)At>u?sMfDSQ| zkx?-Fob)Zt;s>w03*hYcOBj)jEU~P47;Qntg0Mz;MLJ-OmyEm*)^{9ZZ`6zFnB_@d z2`evl?}iz=B+WDnFHnW7+ibKD4fOnL%ahZeD3!}S{e9_*iTZKva-$~s!E|da4;9QR zGJdV$Jx8fgOcC@?X&fkm=CNl(rl@bdRE#!dbutP-D<%nunrTR{(8AB`4W^1x1)o>;m&vKZaw%@5l4|gRRr*&rCNMGc5=uI}he!#s3)h$!y z1mH9gB42r4=5@<^?=9Ai*F~i?Yy32V=7(e^Y-E`aZj~PGqPzu$@xrJS zqL+UoyQxV8L(u}PImcunfN!kZZ>lsqR9Nud3b>l`x*0(O43tkIl)wCZ$e|p~)pOQ8 z8;czh5lH8tpeRkgmm?=DJU%0MQJgl-N2F~EL05CH`NSk*@@0!dgmpuJ&hK!(EfXLn{if}ong`__hUd($IP?T(tRYw11b zUDiCu@OZ+mGwhgbuHBhx6uiFDnyK&DdTiiO&PL2fm<(k!Nq<3KA_}@He+n2a*GOk- zVG3e7@Eo^HGR(_XgAtI&?ytgbk>6`Dz`~2;?eqeIofn4KJ`2j>=kq;c)}0>~+K*5L z)*QoiI9Kz}^h(M=vhKj6;EJ?wk$-j5zUwR2WtFPP-c_}SpRMOe&Zs;r z@e>g;Upm8l_9farSsFnY)~C0?qVt9kjJ_e-+EGj*BQ4L*tvtT{ZqCa0;K@zsfQ;4# z|Lc9nSfR25AL;G9$z&!Ps!=t1@N}C3wt-*|QA`QzQ#f8u?3=#bh!x^IGsRYqz?{ja z12FFKA&YV2jK!+FJ37&YdmL%2OS-TVGu_WO1kYQXo2#IV;P13UtKO*+is_)luMc$C zGFmkVt+l8hN>q)BXT#*zWqaaP2YeqHOC4dF7-2G^kb>6Gf4I9?o?agh--U;h1am&< zy&@iPSq3R?HLREZ=Uq9uNSCAY%x=q`Tov+N;rk03VYz12$# zOIu*FrH~B%6`=v0AtfT#AP!kJi@0)gbWs={d=w1#w;mn+dckvi82RG^#mD4P;Fhu% zded7J3A72vDmdWB7D*dQH?IFde5&VArsM3E6xG$;Anha<`M`$bsi8F zm8-%VwYOq;gbkLrQQ_%yU}~We32sZn+*zxa*BP%`<&Zvzw!@=CRVxNK22j}YT@ayV z5f-bL33ahR3l|JO?s5d#r+2P%E@Rz>{NB8BB{gQS@p#5D=U056)`-;BK;kqLGr0$4=f*_yywk24{+C28Rj-M0p;I)?GL(MD)%xoqGy9Zbd<@L*Gr6S%n5*3}cmhODCC0B%xS3@as%qr(YC_f{IdQT`fS2PeCv zG%mwrGtQwZ@8gqim*SzEAX+iFxXo^ctsbFO`?#`h>@xO-v&V$q_#}nwdd=u}#7(eV zwNK9>R}px>8gvaT9J>l~4t{=eEQnCL2#NR--q4nj1c~@2`6EKFLvZ=XI=*B#FhL>h z%#77ZruSX`SZ~;u9hA9<_Z>kS`ESh0yP|dpYkL{Gjkz2Hwcx7vN!8t$D`-O1$A36^ zpon)?c5t3H320=RNb%@Zw57T7!?k>xn(+)W89I6Dc&~QX*}(9QO@97Sys4T)0-m|! 
zjAZUl-@yB2RTj9-8xwv-rUy-c+L8w_CBLMZH7ik}TRdNr+y?S!`@z_w&hWijK5T%B z?gc{DuTd!UB}MxLmrg6Q$V!AB_mTmLzf!-=Dm$|rLuO?lrB3#Z4EBT5`1_K(l@E^o zYdI@7lBijBKV*udI5rsJ&%ZNvj%MBN8Ndq7efl1!MHD|(mWFT>xG!F;lcx={T6+C; zapkp7d%i4=#{xGUoV;p90<(^szWr9cq7=5HTq7qLe_AxBJcoZi&hd3)K*Zx_cs;EF zq>j9<*#qtHZ`Zd2DjA)}Ty<_Yb@cpw(10&$#X0gu(e;P5*eCyiT{hi|t{Uy1hI{(D z!i1r$rocR(z1sbw7iq#3&yzIViCOEsTbI6PRDtu7)V4gsE(N8d!X?Iq<%fRmdOpnE z2BLvLe^Z3x>oe1?*Ap z0^2}I_(PJr_1vTcrJN&3oIogq41NyVo6>zt8_3s-LNlIRCX|OG;)KJpOyN8ivuzK4 zc}POGB=Xi@HJvWdhwV8NWmR7aEL}&F^#J`Qe8RL|UTLf?0z+1or>uDT&{n+GKIN`9 z^&R*3+r}ik)XjjKmHA&`WP@+|K5!LK>NOwTk#?AtI23+0>t{YX6YlKyM(jIf@1ggL z;8`|_nCGzQOJER>ae?@w(@C|z4_EUJ)R5x%^`8UiqRUjey40UyP9G$#-Og%?(qoK1u53 z3?KdmO8e#kousJHY3*I`8o46H>BF1w!HF8HpNPLsLd)&`lmZotLg^MQvu%eGlMu*g zWV0BgZLQys^eo5~1pQ>g^B}im4Ev2a-O{J37}{IbUx8XexFj#KGi1NI5>Wj58Hj;C~ZWl+VpJ zBN$geQ+cdXxA3b5wu_~#1^u<1ef}M{clX>4@p*rms9XOAx<12oqB6xRxEfpw%X7lf z(dTC#F|VjEnBtvgyyE|m>-y$1i<|8y^_KX_aDPdY&F~iA7rIe`?OyiLE@VS9$?QT@%I^&8+PbJ`C?Ia@H4?!VS}f!mnb1EF$()= z@;H3iAo}^#^x`PGIlM#8_(QpZHlw|xR=L_D+hVDZa%i}7s-dzAlN$c71fqW&N>e@Wbxydb42Q*a{1`=!67bQj zG(>4oN>`Y)oXp4h5SohAt&-@QU_YPKpOchz-de9+-8#>c{58a%Ym(=!h7j~&+4n12CZ!B<&}8L4%oRCJV^zb-tkrAPBT}bV$+CEmlZCsr#U+ z0qYp+RBoG;vx23tQDMh2FP*kW8vG4>wqd^^L9-cUxxalRE^+8jF@L~y#YX?(vi)-@ z#)ZW2jJfZ?6-ujK2U>_jgT5DF;FDR$@_2(XIal@rdj^H=?6(Lz#6>ELeYNVr$9O9b zd|56HsXr0?vvC%_w<^kAaKLxlBWXm&*qK`^2})iGr|Ov9H5sJaV>hz&f-)}~37QID zJ7gKRyE$A)r?r_Po2%7=t;V8uJ@z$ho{MobI(hwtus{3&H^v(EI>fq%^?hZDZM&?c zlUxUeN9MWaPSHRCfqzpumzx3;V+)4I;c}#>a63N?6L>DB)>2(0?3pM^CcbpBGr1Rw z&2$UHaZ>;F6I9aB~Wd>JY|ZcpQAU zpCcLmD}6kFj zXf^J0s#@vx@RL$G^6JHyrb<{~5$W^M-pIw*dgi7{e_Ug*GF0h)VxVrlXM|o1u6dHu z%`(;P*HEZaV{wkONme>g3e#r9+ll8c>Q0s}_CvdcQ59gxhT#nN-!sgNY|rD~^C-V~ z_l3v3KSvZ%G6bj#j-NNL(G(UJa_(bdw=OY^^K}>%ur1a=mAfy!LhDJh!#S|z?6llJ zHoT{K1({5UzoFs|zCsOZBPbzjONu`!`1{LBw%NSz$mIF)u9@)iG9^BjmjpZO2O=p|c?UQ`tQg@BHp=b1}p+>KTNyEo{upu{zRsLqwIS^tfz%Y~y0l zb;CFB2-E6PNhu3?`LKf!%+fvE%Xfge+O|YnvGb`Xub6Q^goyajPJMKKH}7F3kAt#l#}vY0`N))G<&qtJ$`mW~(JpmaQ#Itvc}Gb9n8dk`)G@ZhtNByoz00t6(WoUn8h9t|0r-f(%6PDnmuuO2{H-sK^q_+N5uaGB zFGJAV%kf7<2Xu>&6;O=__;fi-W_{l{Q5{)A=*vjy7Ie zgb0(H_05`72=lASMk9IUE||L~MXpmcilE*a z44l)OL%J(iilmWr*|3Zc0WAcC~)Fd(h01^x{}uV!rqFBX{XS zV`z9P;6{zN{5OlwVSontn*4awL)S8#Zrjmw_H<8sa_Z}1#2+#fkk(}y2C80*qS%tvQr8fMHdZzh(HfP)?J9&``Mi+sN)lqP z)s~#G__$I)rdsDfGkqPZQ9c1&K!AjX6nU+XFQsO2g(^dN0Ul#|&6$OJyP7%HIN$F2 z-?kK}PY==}t|DRVP7!dvMeRxWm1AY8<8B>$j1tQ(HX<6}kLg%{ zSNm;(>ehqztPwkXAm}*<0M{EK0(PFH zRtFp%xrXdI(I;q?bQTM$$5`XYm#d-AP_~?~NK$i)`K*=B(ed*5N*UpnYUjsI&ijk| z7im?4w#dK$ruQ+rD#Lgzkyu*3_`%r`L`1>H@eHG&+Li|4z!|LJf1-A-L2t0f{RiY) zbCk^VEY6!Mbwu{#a3g>C_$q4YR0;ZJfbXY)Qoc&u`L=2kGxDL4U(Q7b#f4ssab;Ht zXBG!MQWF|c?$zSW+=^Ei9@gy19Co}Z3P_&n`%J+0cp-&WrC0F++U}U~ri7A3p6R5Z zS9(VWd3u0~d^%lB{;L@%j?So=LQ;W38QMv%q)SB`@B$?{Jbz3Ra2Do&gL*iVovcO+ zn1L?%?D%cA0;(#V+<1_h@8G7_Xc*`E<2Gs7OytWef%Z=jep*}P2CEtSVI}$PS?*?m zSuT=l++v9G{q`Z}C+=-poi^K{&t^?JEO}gU#z8o|z87I>@v{u&uEVIP0&MU&gFVGG zy_1#9X0!_H_WL1vyLem@)5SWX?7KXhu5lx>K%rJljW#}q_O~LGp(EWBgY!zEg_pyO zMcwML&4Rx|n?Z|~?p2+cY6Oxw~K!ygWDxIV10!U^y5pk;$&2rvru>` z->XFloK%7DQ7;QEc5L>~+57g$C)`#sg|c|O^Y!7BBWufW%XI)(Sb32EOZdarOWVlM z$K(ED=zFekmW%a8(Kzk_;DWC7*6GXiA23M=`X%4qWyyg?TOyFpI_>xOd)>rNOV1(CW6lbs9hd8ak zba{bxajJ*z({@O;o=|4v|EQpjxeWRC;-Ms+v4vKp2S-xhej1C1&^@0=<}3nBS@4%H z<#gd6O5e(+f4r3^n96;t!&~ss6eC%#zS|93%vrH)hvI0==%@lUIQ_LvYx^OhMjJN}64M z^?v)K8Uca~Lb2GUXV9df&5c4`l$_8}=I_15!ZfqN%Un8~f%>{b$JL;gySZU6Mei>8 z(kcQNLAraj;r8dVF6ZfEsEph25wp}k>Q}UEMS6;H7WwReM<|7BWAwTO-m;hl9_i5O zBx@O7#NMjMf*FpI>Bd)c#Mwf9)3V3fU8~O@TM{iB=?qV%()#W`2UD3E{}E$saXdYZ 
[base85-encoded binary image data omitted]

literal 0
HcmV?d00001

diff --git a/images/img_mobilenetv1_inference.png b/images/img_mobilenetv1_inference.png
new file mode 100644
index 0000000000000000000000000000000000000000..931442dd849c68dd4013219575635d974e092662
GIT binary patch
literal 72038

[base85-encoded binary image data omitted]
zyd=m5vd=%>?6MVU7A1tCSbmNMh+}rYBVkZTVaBO%wODTw3%+xOD^e7KjPxqt0@+id z7M6o34uw?KbGj8KwU43WB|5|ACu-Z2mFEGa70d<=hBp`c7Eu?*e)~Mv@U$H>3W5-E zdE3FpBO=o%D^SqK-O zmg?8^hXS4fPdBV#G=O1bDgZtye5aD|wOmDz;SWIQD{ja6=|$0zqXYOh3ucolL-K$% zwSnCuxF&kqjgF2^P9)&+_4xw4i|nV`&cBY`oNd!EX_v;!Py~#3NErv+djKiEDX=wb zHh3MfDrLN=sl@%ca47(5^x98&h9A#58chI?AKuT_ye?=sheKA&z51%^wOhJsh1-0y zKoTB1TKL1)`{aYLI_mf(@EGA+<1(sW{h~iLkBW?m=lnb%3pbeS*?tGdcu4Fr&3>Zx zFtVl>-LYg$@O55BsYkCLhKB!pQ%iz@Q}aaN>7T-;BRV*1!&4UvhDr>Hm@mWQVK0~J zWt;zxv9}J3vJ3k~XUL&JL`q6R0cntip%G9NNhPI`?hqK#qLh*@=|;K{q@@w0yStmS z=Kc1*_L28J@#kC@Ogzte*1GRs-Rdv|*Lbc5$7A`?Y>nU|HMixlN&RS0w&axFlU39m%nKh6cB-ZJzATr)zyTnSwH3|A6@0_5qEwR;k(NMwr+j zZ+OyFL+O&RX0q%fT=Uvjz%;nnd+DuEVLgeKd+uX(u+k4qfHv~LD6j~uBfMM-`@Q1P zZwjhfO_Q$x9$?>=;3A3Hu5O0lkVAaL7xg#@8_`Qj-M1AoX72&^1fqNVPXBAh3loyKycmxZXg5LdyW%+9Pjq|5p)zx*6cWqI?-0o)X~%5OQ)DpYanoS z6u_HCy~W!T!_&5XB`e=MOIf$uN@+aT@}Y@1Z+R6^{4^0dr)z!^-;G(8_Cs z59+sZ&&jv*l{b12!a%ekSbP5y4Ovl&-tB&p-tR}$l^N&CcsijJp3V<>_@9qB_Oew> z%w<3So*_56EIOF+WPf9-Zagiwh??(8)I!O_Vkm2?3RJC|RJK+@&!u!jM|Os)zJ)9& zxJ>P4c~r@XqKQ~YyRVHDZf9$yWysZGiL9B=91B=vja5>(*8{YK`VhWjCWBV9yffSU zqGxy~t9Xr)ALvCybq5Da&3UG|OfhjC)y7(Z+S@^3_|C>|E#IN28t-E#u_06-{@oX) zFBmB?Rt!~V+{rqxGX?uIwa)Wn)xv_~{$;gZjD{}oY}Gk?2fZ202o7#`Nnzcft`$zi zTb|h7a`7SbLKHf9>j)krkGStH+q!XSNO7i1DA>1J(&j$=BB-Yl$#uB)zL=+}7xvi( z4R>Cz+F(8Uq3=$X>wNzXhHbSXNMj!+<13VUn#+g zj+PLHJDCTAR@-H%VDdnyQ?v75AK>7`jXmp}^I|H2-B>P?|j@yznZ05v|M^)rN_T73?=$N^| z(x@I7eUpNjqcCH@rJ0DS8pR`ezke%Lb!lc2i zHEM5YA{{~W7jm6?&GX)F+|ql|+W-{hUdz0J(+)pLG)6~(jblGyki;`df?!^(sP1TN zy_IIa&S_nJ+I+`rHh@&*`s%o0v*sY7j|xv_$Lz%CBO{-Zt9_3IgRL|IP{o4pJja1f zqPrDw&Czf%C)`8zuggXFND>m1<}hq)^&XFqihEGMR9(Xlmvu;UmEd>L>%6Tc>L+6+ znZ;+Xr116uiGs0i-A3(oVo0j}E5h3a z?ECdxF_U(+_|zU7#i~*og}PRjO{8-Q`hYG;N@&=wKYyEf_qmi5P>)XtYvvc~biEn! z{x*e@QVwip&1iwU1*h62L#${sr=??3If_lhP!wEgI#;Hm8?Hou4xge9B827}h5=zx zybr$})qc;N@`TgRfE_wAl2zk(O6Di*Bm6}M6DJgvc#$812>E@zQqwV;uDgT7@2FhzS0>KBN@^tqsgT_d5$(sb#-p zx%D_xdr9aTQ0Cr=ZO41O@4kj2HfaqK9K?`W0AxBc_(IanmJ45sDcj=2@+V(J1UPgob$D-@ga; z5>0cVOz#;`#3ziA;*qZci`<=2#&6S4OW+K3iiNmz{v#Y2sP`02%=njhR3Z^U4V zj_<>NITbwjgr#n?tfLR|cF|TEw4Z?xMW~^x|)|N;O zvyT^mX40b?1o(QXau?3AfgJ5bB!myp1f(nfBG!F48N&t>sN(NJmkrGjQMNoTw!e0$ zxtDJ8Mu@$5GMA$PdnXq71o`G!C{b(cR7B%P8(_jkG zPOo(qNBt4ih9>F}86b*WfQ=TUKy8)V%4Rwf)!&R3rPN)EU!N-Lk=6=tCH=0Ue@g-hU$LRw`gEEcf#PW96czO{0oFy) zwq=D`iF77i6a5m+vBh{nz8tH(=DiN!L!Fwq8lQFyPoU(#Qp$P$%{K^xd`jafy$q} z6L4}$auKh#q0u^f^LvkCLcV;t)Z;syufRyVSzKbD?{Vw-yt2mW3muDv_Rr2wo3Y<6 zbB?Tpx^*cv-!-_)LW=h0s~)F6e1;bfvxtlA-CIagY*Z2N6|gVj{nxKSa=Goc-USo| z36VxVJf=Ooh7{>j@i~w!Q-8Za5|g|wA=yoEm@ZSLVNk03qf)B>dNWere%4s&$e%y!JiQcw<^6+ z4?rv|82##-?ol`8`<#NA4mGfSbGKY>zFqW5nX_=vqj+J*l`v5(gO@vzzZ-ut)R|1P@j$+ndh2RKH_F8``N9 zMR^)8Fl*kQ8t~x(9_6DQGjt zjuFbo=3X{U|1}~=`2oz&`KjO2HEKeI&YqHE66!BnU zQnq!|h6L{+vfm$3o5KI3gix<27`$*_R*q-4*~#g4#-vP!Wb)vu{k7%>zd zjuxls7+zhRs+3u%jrVSduWiz@y(%v~Pg7_DT2=FH8cNdpZ6P%1C~xFjGIeoYF3u>K zc(FB^V(q#gtQ27F)yC{i9=lA5az0P3QIwO1`op|tiMU|NS zNf9GPw_Y-WAxVUp<<{si5jiiJ76SZe1PcA4O?H6-s2KPg3#jJc_`0Ejp-OLmUE|3AHS@dI_PPz#z;Sr z`{b$fWin~+3@!HxQsvDQlB_fxM9#Btc|1*u2QB(+S8Qjml3Lr zN35%I0Ah%QLekBp0SDu>e^ack?f8=nF>A(LjorK~Th++#p#8%7ivb0cY7U=|=!Epo z&Yp#fRx4wn5vqzK2-B$PkFE$mu-9C0uO2?*pq%H2dnXGuw_p)77>uy5gQd90Lq}K# zfSH$01_c!_WyCpfw86s*=MFX>70uE7O44Yc2rFB=ssu}h9AEzijOC-P zjD)k38voovk*ri3bp*_KmE)RYVLtp^7ducK*CHB%aNsN<8ryX#?@8c|fY-6*t?o;& z@Cf|yA6XRpQNew^k{0^XwQc(JZ7u0vTMs~L<5^J*=+PQQb;2_fh@aZ#1}ch86c&H^ z6~*P_^k$?ZtDoL!crp$XO0RF0>t?mJf3kUq4R4nUGOL2$JsTg(dY)#yIX0~n-WcI7 z10lP1MY}*1ga_bUHiX1p!xiCIR=us1H%h~>#wrF-7dM{9KHSlMN?rKmUknr+C591z 
zh>+L)Bl)lHztj``zD=WYJ)EoYDawerrFf8@mceL4`d|NMn|KR`LkC~a>e`yDVI!J0 zU~3MXHcd-W(rfZXPk;UMqk0rx12bAYUP&#+&Fc`>cuk5FdO_}%G^A|_tz`)xjwH0$ zXrbfy(RjDxgz?+xj$cv5%zrW_L& zzvcSZTlhebZxAo$`4UKpzDe)SK?|3!|3Ry%gqh?*?iM$wq$|eJ5|WTCPRIP`SI5Qr z2dwr4Ia8LCB{Df`xlvoU1tlAq4=F|)yc-ypm=p&3(^Rz4{gJv?Q-}!0=zm=SZ6rNm zySMQ86znwV)(JwZJ>m3d0&`6ODqH{!{1W4{qGxpfKLG2>i;^y&?t?+hMt`tEP<{Y# z4&*$sl(_Aibi{EaMXo0JjD<`@(qG}FOt+1` z!UF~V{F_0beMbairAxk2(=fgPS+}&bjE;h5Kv5}CMCD~TY>w8!uwpB-t-^b^-0g<=GDA&BbmM{eM^H#k`}(Xgm?hiA1GENzS7! zH?y;|E6>-=Rte0l_{|l4_1&@K7CE=kd)nQ5XyN2EZ}nfp)om|!+h_&~tX`3zNQ=dv zrnJ1-3bZwYNvygqCzL$if9`s)c<&y)(!y6!kMixqKFoXVOe85gSD%SKtE5Px$B3GS z6!uOR`<29EolQ$8j%tE!cW>HGamw%S=4-}=2`TQ|34DN9@ZoGN`uUNq6Ng&ZOB>1G zFe*kx0KsU3n?=v|xX&=M_;*|X#lKj(TzYK4M>>zmV``!WYs{&_f?8npUf5;;O-<4? zq2$|Y8m|+HYL|5r46kT*opb=ush13h-t}~M+_ruv{C@W7i{M|Cf#Ar z>E#~BaHktL{Rr^x5_Kc~`w{eMh=v!CpdkM2V%r|=Qo5n;U~WSgZi!@LaHyMnmJXvs znF@6POa(V+q_ob*DMSekSdU+ouAz(t5tw8VJ{vybIVbQ(P}v2{*jb}x+D|3B_GHSV_X#7eShP^plPyNCEY=Z&2H2;C1Lh=e6%^x z1O^vjC|hhn_sO)8Dn=##AE*O7sqt^ntqIoo5|elzM{&Gj(ma%91%fs0bTxeH&B6O- z509GsFpUTHd%N})yAH3gax~xH)-bH^VT^7!ygZAEmYnfSB;$muTX0uT)wwI8=lgS7 zt*L4OKfN12?Q+xbh&DYcCO$7cfuZ++!{zyLTlyj}X(5>mft@dVgUaIONnXy^q}RIy z**Z1la2VXak;|~H6a3ACtkOQM0En7e31~d;P(z6*D3a+g)_J#4aR-MR@}h{_l;xTE zzENtAz|U5Bmb;>Rx3!X7=dhlXR7fI2lpF$F+P>dsWYn5Iixq!86RvzVHECnZNMtyRnKV$fm zUEK%WvyWEc5`^A_yG*;uIZMV57!qc%4~R_yBpc*EX*D!@i=Cbrx~(@{N8)wfN=2{t z15!i9ZD31c)meNb-Zz1dMCN)$-sxs(e%u*KHb5m8yZF@no|_DN-GR!)?=cWZ z@eQCxB{thhXLqMgJjlGLFEMi$Ieq1_&YeNgSc|H1wfQG46F{tz)|Imu4jk`xS8FkA zDQvHAF@EaW_~Woa;aU;J_VQ~R{661;K|4k{Fa;t*cdS~s+ALIMbIA4ai*-y7l`!w z@MneX{9I0ogCMHx8^m!pv?%Pu#J#;tH8aY;Mz@_El~tAcxO-lT6DfVR-e2zGpqCAf z04Lk{YV-lQ-fwZ2gqJ1<=glsU%=e7e+q`ca;`U;o1(sOqj!7MyabdB z=!K;OuY6tI_BtdiRG z;f_kXOrKr=Rks01qZuyp<9>u82;nq6NwniwXoQoYCzA9+C6a;Zz!`u3xg@do zO%=hc1hQkV!BgzY7rBJ;fTx`*Vm4 zSS06m<+QZmqdpux9I1KtmCj|E$|Z+wR?{sX7*{PU0x>{8cG|SSSUF!y)dSg0l*A_llBi+y5LrX# z91K^W0rRV~4%~dmR)~o@G6`(_6SuH_$jhFJE>q1C6-fp_$VQr`E_F$ayoOzDhO$z6 zC#O6ogO0d3{4m$GU+WI(+W;f`bXf39ik*;g8-9$*Qpv{V0yl|^YqNxFm2^Jnv|JO< zE`5bBKX6o@EE6GP1{THF$^nwVs^@uFmh`;dbNmqrso2ozr=p!8P-u_xOr!#t|C(!X zpx=dJTQ#LUrK8#Ols!gi%_@@{pzl00>rQcHx#bX{Fqgj`AKD9~{0KsN0dz-a@!fs@ zqwZDf@Ak(KkbPsU^GEj1g}$M8e-UaPcAAN>dbm{(g)?V$W-gRGuj$DJkCS&H(0I6$ zC>=W41sPPNYzt=#hx3+GEndyMBp~(5oGifDkk!$Vb%lJn7PJBmlyYpnG?OQ8;3T6ZlZd>LHAd#^0DCkr)H4Xi&USR zMXgKf&q2|_td^@b>>t=vReg@RWvC0(?fDxyC&doO9v5~A(K1P^wc0E%G81A*jpp99 z!B+>&@1wNv1mETQp}vV409inXzsDGg^}3`E7v8EJRK~RxM?E469H=%{#9?&1JY=^2 zV;qg{+&>U{9mZVvq`xKPY8u6!yKx z3>U?=pLKNwqxdi8c{KfYuTu+F(t1SDpqYm&Q&QU%Dq91Q-i4Vy4y`o-u$|fhleV_D zIL_I4nTKN1zV?eGt^C-D-AW_&n5n2hlAa9{}&qyxfrMieq@y7>~%;r%cAlX|Mp zo%iLV_O@N$vUy>NXXt5 z;qIm&RQ!}-a==raFPXR#^P?LVZSiaMSVkPBY>Kc%Q>gueM%0Bzheoan z4zOei_^AY*lPbr``ksYtV)K{T>~~V>ox5jS-{YI+1+Z$xYTZ!0Y1Gi~JPa@0?vn__r=mDmt<4<|Pk%NmxtDIT0Mz;^$5 zvSM0+^cPm2h76iNdTstN3dPz-9tE6?5fF8fFx(gfJkbOX4`}PI%oD!%A)Mv!kkJ*I zX`9kx}&0 zCSLD3o5tcO+&SI$7!sm?c=SuC6??voZ8d!qMGzeq!cA2$|>a z2Su-=vOtn2bb1pX%r6sV<}_l;Q)IH12plCvU(p|~pWla1WE-Hi(;)0^T zpthn(yG`AeSWtd*0dx@*g@^~%i$u@S8z*-zY;tEQiOACzH%(rCk%>S4NXx1!vg6KM z*`_}&f}#q`%9DgA^GhQa9TzA^gNCYZCqzd(>aLwTNok^T@7`ap)l5VgsRi+6vR>}g z&?ym=re@43?`zLL02WfMOU@}Z`}1SF{sG<}>0CG*hS>49L8YT^-!1~~qjRsT{MQot zQYX;Pmv`7y0Li_zpWs~Ikd-BWDp~hrVv*T1;ilvyEbX6X^9($jjkGoUA;NhJdfHR9 z>I#IfAtCfB<6ryJ80qLFK+f^~B(u9>K?NYox^uH)vLJoBTyx^+8Q)eIA^sGaZ4*u2 zB8-a(1TUC8NE<*5hCQ@h>?F~pqm_>r(Z0NIc63^*{xivAOE4yz^$ZDzZ=7&_YVv+* z_>0Hl=_y<@ZBYEH0W~3cFmSnd5JBi5lJZu+)DQ#CJDkWM-~g7|qt0A+Sz>YvW?7>O z7YgaHNLF=)O!litg82O?di{H~bS!6!a<)CDr9W$vMP`1py?R1Ro=S}(KZ+TBmc(n+ 
zxMXbVyU>-?yM+GXZi0#R^;(^N%*mj9<8sSVG!XHMVee2NGMj#&<1naf2iRm}N3lz9 zVfbSq6Q5~sBSwSB*Md{jAbqsX$pg?m7}&jZbMX!;2!Ub?U-N^4>hneySDVbI1A4j7 z1U7+s=}W!m%#%4TJ9J(0r&o|nf}25of^q*)6&$&Ms=%ACvq=*3C4`U}gCNeLVoMpC z+)7OYYGLR2QZqY~bkchg*It(keJRZ*GcvMrYhPi4PC|cL?_p8l_Ls*t_Ds4T4f(s7 z;AUt6{WHZTSSUa8+Moz8bd z5v9_E*Hca22D)e2jrJ$7CB`4P4#Cg+szj_LgMWQmBBE$Ezeh#dS+iLsFc zLbC|SVHD9;Rq@_;QQrUgi6oke0%eBb3MVfLLQm8CQ3S<|5Id}9W+uh&2ya&NaBY~) z5uyf*$4E&|STdJaP%2TBla>9t!zoe6d=m}*;rGwgG ziWjn&Y=1^(htF+}l_|=uvg~2!W7{5#J|{+bwuVQCt-Q)28dTYMG~-9m33#W{IXHiv z_1Z_&?PTi9N;`9Rv>Er!vGT!^%i&}V<%zEBL0YW`e zl*l)~b2=pf%vDni!7;2u<9kb*p;Uo{RYBv4(Wpc?8OBmFT-?A` zt{1?F8=t+s78KL|0T30)X z6N#B~G)h$2nUEY*!POd^BX*G+DtFsJaKCvT4onEYl7z?d!i_NF#HnD%XITjp|w9o|z zZ?T^&8%Yd5J$brb-h3$hT@AITqj@3hQ-NN^)7CeyCG(2XEPO{|sB$;YP>RZ(*7Fn$ ze^SMqT=xW;HuI#$kRm9qO|IXtV~FI6zWyj+8^?rYnXh$nGnW-9TL(>ytymNR?mGhi z?Hi>zcuA;>RHehs`59i+0P}ip$Hv1!e@{|f*A%9JPrLcA|KF_(VL*ltybHzBZvibq zbRkBG$G(n^eY=RkQ@-fVL-UD0l>~(n|H0~b{|_g0o4bNu>Jq5$EkAv=`E=fcA=6Sg zPv8bc8W%juxqq9^7YjuE4Iuwx1}KpMZVQAZa0&m7j{ZZxw96nP8d5u!{wpy75&-c+ zy&)w=z6|>WlQw%O%2M43tOuIXQPN~EJ7)oiqM=?;iF-}>`+*GN$6)JwLiqmUM+KLM$~SGrNSaE35NQy>^>Bq+PSt94Msp-> z#S2I_X1~ZZ0)rI3eEn)>2vpE)Cw~9m9FSInSDk91uIS(S3Yw{DY3Z89uWzFcq_>6t z`v=Uzk-q>qdA|}qt8w0RTm}<6Dav=S5r`r{LT3XpHj;Qy$$Cb|tWj`R6ue9Of4qxe z%s>3b@BDB@_9}+vD%%#w0tsBc30U3I!C|8UXdoa84*mdz5hJ1@&1l>$9slJy2!d;3 zK>>!;LD)c#rjNs=4-};;#jj;@84yhkmq&5Hbg$PL!ED&}0K@2g#DAoN*c19_-$&E8 zo_O5|GGg-VAtDe$7R2W7{8v_=IDG)k-^WAiM{ovm-b)Pzrpx9S zI>pxq?Scaz1T5x;7+s(h|Ndi|5pz3nEwn^({_5c$jBj1)uy!+N6I zcibZoGiQv6OrgF9>BN%6h5Knhq!FEwtb;tXR8b%|4uo`Kc7pCSe*|QWD&fkT%Ex9B zX7ty}IH*}j-(lB&0JtcLSWmY(T&V3EAUvVcEYgQoB}g=jA|Iv`@uMD*>W?h1=7q87 zuoPc183&@hx&_BTmyv)!BSQC~<$eG)ijatHn?ZGA#=b}E+dRPNiyN}?vij~ztj?Ly z_HPfQ#Ge(Xpi9UE;3`Btsk_`%Byr;4?_iXAV_n}|UFDS9Bk0sq_)+{w%+x#X|2*73 z2p=4SBr2a5kU5_|+(v%&%Fnul4Jo7$V3Vt?b1^C_QHM1 zqFBPd93wuyISY57oa%qskbUf*CTk0$>})g(qH4bbK@j!VR~|@XPy*Bz`*wwqv9Z>P z1c*7gx5XkXJU*rh*2df;Yncs;#;rhy9>L6(yX3$P|Aj#81U|>w+Esd;!1N@JFWd(e zW0dVv4SjNMfn_sMcub7BIaU8E0J|CQ-L#x8j(n z1|NFOkIPTC)o`)*32UTjhE?GX)Ssdo#56pz9Hqsi&B;WkKtl%d=I3R8`&QY!Q*YN> zwZGJv@g#1cvBKvO=TeFvr_~s5HUPpjMnt~~?0V;2wS4gAX{rQVU%P3NCsS3!l7h-> zM)T?VgbAk+ohTxxl^BSKui7jhw|uHBvg_~PRgE|T{`D8o1~R15U)#0`o!kAu>=c}# zY`nC5d}?CX7Y<0r7Wo5zBjPo^A+x(J=Do1E*nz`QOWP}Uo(Ez% zAfE|uWy(4^{8EQ90b&+^d=M0AM!R#2dy9#aXAiw1N55)jYcwR{QV9)J1_<5SnUXjl zIQ|1Bs0hmLOaaT!D{gL#4$xaGHTB74J4vK>?>{>DseGTpY z$3U5D5yD=<;dO&}mKKK!p}g%KbwZhm@CZ^{EUZ6M=n=nu`=VL=g&bsn2swEJpGqAS z*JJ8AZ#;D%-xd*#-g6MU!32io;A;;yAV&#-6L)N_czEdS5j7vi;WK@--X!sB-ew+p z`sScn!0mU&mc3!r(CDx84RrSwATAoimqT0nU2J%NlFL@wx@z$b@#X$nk_eIuCjX}S z6U0&+KVdZhIsF!z75ljIWNWAxXm_DMCy59N>pp${OfT0tsLb)4e)M@8eCOgihXPG8 zn(cd4uuY9aX8z0=9h?$T^KLqNGGl6Q0E$zO%nD4+1g7h==G+dSSA_F!YR@0-$ z_$JBqFVk#FSO4T@CfNK*O;2O(`z@>CM8!9g03_)`;D(o%R}a#f2~?|Wd`nN)8h=w* z_xrhwx12@`)swE}31P#)@SIJR!)l5|-5Mlq(&Mr^@Kbo18sSsW_{!U1x!X=T=?L%W zE4|nnui&QpxBFnajg2So(_uRcy-8qeSxQbJ>fxV$J3rxhJo!kyThHYYnv<~Ev%s>Z z3oe&QPdkgPSe0A;nXBB_X!A=G!hJ;0EteoQb@9BH zjh&x!kP24YSCYkE9{#EpDH*g)KNS3dIc>uAd-3|La!kJ}%K4(nk~~GlGPUnD53T(^ANKKuC-QK8`@;1v-8L*cV?W z*|Jpq+=iWir8+qng3bSOvb2<`2UHMs`&&MNA)|+2(--)Piu1R`1Aaplt$F{?p@vVoi5<0OF(%u0!Hm5JA>H}bqQ@wfV+S=+|pO8N}h8;dkcnSp{ud&#FCeemL+3Uc#rrt z5CvT`tv!HK;YsbVo0}mLx@@+9nz*dWaXV35qwqT?CNQGe4jd$!XPcMD(GSTJSou<+ zW}Exy+_HrQu#QssMtbI*$-O)~UbmUw5oc|nZ)~7Ptzl|3-0)rfj zB~>EB--#aY#ZaE70iVNLoqV=P1U{dm3?37)nJ{rMLV};*3)`y-1$TEFFcS&$9!1>G zYMVa~JqONVbNkJa07QKgbb_LmRLBGjS;ZR0J2OogLHYJWX?K=gp9B4T1xY2GVf(Nm2tv`#tR(dc^y zBNlEsCK|SF@96oQ))vxv#8IxO?i>|FCZ-Z_m7y7qkcDf^69hxLiHwWtG2j&g^2!`Sy@aVWk$rL+z&8*=9Lc+0=$Q>-!q-TJG}Q 
z4u8VgS9b(5nC0)Qg}JDH-3F}tonR_{dPhVCnt!UO3cBP&&IS~*b_xhRaUdhw%10H) zGQEP@TUw~n*te)Q!V}qRu1>wVZD%0JNpKtLnM-055j#@bp;c0X;(~&%TPcUwXL$YX zVBB4&$*+avtRQje!W(;oOqKvPyIjF$h!X2 zq~GfL%apJV^;hCQA&5BZ>W(qEC-uku*-jOmrlj0e7+`fRSbMD#egw^ubZLwVsHix~ zHRdQ)#CSIE?f?g=I|fsb30be^rq^D_eZRH9tHuzB0AA66wl?7TULN=jxv|ufy0e}K z-}bG}pHjVO3#K$(E+U3x-+4#SJ57}A`)+_!P$qbTXPRh+2Bxy(dC)JHI)1iTcRWQ* zGR>1#4S1rxr8t|HvKSjMGH%EBf>>eZv;$O7mZ=B5BKtvynD4MpDxS3|EjC=p^oX5X zj<5QZezZla48ah1rhniUH%-I|r(zrz8_$TL6B-K%LCkI^J=R~w>=_z%jOf6l#L!#d z_`BQoSz$BU+xyA1}KabPjU$Sizz7fVAddWc_TGyCsZ&^j=18`iksu&|0Pa!zGL6 zJFlI7+km1nr(@syQ7fT!z3^5vD~&!-@NyGH7xr|r87I>vLXi~gm+de7$a{ywZh>Yd)fVOZnEbWhFtfOojN3+(9?r#t0R_udv0n;gX9axF@da7f$b&!o- zLXE}?>cBgbF|Qpk)7Gk5=x0El+rdb( zzkjxjWw2$0F+;E~IoDp5SuWD@(MF6p*xSbgXD4#3*rR6r)vGW)YIO6(B8%WhoGm;& zJio}~yxb?x3an6ATR{za#En<^;(?_cphOPFfjhtAabHDtckT<6c?r?r27>CuX#%pW ztWf30#&~c&q>nb8&A&e=ftW!2qGc@YAsRn*Y)jMQ0Aw&II#I+ILlS4ETtREozFfT304{20_XszhmipR9gZ zAIX#F^VzC%{Ek0g@p_+=o8mR3hMX!ohuYbVy>Q3SZsDr;OMB%X+S~5-dVvH%Dy6!b26VpebXAiBZ_Y3n` zPWlLU+9pXeLLqg2RSwIKn*;HD{cqV7%69@CH>06#pb-_G5uUt)!WRGtcK{ik44xAu zW$WG8{Mn=I7mIkqc<`qELQO7RS{UrJn8@`VwpU-7AWQGRR0+Lcy0W^*vd#en)VCOA zr(^USyd%NWP#YFh@HUY*GsA}(1 zi0RAH1~r|WxH`xubLez9)$G>4BcRDxMZ5lN(+V9SLQXm$j@+j>!w;)hu837s0O3?e zj!s&Rhf_fYS(jCB~P`P zu5YC+wdpnlqtRN2>(l`;(zSQss^<#>P7vY++#a`?IuqhT#=MMKH)qX0}C>`jJNa%g(nb<9s&!Cf20MQ-FYD+{n z3dSMgWO`a(IdHxpJaF<>njZ|hzRlKas0Za<(Oh{`j)SP0bK$cTNrNX>ydzFHLXyh9 zt@9a;`$;WTzDw!8?)D(a)x0m+C!Ob@f^E&NvN(mql$ zVPQ!=LCA?=hJhSz# zRV3q&|Mw0P`p*ut|HU3n+BP>UyTp8-i9DX`Waur5@Z=#N*)2WAR~g<9Zmt4wMo8_r z!FD0=`HZ?xO6B|jadl^?)#;YzebCFZF1z~%{2yHDe@dTD5uYqOG%x@R*l<)?2&8|a z$V8rf`bPAO1(gLg;oivQ<#*G#;_ZUGni|v{%8a8A;kjEdw?0Ro>tU^Im3ko>XZPH_ ztx--`5Z)BjMFZa8btx@FV;A-R@Ie<^KT*0aD1ZEYPXHhb!ICEN$+AT=W8wG7@c{01 z#l=|=6RX238u#I_DtA2Hp59sEKLGP|C0drdP(m-iA`1VLl9Cd%2E|l(V*yZXF>Ppo z8J)zXY4phq?Ii?%AJL$u_7VRGk8uJ13)Q~;y}gGcBK1V$o1v%d*xH{AM+EG$%A_V~ zro9$VQV%*F-PrSyFL&gK-($-)(%^kQzW;ZHj*MpWDUn2kAWHVW2|WGR2SaZ?fTB2} zML?vT|BC`J4uC5;KMi@Ij?rcJAILQX>o#&d{G4z5{U6`zSq7{ROo^o?#k`>f`G?^6 zNl!=;0{MpB{p(j`^E>>GZ6GA~$wvKrhqGQ1{qax)0{DzR1BX@y`ok&Vmj8Hvg#*=o8YqCum>32*!zI1;|XV4{D7;j3+y;bciSTbd43|uR(EOtwZh@{v|#}8 z2YD2+gtiV<_FQ5}+eZ)5^$`Lh7eUBNiNo^K$I)VS+8&!_2e|J)-y`_<{bw*n>5zqu zmRrTCKG3`R();5Bbh$UFBFq=aZDQ}kqaJD*H6Kqaf&h#S4}jQ{Gub|l-T3F`|Fv4e zhF}c9_3=6YJ9=T;A7V*|fec#$**UC$c_=AH4w(I>knj>bM#kq~d1k)I|C#kxnt2~s zy3iMY=cfFUfsh78Bc2ct(;3lv!Pc&_&YFHe{Pw`{37!!oVb2DvZlf`lp2KUa+{A<7UPn0FLa_e3Ez0a z-t+qy(W8M_T{Y``O|IXx@np6@%rBUZ^mPFIX{`J!gVk8+BaNxIrz_-ysKdbi-K7+(>*5(?`*$C4~4AqCZn1G=>#lt`|1s)y}N)m@>Tio%e-a4*xM2z3yvpKNG8Q z4YYkxKz80H!W0(BCHa=yv$Y!y3mQzxR}y2=6Uz)(LHVccw-$A$TrZupKrB#c<+T^^ zy~?C^cRaq)o)$br(a0l2v~AR)Oqjlv``^h&*Xn-9)$0Y{8GLbxcCH1EHm2B+(KVJ6 zmCy69VO3%|Yrbocz5bxpJJd7VAR6-+%zDP7M9g^mN_ec)@B3t3)d>qk}wX z{s4VS2(N4eB@pBPN^X9Xve~jbZ?C=_kSSp~Zb#)ua*|t6%~CFfayc_=d$97n1K^~o zi$W{I>k?YkM%heiN{p9hq6~}*5n&*6c1+N;JD%Jo+RGo~L=;6DTmh0Dm_S-0`F-WY zAYvZv?#0Su$f&^EMh|~e#Bwy4BrFhgTkxA3@qnp3M+~=d$Y3RiXKtP%7dfshJ>m4T zJ<1IXGygv+yG#XtLtHMLYc$@ZI5h$Fu!cp`Bw+Z@!s^CfXfu>v1rfoR;*$ zGqRmgLv5IqPhX!umSGxzCHMO>FvzH)=t8K0!C`za&OnaA-Oqh=iqm;JX1 z+h)AIx+opX@r+QWH<7irXpfckhGhXk+gOSy0B z+wT9WXwT|T`{C@VNnQY=*pPrWxnSos<~GqD&B~o6nh-~C}BdiARe%MavyIq&uy}Yisfv7;$F}0E*LysrNZ0SZ1D062KJvi z?iw}fV4puA1Iavv?p1~LcP_%ty{uy;fG;4k*>61+@Pl(Xcawn`d}W|;BrA-p>jIEu zBq+NSVzVahN|1JP{IvzbQ9dXjBoUaI#L%KH&sK#C0`V)d_COj9I{i-+-#v$6=Zk$D z7W<92gDg(63J$VVc3R_XI-l@QE`+p$Fqm(wo0n+DbA7f?T|{kv_9n}Y70os;A_15A zt)!j=p$wOM+ip+<3Eyn-3tV{7mB&*an_8lLW?_KF>dkOjJ{G$ca$4_n=sgYg zaQxx0K2Y`j11gAmk5prURH-+3d6Hjkd_O7!S(YUWaW>yhMUUHwmBm)8+ie;ynFvZN 
z_3&vvxSJkmd2G{c|M3{ih_Ri=K(FICQ&L_%e{Fa%nnyVm4YF}Z&*xeVgjHmc38t>$ z;;SC%+YA(m4Wc5K$ShFqtpHNr76e-ye=cpm2FL<5LV-|+9@nuudeXUF29In~{ZzVn z8b!YY5*1Eu)YivC#cI{=ABKkj5sn^|$l1(37U1@>%#u0okrlTe4QpbV#YhxSW^a46 zo+mRP;={zRPhOPdDDvHEGzcII+(xZ_8N>oR2^qcyN5p7V@kb$TbcDE*8sDet7?AbC z-=05)E37jcAfsNSFX49a;r=(ESwq?O#Uf?`kENH{$f%BDpObnTu~qli{=F%W!94#t z;BT``I>#K*)4)@Q-2z>Il$U`&%E3!8akSUSaU*&Vy(0#Aj)DF%D@`j;HEs9JdC3;?*o1 zx$qsV4;7@Pc4&cQwx)rTf*Rk}ZdHQ!DKA?&%IYbTJa+@I(2+%)qbeQhCcyP0p1ND1 zAGjd~&n<$y4~Iw2-uHA#AHK9jA@qk#_{9vlU(eth0y%8_5w-p&44WXpQ=SAsJnIXb zwtqi))&w19Pb3%71VRc9)852L#+O= z&r_)%_SbQcU4WA~t}@^aTmI#Ft||0u;mCR5{kGSO5W5-VP^hRiDny`5_97}ly9o(u ziOH{wWp||+bi_0NKSCxKC%evdJC3erF{YNdHx<5>B0%iY;=7K`SM+}o_TKSS{{R2@ zImfZG4iX(%A%%?WY_c=U%E(Aowz4@g%ci0-vo|Gs6CpDqdy{qSdGLLn-k;Cs_O93K z_WS+WtvZhDT<3W`9`^|xlt@b^lP1j3b_Vhz&&yy^+Q%GwlixuF*wW$vDYNz3NU3uO zGD1&|p5HP0XR~<;I#RS&_EG!(x-;76;DtJFz`o6s{q4$-mJ);fV7hSW%1kgoidwDV zkd1S{7qtI4G)g_RDHCA&%b#u0SiJ z0)z4RwV0%```wQ@%RN)TWlX(Cs>ys{E^wt(ChE0~Ta^7%+Ji_8Y}nY#u1}aIjaJC+ z6@Ve$_g{A1qq~Ie3yl-7$uAczcEWioOs=5LG{gz!bu2) z(}T~k(Oyh!O&0u2^s2*4@i$zqf!sSHR{SJ)SQFv5D=I>4@Ef6edJ}GFqGYFg0zVGc zelp_@@y+3o_@JZ~k+pXpRM@k=7qWXp`*NHS*iU2Vp7-swQby(Xk(;E{a* zOTMN~7bDp8CLeA7Yy5EZAYcKSW3I#3{8>lNWedoB5+(>@*=sB^{I@mk?CF4flB+~L|*SP zy9)hp4P)qXz01kBj*M>HuPkYsRClYnnS8QDC(0=XVInMQVm+(Ny@%E9Y}X^b`vn5= z$~2>(CD_a&W3RO3>b6RT4j2=>QEzRM8PjTodNpxaUX#_3 zH+~w42y!w>O#cGMaMWNvBNs`*3CGUG6=O6BnUtPX;k(cC<3}#rwLJ{TJn{*V(LAf| z08jj~{+qRT`hFA{nIE7QJj4!U`gsBon}xx6h6K4<({C^F2E7jlD~W0e1|BM z)k=*ZH1hIfe1}acb=%2@UaIxz3zwg+4wm-zN*ok`_{J#-uk)k5ujD`TdnuM~7}r+P zJs*a*25-MpnCY&V+SdMWNv@qc`T$rkXSlR^NdiWT%d{@#hhbJxg4rV*#QbNr$1d)cwU8~28G zOv10Rai-?4GvJf9jCe`wnwMS`Q?H}xGqD=Z*Bz#)W)f>EG{ndAkZ5Fvh`{cG3T#2s z4jAo-n4@V3hgYSJiutzRP=Y&4RR9FF7>GJ~x*$1Tr>;qEGG0l|0tz=4tr7N<%lpzf z()1|v_OtB}{N+#50bluCmo@x{n=Mx+{Dr+8#z1N{Cg*Ag`0_5~yu@QuODvg>L-=i8 z{&p-+$lb#X8IR8m@$YJwP9jS=aSwzPD?lRC=2<6&<`bkYotv-BM(w0-L+-Ig>wkd42 zj1-F)Y?d1iCUoSlwXc=ML-?+H5{9zZab%FS`LmC$iVX6T39CCNdmB18=tQ-}X`>74 zQEeZYBgKpC08FO~W|7^#-H_A<9gxMqskcPJz6)Sg4h#aDM&D-)EGtgnz67ICrgp3m-ICb)_ zL%83L@E;h1Q6xnb(dSM0UNsY^u9P0zCdf)h+)k3nx(W9SYToYy&`Y-&&p6l zVr^-=QzzdiD~9YV4|NN5RF&UcH+=!;TjleTPvMZ!M_io-g#-!;8OU~(A?C)*t7*fg zY0xR(^dZ8HxORamhEeVZKsG($22{Q76!-J)E3xOkKNSELaIkX4uD{bYOYr1sTsBU- z8o<$S`E8K7v4~kYx`Z(O%n7YhqR?n#F{y9)k_R0#;t7}99ZAh3%-6}cA>`NoKrqHg z%GiKIA@}hw{iq>5P>z~43DnC#W=~6FALN)wA1N9wo^qr%vE7mPtCY1IJl@Lvp7ADo zNHLMmD-!wU%V49oWTnfXP-`%^VTlF@>{WTC+i=Gh@0Jo5j&5-Ip)G0=oE6Qhk$%S= z*>5@~nsN9|O;0`dIO)AJke}mU4nJGK;nkJN!)x?{nO@Ypi9XPfG|IJ~S@LHi20vdjBEy~Fpwc+yW-FP4ZO#Ji z0mBG4*lY)15~DrHelFGUmzkg^>!d|s7~qjw7mg*9=YZ!ZG*&2HTJ9j{F__}5wY4I+ zoSW4gP;qn6263hpAkj_k*u)GmzG(aTf&VaHljw%{9;vi8Apukvf0T)ePNN_4p&2yp ztm}_JdnGqRg?%xV3$0R!bHer0-2~yL@9!D@!mQ4Q;js}2i0)QrHHD`L z(yTm&hr6f)wq&K_MI)uRKv!u0OHX=V@D9f!LN!M~I4Vt4C{cvK==v##->D;IIbJ;#6x{j=EoManP8oapECiT}Ed=s89d zBOnJ`CxbNOMVF7hTjTAq;ggnSxag?{&j{JfD@tGTrl|>&IJtZ4@i!}u?(X+O@e}W? 
zxb7`}TVz5}8Nce19Y14CnnNK?0eZ#5%wbUe#9!1gf3}|x8hRA$Rl7hC`e25H_y?Jy zx6r+a)iUb=z1b?Sjs9=D>c-bKm)2KLpR^TvGUa4iw~2Iie#w@emommBq8hH!=tMA4 zDM}Fd!-9VqJh;}IJqZB}P25aN3Q1zyhSA%roL3WtTD{om^=iE&2%t{A!_e_+ z?{-oZqLQEOxi8IVmO7v%s&1bQ-MTX+5Nfy?{IlNfecjTSYnWCT5x;e+?zhcx3CX`; zRu-%QqZEamckZ<)2FKu~6St8QdzSh209NekzytHnrP+0t>+9y2_{##M9flX;+=e!}z+zXS zGSd6vHFIMi#E7Krh@mgg3C+wegJ$4e{~GF`iasB;b?FKuq~v@vZWQg}64jIQG=FSn zo4HcN?9&&U0|51~YKcze$!~HL*L(Jt^nQMh(&p;3ZRZPd3cI<%1a`G|?proL%RJKC z79W0HKv_;|2cdRux7FA%+^(8pHhHl~P&sV}LF0&30QTO-%_p1R+Whg};^Dn~yy?@t zB33FUnQ7!ES4{~{yy30ek9}vX!q0sO$s_S3jxuye^io}BFE9@pTzj1gPfdDw@S0Ow zd2-;E|Ayi6XXQFLz3(!UNYQKJ)T?+hHWY-Q|Td^`|j`1{hi=yDDnG= zAjl*G$!fo_xoO&BvG;DsSvi7HfNZ@Wbjwe}yuK?$>19Z)L$7t#u>ev6f}^!AD{%du zK`1Od6L{JUT^A;X6pc#FGRuYo{pw0(u7G-%_t(t%7aOFSp6h%UpD*!|Giw21CQ80W zB4n!9X^JHGP4`ZqH>L4qDL0OBpPn6-;efz|3P+lcQ{Fp;LEPUo;0ZC{c66LwSWw*oc*L?3mDU@r|UE=AGETf@8 zPX6c|r~s=aa1_(d8(blMD4#rV1yNkT{gTu0@Wq?2who!wj^1_GLZ3uMLoEoHAhXhy zc>c>Ast7`+>C&7>{>ve5Yq&&cMfSW$OF!<)Kc3OXzdsS7G#QjAE;0Y3wh+^(Lu<56?kZ*VQ*&?RP+UZ%?RH@cFGW^bXSVfi3NY>rA176pc`DWXr=tLiXtKC|bPMST$ousOycT?SX zFwVarTE02kC@8TG)S=JTzdrZNH6+@4E9pu+c{5ZIlu^VPRuvJ7d+XPy2>Z(HuT3xm z0mA1XmeydZ=EsbiCPYoJN$B54?K$o5cPmmFh|)`B|LH1yS++H`r7xI}gs{=U(0uj0 z2!Aik>ed%U{d|kHv-s{W?KNRL58oSQ8U-7fzjEwPw^aXCkU#YI%d$jY`Mr>C{Lxty^F*I z{y`=|2m@IGn*?e7kiM%I#G`c3Jo)sBThTzE4^|oS#l8L7nMmaKPon?wA2GcIEJREY zfzFs$Brgs%oXMsr)0zC~ZlyfTRyx+$Cz-kdXTj6&`6o5G=|HZyjV)E#IRtXjo3xVU zlTS9c{wJ>vwRSdgO{eYn(3rCQuSpNPO-jiSf*;D5h7eBv6P5pMTPnH;KDnY-Y~3e+ zi5;<2DilQOpo*dY@|ze+!>{ru)3Z+e!`=0tSA}3-6&3Q$;NMGnVVy zS?cWq5CIG5a^#g@B)~fqby*B2#Lrewd4rWS!~HJ;bE5E@eU~--xna3!~YQz90MZiK7zaShE|4R zcy9+GR80))?7brT?5|T7N`H`MN%ZiUI@qiHIRP=yw>D%eEtwr@XcBn zBO`#!WI^!2bs({b`}USkWn;YR*JT8=+`qj_Cw2@ivFN&V!>D?fH6@4=Ct4T6gbVsV zUnh#VC}SzhxweRFc#L-;#xP>aN*1+$>2FmugU7x`Xn=!7W!CL?1?)Zmo&fOwSs6xC z3u+lU0B2(eTHeZ|%oV62`ie)RS&o{I^R9bF>3o zN_ctLl1pfm1UuF=IB1X{e}^7A@bU}}4f%@Pg_XG?aM83hFJbl4Rh6BM}m z3LvbC4dlXmCoTpcg51cpdB7M+lU=&@7i}p8i@)mlg$N3P|4%2&Si{+IS0~Y|=r%56 zh!h}TxV~(^yiFApk&LHIIW07=WtO}*Qi zR0@mVw3$-v#pf=@6XxH~We1X+d}3O`yYGY?=;PDY{vkt%XL0;;0m?Y{e-x&v+@;^; zEtyjOu;(LAi$t3Z>Ojcy3Y)jr_#yd|e*>oR1w~%b$}5P+QP~@FHK)1wU1SLr{;Q z`VI8f#K}^i*RjPzEHAqar*C=fKLJ7_UkxkBB%7Gm{D;sY?ZOL8(?XYv=%MixFh;#o z;OL75q9~fB@D0Op^*`k-jtk_Kvq6x9uSty46QHp+Sd_RB^sMW

    P|2O`6f0?OE! zw-W(2DAE~mzsOJ@8{EGm>^MAAF1!TJ1l$*E`{*%l@)+`1afN3$?p242f#M_QiKoL= zMgDd`nDc0%Q+uGofgwK-Re1n{_j{Zf!C7Y9tE+@Zf_y|p&O^{n?_CoAf~mOS>|J1f zH3%$o8(3{hULhgBQQWXqjqTiGgs+WN^b(H)gJvhF#moHSbMu?I8Tk4;Z4>AL>J2b? zy6rDH^-3B$g{x^{eyt-6!M6-0){Q%u052UwP? z(&b68{AQW;g#5FO(HjByWN!fRHtK|#3xpU-Hp4eD!^?H_M|-+nQH@)<*~zh>CK<17 zFxNlC)ZfxqLjzQ78peUnb{zlxw^jU!Cprsjp@lWlzg8Cj5EzT^weL+2I}p^=xihCvV(UeFOzgRL#@nQyB(x|gZU zimu&>#O9aza~y$tAu}XWq=4{{%buM5YDPRjt`tcG1a85hPar2xN}&b55K8*qQcJ|QtDfOEl% zoWUvbsJh0{T4frVZU;03Aak0j0Rw|dsala7AXLnk8~hAuw~_8U5L89jmxozq5j#I1 zDH{`BUt?LPM#s1IJhO1hrk0vC76z4EsAnSB#b@P>1H4haI_6N#<}AQLZ`q8U-Dk zo}Hlm=KztmH{m`PQ4b0&4<%04_{O4E>rOVZo}JsEPVR4RCW(0rWYfdymC$#$Uvn9a z_N2dn;_OTV%K^sZ#Hk1*VhcFl)t$guLIb2;T@G*Vg&n97!nE&VD&6)13Bp_O_h2P+ z37=j{OdSy|o@^3teJi_@R{Y9HMU!FiDamji{dIrKaNom7-b$e|P&jgnr}!G>Zf@fv zbod3edLYdhsnadc=+C|y{Qk|KLF-^{Be)6_<17;}oaf?SBXJONqt zK$*++0pER7NQ<5!a4ymfQ&l|3OQ)Nkf-gYSOb8!xGuiA6t-tcLv z`JuDh_pE0{_7l~(kxe;Ysy3E ziQX%}1nPZ{^0f=|+QW|c&!wg}!9zU!UPo-^H@!9}&0MA5I8l*!nRQMeT|iC=xa5td z4Tx##D~X8IWX5{XR6TM8qW>mXdW zbuyJLYI>q1j+Z}}t6lLYcj=j;E`z^ z3v;hzFfP86)&#>jo=96sBeV%!LtKvJyf^buu&79rv}@pnT*scC_Mv{R6g0rQCR$S z@=fPP_WEl~yV^N=O&~Mq868$OKRx)aSgQTv*5@ZjQYH-UvW(X0c%GHPmUBTwdr8H;_LbJ(2=sNw6v$PWK|0s^x1e7a%$j@kUB73o`8u3&G z5psKnt4&n!Ud5B2TCM_m?*#44I#$j2mEl$S|itKSfKYEAYm2{t6i;49>g895nhKKufpMYZ!m-63BXbkB#EjfyTpMc`x?*3?osB5yj`v zIw-1ucVYoV{g<|S(S(f?0JrhS`bj@mIf>04rc!etN5=_?VV(w^%5Q-8sSA{tT^vD6 zjEJDEBYz3tH&JvoO-E|?Z~HgM-IG9Q<2%x`l(vc+Jsr|+BNDk5jfRRa{&Vh?huPd^ z`L!s6a~4JqeJ3}m<1kZ1Hx|sRHxy}2+>DXm+NFpi7$ z{0*UldWi%T`~4Cd*w+oRD1xQTnGx&fE!W9{2^?m945Zy~;Tmc@Aj|2QXF(5hSN$2c ze`OvtryfFv^Nj=ze-%Y+cb({kt+Xr^KA!w($P&dHX*B z2!D=uQd;rSXI=DAdu`gSv;GBYDQ{t&G^1Ie)+Y1UG(&hFY_YbJz5b6L(ndg25c?$v ze4eTHN!8SJKR)^iW>-&-ea`c$FU~3Y;AQ@c8+q|_i!9ya z2g)4)w3+YPAv(=do+oxSItA5U@6Cs`@)50>O+?Lw1)`us#I5mkPM?uW(opu6;*tkK zPytmmav`fKnLZ0)7d4R>56jP(UT^9Bdm)3s z^l()Nx(T6<77d0ZF72RldVDk~Ep)&yjik+P%j`>Gl*`R?V21p*Kr8>DNG*hvx59y@ zD}lNLQ%fQP86QjXsym>Ou#JT9QLGMl_7cAL3=LW470%mHnqVpOupfRijjP_ zrYDe~pm#$`s3!=IF|d&kDx76}xh)@z2|8K5d+nEGny_;lE2E(yRdW#D9X5Cl_45gw z@O{wjzQa0l8sZgsolV3Sb65f+q2Qc7nme=nmM~Li=zHximAXF!pxn@l{eGdv2o#0O z3YUGbcxN5c1_m^@>TK9=&3=(6sAn#xZP(zSCkYlZJ2d_FTDQe;VxU`5!BFhu;FNKC za8V*PU;2nP(ouIq4Ay)(dKrfavJ<>W;5cXWI^d&4mtF%nHb?1RI<5ASPd|JSsSds% zN0m-#i8JBc~}dQd0|iog&qg@p55 zDhVM-^AGu(6vKJYN}74~J=lN^kEkRzg7iUa$%Vwy~l*&4${6R3E84&qmu|x8cNd^swQMwe4fm{6= z$c}rW|EUAuD4!eDukLkxj?A(Ne4p~yr0CCq4P?_3I}ap)CK!Ke4l61$Fq4c}i0ss* zPJ6MXj2}A{WQ<4AB%Y49y7D@IVK6t&hDSbuF*Uyr8#B*gNpoA?XyxWlI^|VsdjGx_ zSh`LN;XE*rQyZz)FAG0gep$UX@k!yrrE23#T$LJTJWrN#lFlA^b;lg^EPlakS+7Vl zQE}+Jnqx=tKN(j9a4jW2iSp8f@7TKYq?<7q7O$;5V{je!`I$a^{*A9Up~Pur*IHU% zYG@12x)1)E3*=WkHL|M#2#?A#0o3g6OEN{7WxGmA7UKKGFt29|uJo_?2DLqI-qSOV zWN6SMaCe@LyXA6BoI923ZzcTP?e+=4uO3?u>~htWE2`U1ZpAIyd-s8zcL@)p8Ki zSwT(Zpwv-dOTtU|7Zfq9(!)ef|TAX0M)Vhv3-;PK0&sGGx9? 
z_!5QG*lF;K5czty-qW3r{tY9^&c9o7l1k0ygOo9=Uy{Ue>BvfNVvGMPaum@z(I;ZYE$g8>17N> zof+>Y3im%djX!&Pdr$nb;FBC9Es<_X&3rYx8-_V0Pew|lC$<}6BWO=&gLJ5(!7Q-T z;QFifA06SLZ?Hf2EWZPm&f1G}?g86;+^2lumhB>EmkNTwa??;`o3s{;e*lNL)NAbf zH|pwt&V>bS886Wh4l2iU`RFS$=ZoDcH($7&$m_1J)~R&P2S?EJ+>6^=pR6;$X7`2- z_ZGjV2K}R^N;v0y)SX0sWba_TNO}lG&~4M%LU-HsE5#Pw_wb#=WEzqeM_|Eyq;!n+ zc)U2i{{I8|HwB#lhsc`<27}fQHSW8I&x4wx8o-mnAXfNEoxu8>DEGg?{*4VvjQRx7 z^J`Mc|CMO;f3AjD3UC)F{j%PA@RxJ}`!9fAAG|39W5{1YFTd#ot*ZzpOe>*Z+Chfv zP_53N zWBhuO!b)8km0?-RZ?y)pRM@a7Iv_$I*aAAYx8jKSC;qR+^1t{$);e)Aq}zT-%t$kh z2Jo0O=FsV+>n%q-Aq+tBp-!Ct;_`{cnuX>;D)|06W+w{Um2-ib>&kBeKx7Xo?75qh zDC{JomVBF4Hy;(vXQ{SBjB1DcB_0E`3xtqG7yG*$0^73%w#mHw?pvCaI7(QeyW`=e zsmQan1P{VTor!PFq{4UI|D`V+un?DI8Ufv-N>Ev4SVfaU2#4@LnW2I}P1_^sw1DvW zXO&BKhDwTYQQiInPrO#en15k2VsHKS>wvH!_E+3~9hNiylTrr4xVkABK^gVOnG-Z? z((kGoW!39ndHhxWwOyS>o>89=O2u)%lEoLKje(r9-emDu(3GSEXqkvpfL^g^uMEbj z#1WV=d_k#PwfM)1$>`5=XZj^2Z1YL2StFO~tx!jZJB0B*WQbYbsKi zYaiQd5cO~#$73@?$1ux-5No1LkTQPDu`l&b;@f+F^J1tJ>`_r}Kfzw1Q$Vx2w9<+K zj@z#{3a>!OAr`o$aw_iy3+;9myA*mM%uZX94}Fg^!>@>FkuuzN?Qgmb#Q7*FP67X% z9KLz!+^Y-3%&r?EpDYc~*q!mKX1W>gEng!5W{QJ17e4=eNnCU#aqImJ({zCLUf)^l zi3hLnm4FKOpSxlyp&C_=JVKVOu#-oL$g<{JOf>StYhMSyCrOU6l$Z zZ)DG8vTN0V;IZs!*nfgYa^*T;%;H9$u4oY)4kyFZ|Ab3A-ZxT#9@x^`#gK!`Q!zpi zzfDr9t4$5sS1LEES2{c%iI&tHbxyQ7zc)JFJN5E@zYG6%G z^S`=MQE&?IZf?RMkA-;~31K|)Wb|U!$zx)4lo+)t2<_UV7=)OqlkrOJXmA{Y{?{L6 zlE5%Date;ob`J{)9;L4#DFPwQfCldHC~OlporT|^)+#4YyCi_Fy%#N+s`|cs0G;N` z#20veK^^smOe__jdA^OdtMgN@xaEgtFd$}taru_A?iywK%n|64D3-p9KyKDvfDSKo z<;yfStdjFMBnRUS=Gvu^Z}>XZeT|!reNcTGKY9kx#n@}ANTuRTUT)nJ)#7yx1qH{C%{L+c7X@Dav(QZ<-VvYo$s--7fpDGuVvO zZo=9M1bJobbUR}BYF@GHE7SYVlXrlFH?{bLXS!d(0WGERUcs^gU%wK( zoiXnOSrSWYRH9W|vpbzDG&sIrholezwA5B(Mgl^#X`D#~be9c@G4P z0*uI1SEIwxUWLR?m=Px`osVCE35dn0=4qTjzV3+&-0Gw35_t(QT6vWD`S~oy+%jJz z=4+ksnVqm2Eqjp&1Pt_6KecYVE>DyGyp&85C-7@}50|LZz@dT8l{1SQXNVarCr4T} zI(AqS$y#00=k+N+DU#zsER&$7V7ui zaVc&-)68<4c&gi1S?Mh;7g6tXoRHmye}3S%SX%Prmlifq+avIPzOfN~zZ)fb6r>+M z%BrUYHoAZpdYc>WQgWc#Su~i64e1w(EXUW4d4x>KKmsgg-whF=3& zaUhFO8&3>?iwETV2@n78(f|7ie;GWXr#%wX;Eh}dmpQg1bNnBGyh}d!Id(W;=H!m| z*UdN91+$S8Jb9;}U@-BHb&dMBx3Xl2a4VWQGK4!4;yJ$OUXZKGDK zICyXU_~&Pay(FuNyw@M8Zs>n3yZiR`7rKvjV9tIt)Hwv&NhX{+&m2TxgP}$DGhdnr z`0Zq=-cC8sbn0aoCUwfnynrZ@PW%P@SivP3@uC|!rool8OFm~NC`-DOXERrRW!Mba ze6;G%es(UAVw`f`a0w36J z6k8qHp)OJvUDSwruCLiZCiGW_yP`cdYTx678r(5NCzt%ikX)sOXnGr6n01(kNr9bC zyc^9mo-*#UGG#c`EI{>&EBfKb80jF2oPb;ZgN%@4J!l;4yzJ9s-WI9df*}=Ji#2eE ze;e+Dh{=|L%vP1}pe3{fVb~5dxMMqwHNpmQ$PcZo@Aw9`yzga%mMsm*Or-0pM5cc7DdOsO6c6DQ**{w9Hc1q6D-zoqyjPC)AYsc% zyNaq<2lA8+z}l!-jc7*P2fam$2LyWNML+o-_q-afa;cFmm&2SccE8WC`D}CP0xD@! z&&p}a=W(bi2%T5xUal)h=jTgPdet-bGz1`vc{*)uVjBb7_9S!Y zp&+eMI|_BEOM=30owiIa&A$VyA7!_9AJcm3fZm%9DCgkUpU!weX@OCh%XBZLxV3(2 z?QaS~%|G|OydvU~XX<;Wg&{ev`oS%7y8vg~;_;Gqrg)e;xty(!8(RycGdU50GRAMnsw(8ay)BP4Y*rX_;&It@SqD;_y)S%_2KrN4Szwx z&UnNx0@oS#lILylz%Gip>A_-fs`*<|?m?9Dj+hCz$$pJ^YRnAFu|04R_7#18`|=7_ zc%i%^x4DvE{J}OA3q!yYnX6Z(ax~2w;-v@sE2ccVob6uSLz4%NxQ{uW7Gg7KZrKPo zgMFA%IMDXYe&PYu(12BrLgkc#%FoJQY`Mm#f~P*UG&JpGf2>;3HIN%+XwT`Depd^p>vPWvGLpuM<`(FJdp<@ta#g))-i^&Rp=D3nWh%Y7 z`B_AFxwhXtai|ReQKPa$zuA}h05S>3n)C^?FBPh*KURuP-xu9~$b6m+z78bnihNO! 
zZgLbNIA)8J;jruihl{q!jnw=`9!QILNV#Tk9jGbk#JATP_HHfD4h@Yi9?S{U^uvts z5>cW+Al9BHI~V_x!RsbDYKc6W##&pciEVl6eZ5ZBi{z;bIG}QoCB^sHdgQfND3YP~ zOz*-xw~75)!juPm+C`X~*gft3k1704@vFnISnjsZGZW8wNqqS%yKf~=iX1Q7Na)Nv z0JX&BZ25)?CP`V!J^IEYwmLqgIO$2ND%?#&-l;JM+X1Bos!?OlU#xF)%+jaE*{*?v~Y`MO8%U*LK4eb8AE`Xw$5 z0LJdidn<1yN=#)he_-7+`)*=YXi%eOR7Bgs$9^Hu7M-&tX{^inDc5ncsyd?vPiP-# z8ahsGD))8O1Nwp=J>hw_ZTq5DzuIgSNioL6YnnXn@hI7oH`Ual%DB#RQH``1m!&l< zmnoSpf12lTI#NQDvec-!%|ON=YGcA8=mM{RZLXXDo2%jK0U@yQIoez%o6=R&Yf$P< zf5c{K2#N5ZRxjQ>H%+m;wZLB1p^UglBXt{pGb>O^amD{74{J zLmZ;N3@B0N8}~8>6SjNJIM0LK*SJ=0)q8w@tdw-^Ge{hL|9CklSyEyuNs}a9w4NTi ze`O#2-Tj1gK$Jx@sV3_ndRS`iXlHR~+n+uPa^-+_X(uCMdwHxPWy)4ow>5;A&Wvld z`}3=-%IsI;BwER6Y83}VA3gmto4{b28XWsgmlv;!MP6Xa2I{rb!t8N{t>SM0|I$Y-WuJ8tVXcIuiWE)xx)S-wJyDx z(riale>zv1AV7pW#*+^L;i-6gTq$x$c83+#T3O6r3%yE-G&~E*tqjia=Q&K0LiIw- zU92RFO`uE=V-f;4Y2Sdp{YkHT7Q4&5E7mcRio_Sz+tGg>XM?}L8=Z?e;!aRZH~KsW9&%;Q7hO z=ku3(W<2`YuI`HQ4@2+=D&pXL^JfyBUriu47*bNuVQkAhx56Y5QWkc{RlM4jc%Hgfe7;F4*VMRD6oT@Y zyls^eM?d#68@W0|OjoehG9A(+XwnF|EaDvas3Y-9Gt6-+jf88YcX{O)3~z&lq60dv z7Jw-#i(TQC&)*e#ZD*dIyayeJDY|RJG;qjf+!E8%U`m<|#; z23lXF(er?y7Me8)%3=Ty>512o24>yQGgQjQt9{mQA(bk_dSSlBD!YolFIsGql?sA@ zp<;t{2&gy-B05SWH%}O?WU%6qkzh#oELawcd8C*PVI??E&Z@J6_;AD;Qfhi zOPi~=3uMwU40(iN^q)VjEIJx|MAYs|{rkgHu??)b<)tcBy=6#_v$RekcL zTRsixk~QgoNe(~qM?Rm?S(`YzsUWpy(`Grq;tqfHS=N8ZCuyhXnw$Q%Tp&Rsk$Jtd z``MAxXfNAMgYwuA29J?8MoFnDiFcAMZTyg7NH;+0-6@P@FT&9GPn1wG|nLTzTUR zOgf$1b=C^(RVi*)uN-F7>4F&9MQCkxi`xD+&MQvso z{T}4JUsaIe@7OWpJD^4upC6io00(D5lSixgl*YhR#kRLH9H)=;Vn`wnT_IZ+EfYG^ z^EWQw=y!is)a2+bkBhs;u|Y`d*lu3)G!2UIE4M254wZ5jzdXU~=tA1MK^YRjh4dHciE`Pk33+P{MDZU@kG;#V-r^`;*ly9 zYQBNNQ7xQ}+M=w72%Hi!k?EP6(Go^@Mz4*K8}~~`Lq>ObfaD~XnRzhaDicV>aLiN? zxlQvnCsgCZ9VL&F;Q$+VdMcKhu#NheHsiT{B=YuFe2`<~-qhIiWTb7N@PN_|ZmO$^ zH&R9kp*5gPNm~Bo2u1vT^;C^gZI%u)Z0qunnQY%%BPi@x`uls~^fq3k&Z=yL>soP9 zFqkab3v3`o0!1WBoH6d?$rle*1Cw!>Si|^>r9m&Xz>T@@(%+etOo!Nk*|``wO_i|e zCFLz)xI=d8{56?m(#^fKkru{EBiqIF;Q>VXl*twHZR>2|2jv zV4OCG?t@#Zp)P55NbvCsq?$nAUhYOSyAn72!30Wav0$?i)NlUBokstPGBE!bG-X)k zFKp5I2bKuEtMS;G-?kFm>{4IxWQC|y2?N6~k0^6g- zqaowQ`UZixjefm5pldhWXZCKk&AF{Q-TZ@_ zlhIc!tLlTOg4vpa*y*T%tsQ#}SHV*Sm-C701H>yo%)`%8YS21*!vT*vshzn-9XH`} zVFFD^?cW}$Sp46d+-R9nUf`nu^#X;!K=aU(NZTP)>uo>h1j~ZOYaBe8yJyu3M zT_&YTIrG}3P3s=4tFb56Uyu3e)3tc>ae+;B&~+o|EbOlSxf@M|71q#9dhEdiaK|TsdxT0z{@gf2~=m`fr6*Qd+-V zb1YG^k9}EYMS+!m+RKceMX@V^H=XrY*IT}9*1PG5XqrbN`2XN^8yn;pJAip6gmWq9 zrtk5t_1M*Zea4^qVjahEBS$LxnM1edzmg%|`A8(+bD{aie}@oaY_WE#&Eo#K7ym|g zzW`u(bSpg#_g}enzpY#pGGNP-S$5+5m)B+Cg^?pwqGD(M;amhBP^g;BLlE=!~x5UG8m}! 
zE^S_L@AnA%V;}ncqkXZjAnbQmdqvn$3aCdI=3IhIPz``PO>lGsSFmb-)K(Rt2(l8x z%{~6fJ_9fi$Df58-dGYs-H7e^0-8mTn#l%F6m?SrPgr(rfJ-T0M<_6I^70qN@czMM zVr2wX6j%xy8`=p{<6Z-oaO+{>e}{ImfNZVUn+Ad}p}Yfg(tj)zSo@*>ycAZ#0EFQJ zOsRKXQ2ZyhbENsdb2~xhMMt8@r}>_w_sLLRt@(d>m5jTLKM9~f92Enq4j*po0~?qJ zE<&pW$Z|TsZTF}nRt_+vR%W0bX_~e1|C;~@Z@B6&0vub^1!B!&L15-r5ZhsXavjf* z;)ZCnR3bqmEq;UeeROD1)d~ae-V1xqgFlW@2x048EF+F;;lNuP{oKWl--rWAgN*gtb#ut|N&Zo{#AO2><(_}+BF&;Y^Y7^DB+&8E1BpywrtN`pS8u%At zeZ2NNMW|rOpsCb)w9L9MD6Ll%5PrraL;fa+BK1Cq?_aW(P`=;i*JSNa;=U0IM>9w# zY4Yu-op&JMIY5KLD~5zZc%GE9EA^tnI()%q3v1Z}Q{@)JohDq>7KjcYiQ}c!#aVT& zzsh-!f?M2f1DMEuZdTjSG;B5lFEYE1IkxW#+i)a~26=z{I_6vo*-#w3uYkbK^bI)P z#*8d+x8FE>EwYqb+27!I4kiy7KZD%6%bORV*ftP}Xa2`rnQCtzfW{0?Rhic3T|PJn z7?VU=nH8`xavl5<{?o69>HST+hss`TTwxxM`JYr##(L(55NCo6#h6*YGsm+oHp6=F zD%a({CNA<1K=z=GRq%cUzJ!~z^{l~ckPP!sJrvF#iSBFK_ zZt>2*07D5#cc~~KA>ApUAcAzKQi62H5E4p>lu9EVf}o@@goJc=gCHFO14G=G@7#0F z=Q-y-muLPM1@>(AyJM~OD@bdq4gwvy6K>STSdSU2!~@)Q@yKp7T`9pTWYC`^M!I~> zcMH_BL+M0al^m)Uu-}2?DNsGP^y2L;|H?~G1VV6SdCQQ<0H^w4Fpt;P3Gw2k)V|<- zoE!P>hF|D!_q49NuzEXLq|0TkA`%43u)Amio15CE3As5L%u6)!OuHk?zWG##oGldBN+rZ#g z8Jw0qe2^?@*(p#8VXa)N0Fu^?0^+b3-m8ecL#iN$RDv?2mzGlf2E!7lpGh<912a6$p)B;Wo7~PW?F|9b%K4^=iW(DGoI3Qzr09?bb z-{pt&9RrD_j0zJhvOoFojQ!LXC_u_!Ea~dBMED3_q5NiRSGr*!D_qwiCGk~*0y7NR6gKvvF!7CON`WyZ^zTD zGqvssI6jB>X2Fq-%TZ2Ln(=l_c&r-@3E{j_sscaOjK0Ha%ig#L?D3zNK>Zd!Ql!&W zL5%TRq^=)vmF5eGuPV}4mh=h9uHAbM!=FqZuZl1OkKx2p@19vffQuhER09eIxHI3G zmE-jnjDd8K^h*Py2Yg^NotUuxB<2{qhW!;K$Ft|4{iNV0+AdIxVW3D0^z^M4Q%>~d zklLBPd1sh!30Qti?LF(c-q1`aS=|Wyi3`?TCes3d4!H;U0$ktHj?FI$fHNlhR~a5u zPbp=ZfNDHq6=(Xggh>8nw&J0d_>1-doS^>KlX-xqDM;ZZ-4?^h4WcX)hG<%1=1Ag{ zWx6>vsU-k5gj zNu@lw29g=AUFK&U>Pl*9Y!q`W+OJ)ASYdD)+)?!(lMe^fq{&DZAuCsd;&pQ^^0Db0 zWlArlrE`m+vRV)aBBlHij~rtLz*?L@)?4BQs|R5?f#*)=5DS_hZ1H>bXHCF?v1l0> zV8#O24fYZ{gS(Z+(R(F9r?M|KaH!d5fr0!k4`tzvjCDj+6Ba`=kvQE|0+IyTl2sBX z@tJ9>!0Czo1$sq^-RiteQ_~0SXVqU(f+2pMMeNnb zPk42y9$oR($m@w6i#_)i-Pt|-tF)dfe-Ek30c1=dE#J$ZBUibc$8LgRP_S#v8wFVr z(EMz-g5OIfWo8HRDEx=pRUjnv7eG^e!u)a1w(D294Xb14t5%!!nI6u+^Z9^2?@1BS z0HXZ`iPk?A5V5YfYnOVD2Okka)7uWvqn}LDt&ammz6I1yt`Bi8e&yz3Q?K$(ua!ZA zcKYX#vv-5XtXF%U)1jo%yCWOU`khe&j?-7acm+Z?8F+-+*6{K^Sr242fT{*PT)kJo zG_OGfO|LCRv(I=J(Q9CifEkQTQg%_fdod74Hk=-=pA38M7kp@Vd>=l?*C$TuR-FGh zY7!~zHuCkHCY^qKRT8AQjjjt&97fTJ3~n_nYe#XI@P6~3{_u(>X_HYS449=)o{UT} z=Q{8*Xvoc8nf%v1<{2UPwkT{4eoW{&2RtUJOtVi2f_~age9lQV=^Dq{3OHHO`OWZp zLRi4v&$bdL+D<8=CjxC!X{~M=@LW=Svazwe5I(}n)nsOJB&ueJY;kEnF!^j5@mL4BRgI@ADd3K|w79=8bUz)t6liw^CCTNU z`;-6hU7xrvd<}XWr-z5Si8kUH5MBj%tzzzo--@ZQ0uYS6uMYdaz7fKYl z_n|+*``nNGiv{XYO{fQD<2O*NqS&=cmo@gfO0`-reLbc27|8DGw;nIQeO${dN#46z zzb*8$l>8WR@@M7CR6~p`G>UUTNv%cPg@s@Qb`Uc%n*yVPXBlK#7V@OXm#TK55|OM_ z9P1AD?SKemaY;3EIboG<#Q$t#w4}lO`|DrHS(1-#QM%THTe2WX?z<p>&||5I6DN95R{NPsNm}bl&+CJryX&r;wcI>sUCD2>hR5n2@2u5+_m%=eYyx9cC%sg>6u3#u(w_~ncz6AISNRuJ!Jui$|DP$3(x0p^tzW$|5x5LIk!INzibnsL>LTe}PHGzOMQrEX{Ktm#>>v@Hy3_z^;+e zbWeB<7|3FxcbmLmARx3)8R#b6e8@9e)hO$ZKat>>?|uwq?o>*nJ93TCEYF)nXuUTAng9wOVvjp>xFb z?|$Szl8zq%p3zSR;6ffK)^CV+M6mFPcx==}0)vU5;6rVRS{UsmYAI$)WY4(f=OG+! 
zwPx{!w(oktE@y<@(DRGHJzXQBG5*lGIYmPEs3oFxj8Bv)V#6KB?mjP5tW}~P_Wf@RetvOKpxlJBcX4O2Es|bLaFVy;SZAXp+L8j0ELu+Z3 zp%Fr;|i5AL@ry9%}YjZ0|Y)OMH?0$IKfAHz9; zAq_|c*DHSe2+oUF-}08?RUpwFb)5qw> z{l5T@;YAKurEf+KHLc^L82groJOq??4=A^Y4VrXh&R4>n3*IQpt@c{nEI=V$s z%XR*1m@SgI37&8hxEMkRaanWv9oKMbx%2Ie=1*S4Mt>pckt&PX;$RNSfCkkfdA2oO z!^d-taoVlbwL}K*$YGxzK;lm5(3U&SFa(6`idJxfSRXeqer|a_kl7fg@H77XzI|*=fnsYxZ=JPb|MY z-W)C0Xnu=T8+pXLB5FNWnmVIs9SxZ#N+TMNzpXx4sA1|>qgPRfS+yRy3ogLZS?>*d zgL40mm#l$HPABuSnCyV;4p7Z@qESnP=LN7F#-rOe3BL6v5}>+_F%U`&Xs}rAYNRcd zuvBaKi-wb1j`k?E8fVV8SnK1|yf?@Q-fK1@H)NGf-zx>$(w6EO!~LG^bq)bGhWYgy z(_Vi{p#ncDT@79$enm5!GVyx&0?2S{56NCwgg2i`+xB5ptxB@A5VF`<9)}xjH!=8h zcKe9>=F-E$I!S_|D;Gr3ZD*L_57rD-su;+0x(j`~EmyBerYu%bBC%^+J~uIbinV&Z zXN2d?41A#%YYoxsBnY3VPAy4Sl{JsL^xHbSzztvUcNz6@waM8?0jZ&6znjBo@jGuU zvI95J#GigcEqZ!%+*_OTbrE4SKT7oo0UiLV;Q5}5!wNh*iFd0D#`{wKC6h_gLhsC{I8mVD+s1 zFJ1%YE5P|7cd@(-lX1?bc@bU_+7wm0tgChZnIB!S+3Jt(Oi595R5_#GuQ$opa`1V_ z0MW*tzz{swYfz3J_*?^~k+KMyShsb<4#Ax^b1|UY6MOOQ^V8?Fr39%PQs%92m!x*L z<_uY{rhX~QH(?}6v58b77)ZL#Gqtw8#QiE*a&r$|UuG5_15dFN_m^xE3^2T~YUjm4 zWqY(A_EM7e#ayza9Nh82$-{yqtX4=pH%}{xJHi8VRdG9%Ud$bq>uiiG z>`|EsUE$ZuV2Ft!Mv&^+bX~sK5l3bhe-`qWUAu|+ZA|7NGZsXKW%w${6>8+BMMjKY zR!Sy@xA;&?UEOu$3nWW7AA`M^gT5{}?^qM_+5s;5vkMo?iH6D0gI{HDoY8@EkS)m8n{Q2O96z ztwkC>s^IPSvRL@k+A@Th{Y`c#Xwg8M9f|6Ts%c;4T8lG`>_Jnc0T74~kl;~Aw=9_H zZn^a-l@1xFJ+n&jSD;vZ*Yx!>hH2dP%|#a{oFhu$1k=GZioJzluz!ILo%Uv8Ynp{q)p^j)yLYRO09QVTxP6|;| z1Mq7xyPdGw@YVxl?*zA>;l$tI7bYi&rVS4Mu@mjB>~MR@8l)wfITF;M)Clcf1$?C7EU&ju6DHdkIZCo!rJ0^#%7~#_{a>=R?$zv@A z`?P=qcbfz?H~n<`+}v+!K7!uBc}c&2uP|8Zz)@JFmg9q$>+&_dn-x3)Uj|(qJ^_w8EIBG zK)Tb;bGP?l4Pu3GbcQ-dCDw9`hc{;yMoXfqB^T(}Kkx|D%*kODqyWxD2aq9g!P~uJi6S$~GGx^bKuhj*GsNT<@ zW)==k}=0RU;NrIV*F=ZRze$qQe>#D^q`GhlnK{wY6Zl!&M_u+(hdXgr|0tbWqf z*i94V96(rA-&VDIVY3-_e7_y9l7jBB0_L(dK7C%JA_`TV-X|R_=$mpub5sE)2Vx5$ zq{U!>?|;nuf||bRr=Iik%q9G%UGP`CZEF|`oSMbDyo&AQ{^`3ru9N*Wz_;nnl>vcf zBW{Ba5WYtFXdAbzA+iM;DwHOi4zAPKI96`B8p7nI|NV;gzyNuLymBJIn|M*%$>(8T znhtZV85RMrq`9d?l0se#!>$6G1|hcj_YXzu?*u}TpLF+_xn7b&F0U=oT;Tt1FTx%>`qH(KGu_PQKA&t-&105BGFz}yhx#F7qCymyFpRb zCxsSQ+%8i6`_6#(7oHeSOK=1vS>X9Nk52f#&G%+a<<_v}ABcU^yljP-&u(^OPpN85 zqH+(Zb;qQhCmA~1B#LYj=eoQnZs6O2z>i*HZob|Q?c%EIei@B+3Fy1)GBZ|U^xEKw z9P_ZclO>sdcanKZD#_x7kH9(+kILMfRD1wZ&&l$J?@qc5JK{Y&9Pzi8v@dZuts-)d zoq~$-Q$(8VzmMGi-Ek&kcZ!bg;`LnXi}tx!RrzcLEmmi~f-%U~WP1p>C#tbpjDJ7({_#G5EKqz0&Lqqp zv;*ATMsQXWz51G_SE)J&=T{jIvI{u-?4K2G^Ulg9Mf~%#{{6=wcT}0U=%9^a0yAp= z^;by4_HRQ)F!tA4ZN~rmb6JS-!8gq_bM5|PwXXij6mwlkY3AF9mC=#^@m;WC=+%as zZfua;|J!-@e}5E78q11&+pqn8g-D)+$$Y)?hlW=R~0)iK4~H_NUZhtt0Kl+e+^ zfj5Ec8)zhz+8~a)J#P|IMk9cq@f7gKeVUi#PR zi(4k>h)cWn>c3VF>;^axSAKpX7`?XRJY6E0n20C?1+4zST>vW>fN}Zel}Qm82N?jQ ziHth-=6!mye^nL#eACedTWljK^FQ~t8?4?_Uu_n{%;!MH2VFS~-p7as#$=p?JXtk08w|zo1*HGIr2ABX)y^<}`@!XW)HhayfheY+{iLOU?Qj`jE!Nn9tYHnH z{4LW83THa*q;}N&C@S+`tII@0(%}&e>3x*XH*XM5aE~5A0#PBKKiVLqO?itzmtOzVf2&o=@NW(6~vVBDLe-ziz2i-&vu3Z z5H*!edP^7FmF@TA?@ZJQu4$!n!eC{+x<#U+7?Z1;qtS)JOl%G z^yuuXAVr(8!`Tnsw;Xy63a?ULajYj282&?Enc=6=1teVo@40 zISeol>HU32Pm-k@JYbI7zsNE?nFKJa&w=fPp;wbh`E{eO&-IGbGJ`;%^3SEQ8%x>T zCJoc4GmgQ0XUb&UW~(VK(FvS-gW3VV-*BvSZ~%1eS&W1r?-U6+u+goVacbW>n#tYa z6$+c~0mKcvrR_$kC242t7!Mc^i|Od*gvhYnQC$tKI^@8#{|8B|gu4t59zE2Nso+2E zJeI0VbD|jGt%_KvzBAPs$OW$`LN*A!KnFreYe`8oUrEp3&#L&S(S9v4F2$TJ%{A_k z@0k-@Sx2^OE)Y=X@QSud`YtW_Aq|GszElyLsIrssu$6&XF$-=SWP$+fEkl8gI%_Vf3Z zXg&!fZS+O0_18vZtQYEy;`0-nt_5AxfC%HmMD_^hut!_zLgz!aO(=pF{Qg%a@&@JR zFV))-$2kk$>v!_TW0~cOX|l+grvTDzTcllDd4Y+oOGp>6eX?o~uFtQOCr963Q�Z z>H}iX#vb-877W@LLTd5xm1C)(y;Se@f(4^m7wneU)pJI_-@HY084V>N!t>vbXEz86 
z#+W`U27cci-D<|SU&@Udy$CuI6#b!QFN#sRJu!4@=n`cVQ;E6H(BIeylG%n|{$MOp zK!Q#*(}t&4^29P~Yqnx4?%_?nmQcc(Zyix2I4St)G-ci+>~$-N2GbRN$-FIbOuB2d zb5RU4bq2P!g+~4-D-PWkR8@dN_qh{Qq)}CRzAANzD-pl4+Mlo`7Iy7RSX+$84ut** zS!uD(r?f-Nx4=>$XCrKc3;P<~+902?&z6A4%-~aH`=z!=+x50uMf!7gpS}Q_#GE>= zI+OhGd#KikwqfE|!uN!3+$o56`!zXo`uo??6Pspqmt`Q)I%h-K zl#i;Fb6;FFt?I-k4T1d}vZY;MB1%}SuwYQ}DMoWQWc;QlUK$Y+aj z%u~EeQOTC>{i9d>pxd;HKitOm#iIvJ4KS#IEQInNjSi%udRdsqdLl_Bg+Pd0BB@4&YY zSG(gWjd$hTr)WzXXSF#^%p|zh-`pklY5Y7>?By~Q;iZQ@(H9ENJm`ayiw2igFTR?W z&}n|-WIxxSoh6TJH&aU(a$wy4p~Zvk{EUc(f1E0ZM47dP)=c=SKr=F~Rx+Dv+>zd8 zIExYoeyG1#VCG(zRSOc;7`+MIvjML>?geEVhx0M5Ne3!DtyVx%P$k^Z804GT*2Ur0o$BKIQ9?0RqkZ7^f_vX5=Xx zI>t^pgf%BW=c&?z4u0ZVvzGBXF8~jOuY)$f7+lLzrm|id@CFcboR%0=T{HHmBpQY% z#}q=h>0yn)J=oxQTOy^xoScO$zY$r9tzgFEoXM_zITu)ME#!FP9@&fsL${800;H(vvSQ%Y zufq}L%p(*Ojp$P4N6bn1T-H`X7YFI31S>k8*FJxg3v1l?#g{9G|52e#>OZCkp0}73 zrO7JL(B_QG-+59`r%0qRZEiQ$)f7ry$M$WHrXQNwYJ~eVkY}#)Co_2N%2=g1-}D%t zX_;==<7b(weBjXow7~ZK9oFN&L*G!$BLqaPkxeQyMb*GTtZUXqF) zbJ=>Mc&mEGRoKw|SI#z{&KJ2M%w*C2IvsA0%@wdJX1}(TDK5<}zc~`>L`})gb89cn z%$Y+5=@OX*5JAD^NB0+GbMHCYrxbnX>iP#axon4Y}MGzUVClvAhY9BhU9sZ6@+;3RU=Z6JPBO zN|cjI-FF(d55p?S|5=7#4TA!!ZxIVcLue{jQ~caZRYva>s}7PN>FgP`8G51=fNwHB z-zTHIuOLsC)m&G?rOw83q(Gy+`Z zF4+zowPA9(&W{^}GbqV3=L={q+GB_3`n$VxOlOKXa@A#wI)^p2eOBqA)n;=slI zJ)iYz&zHEH4=00O;*E{LP?#9%<*~X^%&oP3XM)? z*Ub|BudV|(eHj&41YQuGd1g<37*5u)es7ot_!#OOij1zA($xweyy`h!y!V!Qy2D;56B{eq+t5COtU& zBy;;%k=SIzoOPZod^k;!;Rq!AY=)4kmR&Y8&wo%RM4sO^R(w;pEBJKZl9mYC>}WBz z#dsO}nhC@Jq}4?h70`=TRxRJ{bmu$8>ro`yNt!F*L#{!8qLR2$VVEcpVY+n%Q2_(A z_3yg|uL_~<86=K`@|(bpm~{gnMcRIsZ=gnKLTlwK%4*?(~@ut~xZu4a-2Y!4@>6UB}f!Wi6xe7NYa+Q9im zBjjKn^?~1g0llF58C(cbw-a}kQ`EUFrbg|*a=0a_;TxkNkZXYt(}vgfkV>ON2QqDqCz&*ZYg+4Bg3=%DfT7*fpAiEqx9|`R0!a z*9i7n8Hr1zv&H!nSX>Pw<4s%Ts}&%9)S-i}mc99bA52j=S%S}M;0Ug8nCYGGgyH$g zf?H3Ee9Vn3iO;UzaEn01Gk@T@vet-doz{w9=(@S#4kU}b%a%B=k^bwcu5m_0fvtj$?>#L<6OYsV2HrR^ z(g|`tUOSadZds6XbV53|FAacVNmOP~1fzpc6k^PZu=v)M^K+MmT^fmt7!K{1^pbkT z4ZL{1@6fxz@X;E7%q#+zQvW*sYQ@oz#_NwP-eg$~R7!q(=W|i+wlPcC=FfB~b|#vb z>$AKNmP$Kk{}QAu>p5N*>ux*D{=-W zQ*XC9p3-z!_A!KG;hjKYtcAaXj!8}7k9!eTtER6)9}{L|vyAQsenSRFN0oMzG~)cC zc{2%%{9r?|Kmrx_kj%QVP2-)E8mCR4A<^8YVfqoohd?$4q#|of3#1djWS7ww;l|%J z7`<=jnJqjh!MbHJg75;}vmO}u=A}m)x;wT-{^E(>sgQ?bKhcaae`(A~Ecp2@E^n;d z&Hx5RsE?YU({MhkH+E)w_IZ84r9YdrQr-~WnpLsh!2@Kq0#4P8`v)WMiKnGS;Ib%1 znYa$z?|mc7dYR(#zUJ)WM*_PXWkx(H2c5{iq}6a<3>+dP591L)CKcVF#k2T4Afm>8 zC*lIK+UO`YbkUL@gCX1xcTr5V{d-f-Lc9Z8hKHMnTDZX?shnas9<=N47jH3i-nJZp zlvaLDF}Y&mpKQ9s_8B+IVET|`IlP%vgE#dAED?r0yTm$NXTy1%sUXe7lA)6u4f%N< zhQC-ITs7-EbY+NL^dyI!ovQmwG`*<4^FD=@Z`J*XCZ(vQ2I_~WuV?i_Is)a{Jjbtl zEsIQ~utG?n_I+Urz}eS)-Fr=K_x&Yy6_CR{l-X&0&~*FjfVvgSs>MK+Qe@Wqvfb-y&0 z*ULvYuB31*j72F}Uvdz47g0Z1n3t0eEwhE_fDB!+I59@=yBySIHx_Tl552xAz*PCE zzi(LGG@_dp|=OMTINt7c7*48M^|$Ja5QXA+?Dhj1&R_cockhIz>TR ze*|Ky%|jEiDj9J*6qw?Nxaai!anxg9;I>Awn$Fa^sS?~3dKSkKqbb){s>_5mb9+-E z`&OLv2b1qBuWo!F)73(p%pK=Y#}bqi={&NUl&&f@%A^S(_Co8h1Y9aTsFOZHaz4Q% zCP5}#y#Hb=Leiq9Z8%{S(O^{2I1akuXG^9kH_foznAvG}Yg%aAefxHO<|~u8wAE|$ z`{H~>W=`c;=P`Gs5(}Y28AnnSE8A2wl1_{A#XLgFLMnKI#xMEdrMY8pQvBa+PgoaR zmW$E}o4&13JvPZdae4ln%;)(`ZNmA{#tV0vhwCM>cci4!FHWtipoi$1?cYk11Yn+1 z!Nubc@ZHyKbG%b4d%iQ!7uA4r6*gs3d`O63McB}}!YqXZOhI@26{O%yaNSS9RS*yP zidGNZ|6ahYe}cDS^br>Geg8rV$0!8XhWzfVUghl>KU6R87mGVT;G#g5eHL5*4F?kl zQC5se8CKI`TAD5e_YevU4Fam1eH7{AkZTiuxtN=L@yFuOz9IYp;Xf~dSVlPLO}}%t zufykGo#Z4LE|Q(tlo8y&5F*s4xoy})kFQ>F_d}JiMX0Qc=Epqs&Xj{kv_hgyRc4Mi zcve#i4>iB6{$Sf^qU2U@UMUHqu{7q4tvScUA^fpxsWC(c{bH-mYWgg6saQbDrHz0i zsC?zkeciU# zM+`O2zA6J^>OH{_TAPf)Oyu-=f6d-1y*({4J^@_to=WP$IQ`9C)P?W0-~^GK4@?~$ 
z_3olkwqv+ugrW(K!TkyQ{b*hmL^NU`TV1x?@AUes%+>p+^&~qM9`PK@2SRN>M8D&4 zXy(t)>WDH^<$MM_+>K%CH^fQ?rmt7d^CThJ_~TrgqlC?`F;j?H(a%0&Wj_^u;oDAC zKS_N)_ zbxehKLcN~MpAHutR6H`8DC`bsp!QR3X*5Wm#;A@^vF-X65Vf-tDx0hMR?5+|QMq#s zqS(e6PholgrC#_l{Do%OF*yWO$x-`JJHi&z=|VoE&+Asx7Ooj$esohy=1o+Hf|Ne5 zvK<+02oaTRnfe7|*UT?F|GFbIc5`>qEnKOZ`R-+^*hz0ci9E{HA=by!ZXl~>OEu@} z&dIAu=(qxD3O%+eGR%2(PJr`+0nX}t!J$=Z`Kf;2w)Pd)gx_U>6ml3pm)0}vI;#fC zAmeS$%@lK2in5n>u>os4TM*I4)uXr+?po@GKWQb!QJ%G;==s`)=o61m7@E} zSi)!hS6=GR&_-2*KgOKlR(~OTb?u9A(__O3hL_~SRBQk1Zsg{cB!~;ltYc?mSe;_L?|C91UC_Ckcwr z6m5;>4N(hd#$xTQ$e^H$GgC?L$!j~X-6`ZYLJqNhsH}#T7R+5K&QCTC^!YK`P(yw+u`K zkDhrs;0=t@j@(##{wMMicqaQ7%zvs&551t^S8BMt@Ua@NP)Q}SJ8EaOKNsL{_9WB^ zJ21Bpg{8j?cqg!<1TP+SDG_wGFj;PDvqQa`Qo8%Z1f;LI-J}7rD7N3329pi$x~9~J z1vDrfbAf0X4XFE zr-v0r0!(?^j6~<+5Umq86W;~r&nM#iLorlsXVZ zhZGKtuNXNO`JBdZj_g3kBg2{~(Ld$||L9%=4cP#U2w|_y`}k3qxnX!%Q)fz?twWtD z%5AkE#JJr2K3n+z7dkF%yP8ZE?78^xyPaB_4$ZHcn8j~q*Gg4Ap^6Dj>HVWq{_Asv z({KU6J@1KKT>c-}&K0(d+>;0A`I3zwKgAz+pMP*(I-Y`i(DFS~H|~GaDE?J+wUY+I zmWR;SSKEiG|L1QL2O2VgPMu3yE9qaq^zS#0_194WW=k)<%<5nALBQ98`>OJ#vZg`* E2ibb<{Qv*} literal 0 HcmV?d00001 diff --git a/images/lite1.png b/images/lite1.png new file mode 100644 index 0000000000000000000000000000000000000000..711330ee7661943543475d862504695ad4b7327e GIT binary patch literal 258476 zcmb@uby$?`{yj=bNGc%GDAFlIw;s)EktwZzMYhwYyj%sL zZ}>L#^bqyT-+tm9#opG%BV(dqOAIE$b4)varM^MGD=LIK0WIcUWba|*gUvBm2Oj<_ zxil6ZlvgzoYw|P~F>4MqVX9YeI?AaOkwyLbR-`!)m7`&wHbH=oWYsiy0zocf&pHP2 zZtb3sQFNrfqDV**ApGP=`rT@@H8}b@{KroFTfXFhk3|C#uNZ%duEzUF+?WU9;w&C(IEdvGGyYY zVcu+9(?}-rK1|pP^qCP~UPh$|ypsK3g-v^c`7m%uSEIh*`!DM z(5{CcOr{Im`Mt3uzO-aWal0Vdv3^J4JJ!wFrXitV!4UL+v7(d-dNoCscUqHbDS!To z^zFBo@>q2K&#ml`=W#E8W-bE|3IKIHx0K%50ok^5?Hg>!Sx;N7JitGqlm+eu74>2FI^D;7$DEfDVw^WE( z-o0f|Bwi&NBmN&xHhrHfqsIA87?Dq-)O<}b!kxw?5e4ufLcN2RpVr|V{;D5+V&H9X zjB`efE+%G%PCg+f9ENM zLroU}lP0w%1%G#-E{#;8_g7W18b_}dlgfmcQIQ1FrbQ3SJCNvne~~FSB3nhW5IUJj zHypg}ScUB2bMX;Dt@Gs%hD~qqCuU|;xlRst3~n^0PR25eBAhQ@y>1vMea?+2?OrQ( zC)6P)m#(7jMonxeMUS7eEJl?`@1z82()q~U-)t&OmG|aWY1*1xnRJu{ky>E>vz^Gti zFll|Dd&0cLVXm2K5kpH1itGy$;p`|?DL#dIDR5>%fkn=4Hou}q9=}qxs$=Rd-$XzG zWxkHO0AnzJBx5FP_F)=D?qW_$`iPlk6@m^UOL$F;rnrENo9Jztq!wv*m*!gjphlr; zq1G2Iw|qs-T6J9Y)FOjCgLF^P`wG`^RPAw)%-gz7m}eFpdHNZx$WE?{n__cU#x1J;+zsLC=N6a z49Ho?wNl=sAf+&-Q1g>xg?#KZ0@%Ooe|byH^D@;T)B)M?x*^zc&tcASX)kmbb9ikp zcK2YSvE))FHJ>sgwYXKR_1$9&qO}o_9ct9@Q_^<~RfxJu>MCS8jGnyQtAkv8d>G0O zwar=4Y?t%*knEEDDETR~T9Pi*HH5eeu}izFFeIGBIcZqoPW~Hhp?p|^AaIu*Ie~9L zc9X%hG(X+EE`Puw*u2oZHKAHNLpyTDpz?N;0Hg%Nf@y5BY&O53ees#aRC`=!zWj}@ ztIkvAsSl89cN-I{Sj+0+f}gty)ex?^hT(?E8wmDb*OFWN&N#9|Gkj0TTTh)@$5_c5)=$9-rK*7`myqh) z)Q>E5HLy5*vAbP8Tw%2<1sz?K9ZSPL9NCSj%>gDsfm?iRU$r8p^h!-j!xs-1!Hc)e zj~8~wy%#eZFw2nhmk&RzKJ$AFy7RT@x(nR>@HF>Ky63-BxaWJAcnC)tKz2c#Lx!Nv zqMANA#BnEHfB6Q3pWqC#F_<{WPZ9DY;7L4ct-qPy8~-9qFrLMWL}DG%S<=lH+<<)- z-dBPg?h;oDHP4dzOFmvoy1X9<<0X^{4GGsHx~A`Am3=4Pj~0Q9s+3$l&oH z{g6xCPEtKbrS39Y9^M(c_Akd@TG}hxNfnF0ZsVgKiXIv0Oj2P~Qk>g|ESkO-DTx+> z)kYRc(*}C44}WY5S)ErI$YdEe5=1ISac`Qpi(k+rgzgi~w7a+772M&DMdwh`4>RKt z>Lq^U`Fs+R>zD4=I@kfU2i6R5q;Ppnw$J!f)(*Ve%o;LJ_##~EVm;MV1{T;-+XN0h zR`!(VHEkb!U4Np`DcK1N67a&f&8~T0{9e@_ls@!JA^2G0mqc-}{)%F$oknH0A`FJZmMLkv>&(D63K`9%F8-5!pOx-0#hMjsE4Z8=s z4245lb()qs;@Z@@V=BZNGj)@n4o<66r!XeCh?KM-I z$x%x3OR}-Tm{T>h($FuDt>g0Ai?))PgFb9t>kc~k9+=Fv?&BS7PZQU|)I+5d23gzn z`z=Ovw5=`=0~<^CF&4)@Y$?~8uzj|f5iV1zIl4YFZ(W+EtZ&`$-tuYijYFHF%%Ef} zmf8i5Uvi_9@RO*K%;89pS(~4l^PhF+i8yg4achx}@Kx;7Z5TqiWI@q-E4X{KJGC(=br_;9 z$OYDc4P0x3oXsv{x9}m(&3LWbM-10;u+nojD+7uLG((@Q0F1uI1Tnk~4^^KwuU~z)Nn-PedA@AdZ<8I#ygihQ(?-+! 
z!j~}+5ghG^C*MVy*6}iZR#^%4JN{_29Qq6)ix#0p8Wr`12Mf!62DNt)b@_^OBSo3@ zRAAdwi_Zc`Fn}!5(~3CJ2SViX(h-MK2Mk+jZ4d&&3!2{_h%%~gju8+<5oF%KQ+Gk! zZ$|SrmU2Au$Z3~Xg4zK=1~N<-uP`uVzEcH;^m_6lc=8&N!h|WfmjIi@`W>NFMfXPdUtRP_>GSq&>pSzOevWt_a8YC@ z#i>Um@emOIT0RqR*lUj)OJb8z`JRk z6T+{5-r(O?pZVwP$?vb8MC*>1;UK}^>7T3Y0-Zhp#Qrkt`R9mE8^Bf+0C7BR;PpLU zqFl!nq1MMwtcqkSL_6DZS|rQ+iV4g2+!~D2x3_2>$6SZ zltdKZWA0Q`mDChkzPvVVh~pBkb+44y4~vT)7M53x$~C?U&u7|sO&3tJk<90{)h|!$ zTCG+=RK5&qyvqu_7M+Vjsk9Vey>1A59^W1}V!~H1*FR~J;HoppQngXl62}D_tb?ZF zs+sbu=2+hL~XJ*2gWGD{yj z=e=j{E9LF{{FX2m<-3V%&*!qDJ;ln@K%?YZwS|y-U_ySqG&jZtdj52 zm)HI7`;TA{eT5gX#CO7b>lrG}mrP1Zn%~rve6KqjZ7f~)zB>8(GsCmg^!yPJC)cPr z0DplXJdDN=U(gmH6X-HvmzJ<5kb^8YItbD)QH%!_Clw^>7zF6B#fIs}O)6+X;^ega zt-8m@O!kJMV+~P9lc=@sbN<`AoaC<1mAjTb5W@Y9< z9#lj;)i4`YoGQB*hmAr{z?f@P7iFmNe#PQxJdc{T24<{Qt(vw3W_&SblyxWZ_K1ck z`hNnC7O{4T74Z2p=d1@r($&p;-<@a>3wLaPnqK0XOF&o=wC~b|i1a14?e2)eIxTE( z>Efob+H)!E^KwqAY;l!Sr8MuPa$>*ME+I7VRR#Y&heeSkTpmU7LG9|eF4gqD70go$+z*wA7(WgL=zbH@WsGC>u5ly0Cc8cK@C+WX_Wg8J9IF0){ZFap40UIJL;Y+79 ze4HQ6>9mWYkSrvDas4wx%bkP50%k%f_B&Z4-mud#QTOy&3m&U3&MHehAyWJ&a{m+|0%{+;(o zA8>Hhq!`@pWax7DmeYj|yB)4;$V)+hg)lkdi{@pGlB}egnq+v)LE}MucQKicj(Jwo z=~^v{wc^F4W9Ri+^!D~w`ISBOvnKdMg~xF2ak1w4vyw~7J7OVOKh;B@XI`L|0r_?!(hkCJ~=f>A&=#2 zL5*cDc`2COQTWm#Xq>+iGI(=i@J?Q<11tLrdl%(E5zUEug(A_~QXDs=FQ@0nwY6QP z9$;y5Vq<4Gu6nHbbYVZS!c=eWY&6n==i-w%P6}{#+)luO^Tp!?N3a@EY66F;p4<+n zK+K)GxoJyHEGvPV{kD?;F!*togD96(sB&W-Dj^r24LeYVZyb3%sJ1<3>e||ze+WxL z5vS20YIDlJCo#ZkY#_QWUO%NooJX5y&cM7up}~EuB*}`0uQ*ejhd5>Ph3n4Dj&%dS z6<ek#scV5-K7>Gc=bFbBUx?4xSYu_~L*rJtp{$?p8H=4)9 zQ7^!tfqy|kadh2wZ6!AMV0*Z{>{u&z&-N8?M~H^QzApA?-Q}^`(GkoqZ_#Hz3mk&z z8%yye&&|y;s8VRhl&`3`c<>YT_2$VCT6iD$1w*__bpvT zr8b4J%(3V3$c|ki*%}R7fa2KMdrI8R0b3w(nyAcI#-t0ixiBqdiF?IdhAwaY^lI#L z#D8Ip9$dtOo`mhaOloJ()P}=mUA}MYq)A2l>IQ~Nj!E2V{TC_fy>pm;zwYMzQlYVE zrZf%|afIfr0oTD^e6BnDUmADT;aWXM z>;sc-(Ac3+U(8{lWg{=zzi#+nU`vcPBE22_Gm`tQ!&ldE&-gAh+=}jbJ5H$ro%|9L zN{uO-3|rzMEcDp7$XngTcP{ogJvQq&ldbS1bSK|V&3Epq8~S%@2QDA6@&s><;Ok) zuk#;eA)WK@t6ALgO2B9WZpk$1>)m5|Vh&NQ^T~#~ogL_oPZ06wd6DK*^Gd!0pV+2m z#Eq;a46U^D{) z1@77xP7mI`K#C`?;rL*Nw(UM@H&J%nz>H%LYnSpC44ZXs_U|j0 zWw(vCx+_Eb8-zWP6&3hm%r=4Fdc{Y6x1Vcf`YE|DN!RRFfkzFE%}`fUqf^)Mc#?e6 zCVsuWI)PJ8Z>%P%mu0Z#K&or9_o7~lylM2{)QazRK5lADM=^kZfr)ctpf0{CqmepOVwAZqm4s;A%Rr@WQ;FDq^3US2+x5pei6NAb#J}J#%9C@YP?5 zEc5xInAsfZqS&qBBPwHYL6=Ruo*wEn%73S74+bJn0O)w($nz4d@VZ-po{ZzE1_im} zqT7D=IBWpuVoTdoiI`~C4kc3gd~0C9FYwEkOOVnR8eCtIoFqqE( z1&=EU$Qv38>Kaorm1>_k0bJ66&px~H_68N6o2O|XkO_-FGQRIOzz|t-2 z{f$~`nu|@~4_6$ehQ8Xy%_j1?jltdnKF~-Bxn>B4Yo$HvCD9578*= zwf#LkJsCB8I9nD@ya8ANm8U%M!=h8-;~vM+ckivSHPmazx)y3x9or=-ipt3bgXRAK z+*47iYwzFz)u%A~Sq(>029j7I2pR+bn9^7#Fu%f%@@VjbpHGsY9u5KG-+@v1PE-K+ zy!@q*+oit4HYoKKV`P zq6Q!f=Io-1pu}41OoqiZu)T4NKQ2XD!WCJyfj!0(a4UGBbfZDD4EOK*`!{(_cy0^g zMT>hWcj(4H>Gk8%@&W0QwIHJ+2zA4Xab?X~-m zItF-rV92OI*Er#ry*0eoDcIXyK0>TaE=&=cDVDPTIWHl(F!}=HIFOQ9mu_y$!0}kN zw!d~`AEaxL`h|~t*P(P6o)`wFP7Q|SYS56m3)K{00Uh^(gJU9&TI@j_a5uq@M>}gH z7~xXluJO%_lx$ltpT6r}f1cIj20>naF2H&5os`#I>Kojlj|2|z zM7ke0tv3?jaJa*Ej4=3Sh^#0Fs$Z)Nb_57Gx;T0BTHF4vIHHKpBHktN+3^ST=UdI4 ztIstB)8nO=w@BsP_K@#?i9<8VI{Ia+i>S}7{^YOiNE^oJu|d9<)zXSnYo&pct$N|8 zRyoB}$eXIv>8(&`ByRjL%iEaXpMN2 zQ;3mHwt3Gfnte^XhE6HOSDWe+;tLkq07;p5NoXP(Q^(ETIXMxpPb6STpqj5N0$A^0VDV@KW8U{LeqwO{CqhVA5o6x$kMyQ zm|VcKDKSQLVI-D)pMJ7G`JF1-bZy8ypg{O5^c0GdN^_(&m{J4Gme2GFl88a5zg8Tf z{7x^KQINWr-`95!YR&HUGQXDtMz*Z4!SCw>4CM--PS$lpa}cZTbTUI6Q~}_51vpgC z!2jX3QK@BYO4p_FGBrUZj3DyuK>|NhDAN{qyxSe5(LAFu_!nVp)%EONzSR`#!$H)C ze6Zn-b#ta}qSZinNOAN5sFdF&dv0US{-T}*&XtmJ92>OE;24|@EU-`-#sKWHa?6JT 
zCh(Ks$X_8Kzt5>3a6V9%#5}4Qh;UhN=fpo%T{{9zGL+74s;v|h1g*O*rRS%=;zG@& zEYZ#;$@uUe()Xu;9xG72iMo**ICH_pcDfHC=X}2+O*m0|utv%EZuJ>8Z?Xrq9~HWH zTfuW_{*?a0#QYXVN`>&t(e+~h=e(PDBKJ}r>REF47tQK8dNLGEMNtvKpO=DO(uI!+ z#0o9cqs?!#1Zkob*(-HKG2s>e9KFZJ=fmhnK>@zrur--P3KHXhZYl*_+!6I9iP`%G z9@X|u7S3*c?*+{7mDj`MOX=yKrSdZJ{bavb$O;b;veEpLk|GtXmpv1ImV;`2qgR?GWoCXB@ zE0cWHr(B687soUr>fL-Un2~trm#ZaO&XQb&B-+YZ!lyLHRU6C$LMVrssRNOWZ4)z{u}I_7nl9 zK_@SWeGqJtRAhZbzpuR_WW(@h=#d)@_%%TgG|t?|hVkv1#qS6Sw*L&#=hN%um1PYD z>CIj&lLUtO`(s~D0KZseyKl^Q`_$Lh^M>*%g7ODL&HlT)yV7K%yGGKLKc7?;-_Yf#xkRLh&0Y8HF$T)5ECx4$=rJ3kb5 zg2=eW!J}7`+V|y4U2F8r;Pp;v7D0XsxGp>4`uQdG=RM=w`9d6n9jEJ>WDo29JrGN- zc*-GRqP^+7+@j)T;b(YQ&lwcJB&6JI8zOA}vy zjvpKD#lIWIy|lSpR5+L~)!^hTXmL@YtWTVf_^ti^*IIrj%9l)IV_Fd|=}MvM%NZ$~ zKwOv9?fcc@*tNGUBH-=W_}Jl ztNGmIl7vNE6*J@l?vY59RCb7i?ti`KBs)Zkb z4|~1^5s6mxZ#9&1!>%xqsLEVXVlsaOYvBZ~->N3}otE`x_`)____S6{Dy15r!`F?6 zX+3fJoJ64&)O-Utc!!uSfm6eW_nbT&@5i>F_Dg_v=-Y0O?7NYlX*#k3ss@VZZZ3{* z9_~U1{k2(q?`Nd!m~re%kK{t7P{3y8j(V4P#CVEh(s6AwfZZW;&~H!lO6C z=?xR)OsQ~YaO?0IV-8botVC$M;xir3kGbx9a;d1%dTdKzWz1%r>C~_P%gUANedZiW z%A-OmkkUxu6>>uf_I8D|!~*%2nr>oYy2$k-r(^dGQ}i0w`KMJ}A{;v0OCMiNu`Ds$ zYa6uiZWSi$ReZI=>p}IgY$4((j|~jdZLBou9^@VsHLAwAYHq#nD|T|@U=blb==Tqm z-%FKhzJ9g_7m}OK_O-d@@^j4TB>KQkl~~h8$GqY3$P(~OHV*ebTYb-kon&5C;HPEt zGJ7Og2__14oG;c!KAd49h!Zv1KyhhP@jzP{TDlH5dJqhQhLnNo14IhhY+WFRKr_>f z07`?+^0$Bhez-O5)?Z_0X<*drCmu^|;KQyHe)dY2@OrUy+8ZA64 zBd=POr`QozA_^x{sobhraHw=0qRYv3!8nkSJyxBzgOrmk&-~)zsvf_N(o%8M6_Bv_ zl3$eZm6DxdBS7hO)6Ki@j4tN;E+Ld3A_LT-Y-}*HTBAIqoR{uidoI(+zI8GXSYf^` z2bv`UMdGzIRNT{2X{G$HeO?%7_*kJ%zays)RyNlcIwuzrQ}Q@8%-<;ZlIIg2Yc*T1 z#uAxgA-U8E3?J>7!6(?6m$h=cv`_7*Osc2UJ7|#ttOU7Gcp3}FR>rLSg@8V^+Ru#Q z3navuPrX_I=WG}kY-v!!^(+H>gB>{Hxr~y!v3MR4U}6#n`gU9CQJ_RdNt&~W96nY< zYQ63mO*zTnkH}a*q6w4M@GKD22mb#MEJ7Q$if>HKEzX#6xra{MRQWV`EL8who2!Dh z#Zob+9~cP!f)q@&Z65Jh^D%K#swRW92Aa1o1$JF%x6kl$xhk$dy{y)L`{X74Yot~B z(aL8V0zLD3F{WL&6H&)&_W@TA2kFyiPaqTXLzgw~!tp^dIyd}!nuB3WvXE`9Y_hhG z56rmb)VwcEbHjL@U1at}^J2a-GJl@4b0Rz8n5&qtVXiFx^->@xXQo_8v7BBYGXvkE z#=%k(+FkFyIOcdqMszJ#qtS)4{bDh`IweJUF0bKQwII(t7f&F2t5u{|SAG$AMe&Nt z)?3s7-3;xZqceEi`R=8o$O$htrB|FJu8_opz@;iV+n|ntieh;$24rRcs_n$t@5cEP zkSgGGlwOn4Sa-C%aV`f&V8A>`}9Twn6JHU0A~&=3afZa@07v6+*YgyM6v+Q`3a1!zq?R zXtEj(koe4>l6fc|xs!#ajVI&Es^i%jZ_p+@%{)m+c_r8G5?{JR4S`<(L|)QD5{MFJ2@$JnaXtK_#5(lOmSDSJ9udAbF zQ{5)8cHk{D0Y5@06iq2LXAy{_I^}UWj4qI1qWxucrZ*dVQ2iIq+FPg5IO~~x-lUmc zz1MEc!MX_o8AjR+C$s3gJl?$DQ=q*?xR+|LSyJL0^EkD+-Z2m-6!?N}cw_reV+gMi z;yNXH4NY};D%JRIW^0Gp*=B@hmZa{(MdW>z9)c8)2X+b+bazgj@18mCx9UwO-1G&m zydrVm#dtc&tt!4H`Jl(9q&cty3L&9-`n$zThy_dO zl!^;}(c$}5Ul4eO}!L;44Y;MsQT~H9AMZUKmI= zMy5c=;3LQRF>&9mhP)#4W{OBFzcWZvxCgopu9StVuKPf3fRY5C@m6a?&e-W@B7a_LM;^J{*}(S^jZ6?$4BQ zfZC>-2gDu7{TvZ2zEhy44qB;G4cr*y-v>TJ*XHp9YyKP9d_`IUd8jMo*dU5-oa5&^ z1J@6sHAA>Qz*d(JC_XeOF}VS1(W#|&1~sllkMshGl-VaOhAJVK=9{56(_mHc*!`;F zlBJG0J{F`Ket(CFQR!N*ekR5A&1hibGWDxR89};9lpab66F=0xy|0 zn9}j40Jp#|`Tv3rI?{ZlGH#8uyykiOk5X z(A@RY72;NboUbBeOV{sHnXdKX=HiOIExA2|3H zmj45=BRr7$R+y0&Ild8K^;+hjcOOUd+~$3J1}Aoo=(b*6zSAqJko;Z%@~6s7K{Y@Z zj!;RcgX2#3lveI6y<_MzJi4K;nbL8C6o(!`{*9E&0aRi6LQ%**jEng7eikH7Dhgwo zi*8LCgB+|Y39PGBvfssf@7ycuGS0JV!!97gmMdtxi378?ez`+;``?(Oz|v2<IAAC=&5H2VeSWi`oN179~OD-4!TrM1pK&`a&+qE^TuHOb0S$HEpz9a-kgKV=0-qAA+BKrY=5fw?ehi`K1` zI8tza50Q%Fn&JfmO?-eu(+Q^JGh{?OTl8Z$@kK+|7ces$Tn%n6m@G14BKga-?T9rq zO?~9nRM|qt{GtZgLE{%_eq$!<(B9#{ydvi8zw<fJ;zjyAso;JoSbX+zWCZU`=-rzj6YbD@t{V5|h+pM?hl=HXO~(J6aDnvsh>M+VpqPh}414Y`Orx&L^WYr1?cn zaq-QuEG#Yw`*r2TejCe!9?kLEUsXi9WfYq-L9B}DY 
[GIT binary patch data omitted]
z^V9X3=^KxLEshSpqj_?%t)vdht28}fzNwuDZkKjCBZ5_C)hhNVg~cyLm%SQXX5IX= zCz1SEXPJjl6|ZTZt$Ar(-tJ?#G>vlm+d_PgOsYQd1TZE>XP)2UW%_R$XKeT53MTyW zGM`hQab)8d6AznR?(Qoy4f@F$wm^C3c0w!W@(C!s8!DyLwB|qRY^V_etf=zN=11lH za>di6&zS?h_)As2?j(!RsTUm=&9|GjeK zVKld@5B(1A8~$tE&A{?Eth&CktFeYNbXOqhdK1*mkH7FR-kUwDIUQjW*6ol<*q|Qi zo4H`E`z2}U=F*oyOVZ4&O1Q)52(#pt8z_Sa0Imwut3oll0u${3=^!pRuOK zfWB*xK%U!D&#-e4WDOb)9V@j#-BQewFzUYI&&9m_jBthd~R9)mnBt1QOiVV70?^Ra2?S93j za{^WNlHop569>a_19C^Y9huzkPijemtmOF2rAB9jP=dQ>To@w1B(3Xao~8d7j3^(b4b{~NNq z^IjRG@(04-{GjVnShOsF4{ThN{CJ(j zs&+TwLy<0XO{HZ)>~}3P%Dc)5XDoL}sJbqL_`YJ^4o7H8(wGlgT8HfqUkH>?|FhJ+ zBN-0t#;|UsQ-(dSrq{DfkC$A~t#FYk9TwBao%xLr%*I>T2_TwD2;?V?Vjlo~$I-g8 z-U5$1TE0O#HeH&Hu~Wu%;wNffI$`pQ)Ov9IWaMNyx!3CnKp7;?>1c2)S*-9FYutE9s~i5r3n4TWMUYejx6D`k+oOR$W!D*C3xAaDmf=U)d&%q{L>X z?BLoRRd%aWu5U9u)@M8z54_12yB-9-AD+SkS;bQWq)9mR9En|Q3;~+;lu4RO|s)0#H!p3cROCa!yi=N zS5OSZ=e&3|IYhrE@-|fV>Zv8O^-z|&I(pG;muxgy+z2E7!Q=%;bDs3ze*F4 zgyVMElIp`%3jNnUCLdyK{yLA0=!8zk1XRe1085jFVvP8z89F%x4gW9B-YP1tb?X)l z5}e@fPH=Z8xCVE3cXtgQ+%0%;cMtCF5CR1ucp(K8a%=5>uYJz>_jA){Bso%i*86?RxmtJ`=`k8vwL)TJ>K{#+FtULI7|XZsqc<H+|{5#H-^9x{A|c-~n1lzc8`d6RQT;|3Z|!jG3VM8(@#S0%nQH~38<>#sqZ zNWcuUP%(TtLrXIl{Ei%NWm#K0TXjd6c&Y(AzZo@3D)*J7DnMAn8anz9k2pz)Rn0mCjG2ACWB)qtvon1VdH?{8oJ1 zFiAXG6WRv;?I>r)^v*+pH=CYB7UM2DZUXLpufI$riYwWS@54ACK>DLGbEuhD849t8 z4$lxJsFhcg!O#P$e4i5bO@f?*+=xUPx; zCuNz3FQ7eMaFrh@9`K-G|HFYks7C6>MF(X82z=MkL(K1Hp@RaHR~{_ohtt*Tbl-;> zgVHMz z1iJ~)uH-i*5IgXxT={G1V9IQm)o+qg0W7zq{S#pU-cOE|;-Sn&pGq-sJ@sE78n~Vx z<>Yqlqqu@!V=P{-$rd|2VwMg4I;qdO)&E>{>)CnjYBt?xn?KP7%|~voW^*GlY^`$a zcD1MMaNO`qGxQ5vy4RoezuwiC!(Mcs&nPH3uxOcr>-CSBECeA$yTdef<>ZZK>*yVS zPBP2M;a}-HP7m43jyh=VK(9CVV3fe`D<3~IF)u%n{K8{Rk_GrLeIWq#mvNuG?FOny ze;}ovGSPg>B?`8?4;4B6UTyg>00rFh-euRSs35*RuMC{imyQ7xRsDCt&CJGv^aJm)SGxz^@h#oelSWN$y>ETids@w_e)4U1w4L z{RYYDUs~Q;yKezjoM}!mudhb>`mEa}*`^0pbG%55Azx81H8nOluZ&Qna!+|g*>`Jd zeaMqYF3mn`HBVy-$vlh=jD`D(WR29(e!o+ZA^OJ6+lwSIz+<+!nIttkKpWnA2sSPQnvA5hmqoumcdlmuK!hq8BFCSoN zk%*BGfz0}-DnS)baFcN2wyoVu(^aM^5L%+_{kD4>lW{99N1xTNq}TgbjvRkMZ#UU+ z1y#M4ZO_xB!^&Y$YIpD*-Q$d;N|ss>7)%Ff%Vh~wLqsC{6+Y0VGs2ucQ<3kHMeM&U zE{);DRD3?h|J&z}V$MJoMk;U{-g9Uj!wNIXmPJjGmOqO^z|F-vR&j%2g?VcnBmA&p z?fD%t%#%FHW@c)Jv{vpg7e3t|n}oFpVF^x?kBG>Od1JagfEpQcu^xhH$R6a*nEAp8 z@es#P+8rJhGh&v>g7c0%I9)9^yFI?m8&mgT$ShwW&Mev^TAS@K@@_yF|HO=gKT;6Q zT#=2X^<*do@8L8oH`da+A!1Z+8j1!-j74@bCS=a0+fM+# ze0?EfC0azK0k{pZ!)HrL5Tk7mH7mX%Bem^yFX<-;UPLI19ETAJMx}bf{BiE#@qoHt z_+$-flu=SY`OLe}KF{E9&o&OA&DmH+FMYoBTK^K=yn&_8lbfUEYyodqjhL`=B&85Ks^HwAf0~+kC^Yzo6SvP&p z(4FWR^@4WcOZRtYJoI1pqsipoUSITed6#e=Ay{PulI_Ltj0+%LdlK>^r~8aHuxu{- zj|{|~rN)jjpJ&*r@RTmx^(Hhd3GD25P+=ZwU(&Ho!oDk?b!eaY7X~rl?Rqq$L`U*N zPB#GT!Ve62i5GlOm=lr=dK@p|=;$a6xO^&{JZv3NeUDxhngga6ms=y%%F#(qb zf}GNK<5<`0Pjqd5e5EMI^^kdaws)Q!u@2CNyS?!SU)HhQ-0b+1^uRq|W#|vffzP-E zpM!`J&&pGYmrVgrsBY*zMcZB@0N}*)quV|A+N6ojTH#yjspVux(NxsT% zb)hR)Xu$#9J)qZ$a%HmF7wC*(!K{~<^TYD)CZ@3feKWHSezb@A9}z(x?nOws?by=R zkx}FB71tBO({Ej6NJ4tc-WV?>+C10IA8uHQs|7f=95YYWMY-l`fX=OWRr>3Ev)XmL zJ_cPC@ZIs|^YQ;C^f#(vjuVnizip3;cP1 zE(#&HB3ttTJ!*gtmw9n$e#f_=QTnWPf^qe5$_7-*0sp)M;&d?;;k&dR@J08i=hbKe zk&+OO%!#U*>gOLWLx^5y_M4g&JLzsj$Fsa8J+v7jDrq(>|g(=1ZN1=_o+P< zCv-G?y|mA$megr4AP? 
zaO8i7++(GZO-VMBYmF56);u@;>m3Wdi$9|4F%?BLZkJU7XofO%POl?r@LMK6Ifw7V z!O^J*8dl3e#N^RSgYg52wjmS%$mbBLK~6{FRVnlV*#QE`g9!*?N9aAgXCgnJZ3Q;W zTQumhHNufca7lC*RGM!C_CUg}->^{wIYS~OusP{2;fCdqg zudEJp^(z{S;a1ypelJAE@?`8qxO$Eknekuz4p$WJpaVO4i!v2Tgic@Y&Wn8{A&pz7 zn4vMe;}PFQ`Ka6fIp}SyLg^Lh3{U`FHfq#8_iaPR#=@BZMejbJT7knYQR9LqoD5GU z5@t#Qfy98W*ypfarwyIo_TyLK_mhq{cqL#RI%~6M9-d|BJ-z046x_^$>=AbL*bc_G zjc^@~wa+7SQcXpCb3Y1(3hUF(s-06>iieBPJ(oju{V!;2d5u_S>I2E;{4W{qw2kRB z=R^+D#{ENj+L8xErogeQGZ1hqtSWscHyc@vVgJbo=iYe->HYqB`yR_pky!Vi@73{O z2S3H1f>D5d5=o{{@hVYZi(i+$yi7`z#QM@v_X=25GHeN&D(~R}jt`BIgxEX63AW~O z-~0gmK$YhNlBf|+gv@;3D8)j)!5;U;C->#~Vqhwnfyv@1Dj98X@HLI^QGY$5vq(!# zyp*dw2WS~Br06&*XE8yo-M05Y2__C5v(vZ*o}rDZmk9?CX96*r)U4@4DECwS>$9~l zS;lEqm4kPSdM-Qc9NV@6fsYZheybj7(B?I?6f(#hO+Z!pmyt(Xx(zexhH9u%Pzyra zanFY*`x%P1>gy6|f=E~HKK@uR?gU7Mf5Ut4*+VYB7O_lrWmB^g^1<@N|C=ysL`XY4^bcpfDimQP?(mFhSb@keD$qVYt zWGT1jEag?e9c?ZS*K2Eh2pC*b)B0?Ry-!SLPr>?D8B*@n)6!!=3SQ%MyDSM#gC`4? zke`}V0=GGCWtFOatNTK2t(dhw6?vSVv9P)6DYXPp(Zz*&$@mi zr68RGRa4bQ3~KXBlFvPr5Z;1J~sYfpAxJOf5DO~&uH-&%MBbGdu2|?X&@i`MEW)= zp78?O*kJPm_s*F>@fA)zCeVZel*`~$8nPhvGsXZptq z)9q7z4U-4wQP}IXJ*G$+oNZrqjQBqoNfsT4#yu8gLJwDa;nkn$4O~jaov84xJx0`H z8s5*PFDhB8#WJIQXN$TkH^HX20gpz>)1$i6Fjv=O>}ihtsQ*BgRvcvX_`|sD zu`<8(F`+Xh2wa=OZEakkTq^GGTC6imV6Cvok}5SRA5Y_P&RUnbIhXg|62k};lMFd2 zA|h%Z!YD|wuj?H3k@+by)1%E7#J24VfiO{-?CLvyA!Q5sHg#&B;bUIhL~;Ct@1Wh^ z8A{~t{eB6nQ6;>f+-}D+S*;d7mHV@267QJ z?NO-lxs|$zCzjCf@%X`QrN#naK%2up+dsiilia6=v4R(A-p2y+)-14q7n3(1GFPIk zRvmPA2#Pjdec8u)Aj5|&HOF_M(@`>ucTu+eXFsf(D@~HdlXf@maZHN*e)}>j@faIy z?hk`!o&8?V=R!?fQG2P~$g7G;zgsKTB=GHK?^O$z$;hEc?K)c~?m6Gfqc2HkwfPMc ztY)ZBMbJR$RnR>9H7~}s7zs2q92_iP*9*>cI|u|RY-BK3YUgyah8;Q>;|%eBWfcUP z`60*5RqE8lZ55$}7SY>~}H>Vt_6pZqUV*LL3qn5g$( zHbgc8q!wlWAtGRho+Ch&xEw`SMR}1Apt(%DaEZq}9|@A>#BCsgH5mztWQAyK}0gKm(tx`coZ3b>5=fI6Ias>042b+ z{(kZm)pech&b6J*bbEmJdiRsFO7E8um_W<wL`H@i0#;W`Ys@r?uYa3-Z=CGq1+hE6TP&LSh< zC?aEnrtmKO-(F>mvnD*G-FghzE{Qr%`(!LA(q>BoTa6J7iyPhV0>jb6Fdc&Zv|YRkhzE9~0_igF=To$1(Ujz|=W8^IL!O#u z05Oe?h%y&wOSpO;^!9T!)QXV_qO*KS57tWa z7uIbaB}n!Z&Scm+N&id?sEy$f6>owp%$>_7*!!H>GOd{1Fh{&`_aoI}ecASiq+hCv zlu6+)SPsrx=rTb7au7uPYC*v_N=+EZ#WszvE8j+%-cI(?WeQ)TnT#&B&A019g45GA zr9*u;YDF`n3;p!U<4LIYLXLpVb16I4(+D4PLzODY{fSnJjRT|nZ_krN*@G{|5ap#R z-5uxl18pzpfTPi56WXdsHv}uYmzXybp=3)@^m$t(B;eX`OW2;uv>_xP$mJaHIQUhc z$6?!|29(`5H><-Gsv6>4yhZ9G`}(-ZXjS|^y;boztm&@k?WX^gx!?E9IVIftb)pjF zRLQz`u0*mkKKJ6md|NsEXzp#ym_{(sSy{vt88kU{MX zcfMqqF1}ke*O}=jj2#SqJ=Q$YPNO4IXdh3GK4%xu;!>`b{8Df4S5Lw?Yd_O z-ToF;e9e3Xx8-?mE^rWo1I^lKp92eXQ)6OK^uE7WVe3#Rjx_jiPdF8^Vt{)8)d;$M zrlsKs7IJy1o>gehWJOY!I23hZT-CvMy&et#y!A2)Dic8jrW)sU9GAXjW$$1G*TkV- zxD(->k53Q~IRv>mjH+-NU|%5F|72tp#$qWWOU4+68)xB;RRk|XEz>Db9UbB!WT{@h`!}BOGqaX(^eD)L^O)k*H>}+!?Z0A zWPd`%0_VA?#ZbUFlCgJWb>X4U_nxFpm;=!IU#s$7!V3v;t%EKh2CzjoF$jMK)gJV| z$>u~}$6XXg`>0UvL(QIc{SLK?s8@SD0q1lJ6-E|nyF zpH+ovxK|MLbYhiswBxlkbJjZv%OEXhKbD|#T9{K1K0?Aj`raW9L&;oQb_@niz?XjBp#+FJX@ueKjYLxY9EdRaGX z$lBvLnE~1?kAt>q@wI4#GrKbsyyXLCqaRXH94AKII*{rz%vCxyltaw?ZYztMFmYE@ z-oQAzT@DGw`$H6P3`h76r#heL6oP`tuz1}b#B{XYq(okC8RK&$55dQI^e~M|urP+< zEP5>x1!HZ2GQ36T%n)dC?j5ph7#PX*{H!b|;KOPgaOPT- z$F2`#c`Ckk!AzSQ`KFy01e~CA+Y1B{=<*H>0WHfr7NJHBaU(M4?XL;7s zlz+V54%*XpV(wL}dmPz`g3tzp9tTyNQyOjh^UIZZ!Nlrvoj#9lzW1$b!y|Ly5Rvgz z<+RH|Pfyw>OJSSsIp?i0xchTD(U+B%{eT7KI=*9v&a)cMGBjL{4p2&B#+=VPM2zbo zQ!?VH#+x?AAZj|}C#og*%eN_W4mf1bS%Fu+n;wWx=HA{DC;xGdz0$0EU9Yn8+m$50 z323J{ZzdI|gY5NsaSGTE+D+AnUr6`K>^w;!BWzziyiCisz+ShQhxZqU4`k%n$;UnS zwe5wP|4zKZ?)z9XJ96{tF#a~5n&_P>IskmiEmHv4Fm0OWYMM4~z3uwSkV*uYI+1Ol zsHlnl5L_#0wiZN|9f;wrc4SP<^a?Oldyp<4And}00)-5@|Ed@NKl3#Iy1Rf3*@VJ= 
z5^L4G=0Xw$nlwJ`kKM$Z%{Q$F7aGR5mT<=oR{pldPPLn%O#7tiy5`B;m$q}6U z%tw_`w~NofU~e(AiCJR8j*%QRkU2U3IwB>#p}5Ex?3Q%acbeFAOuBx9MOKNby37Gl z4or)T(~4rU%CFbb+b=gWa~?ZM1cpsoMFpCqPrXE+2Ysc*H15h1gP;xmS*>(e`kh_n zBRKZLX-p_bM1;*WEjr@^7Sp;FWRBRgQ`&VhfBZT@?MV{}Yew&o_xm{BW%P@70{s{3 zor@cxw^f^TgWPWFQOPdnFrsIYbL(E9YI39pIC?F=P1QZoy{y&_VvT9E)H7~atoJ?6 z8nnRE+@ax`mun46d<{+I)=04&kox(Nj+jB$CDVU0TRXc&=JnZmzmS(pHuf2S);_tf zTGQO0jhFO zKA=BmK1fY`$;k8B9(22%$jqsnl-KduA{ro0wcI>(U7v&y!mn{;Vs6*5=7=`cRfG)S zP~Gn9Ek12z$YafsRX?{oVkuu0{Cv~^5Z+W*=Ql1WtfsI1sD=Xbs_##WBo`Q>*ng^a zY3nUzr1?$mY}dxNzo((jjN`q-lJFKJm+};PEbUNnU1&}1ey{xcUKJ*^XQJW9G(T8q z1NcjQ@p&=VrIkX>e*>V#@0NDU|Ahb6(@UN4l5{@Rn|;}NOnSfo9zteK<99zA{}F>V zjD)3t=iRC*Z627fQKQS8i^qa3pu#c`)Us0*fJj=qK5JH!q}xB&_gcOJhB`Gu0`>8< z0He&QTo3bAqXjlq)%5J1lJtkg3EH$YbumlwzEQ>s6$YPkKW;davDs?y>;=xOaM*9Q zXf;J}{({|@7Tn7-^iEcoeW=?G7q%d9>Uvvi_)bC*U5wg!m9kSv(ebM6H}{O$nt>h{ z-#Fjma+cFfPSA6$5#&~L9JYE>#7BxObw>8~$m;vSai#&`V*c?M&q^tq^0>e!jJR*F zPYvNcH_qbvC+I%fJhf@?oivIWLQ9mhN)gzOCCd$@)c@>2a!^Z^)I@-g zrQeXBhO6+AVzNX=-dCDRS_U)ul&C$A!+?0>m}euKviDUQtd1ZCYU232M6@4u^7l?d z!^}0YnIFQXct^4IgVjmXyhQA3C-ePi7hjr%!Rl6o)}eAgW3|g4$!d2n>aT6>1P;$3 zSl`p@{>d58o3zmobTY5#)B1C(SvWF7Zs)*3v~|G;OP@AfVlz0H(uEQz7s44s(P{i53T!uNZyVZY)IB`F=j7Ww#b=S=CqHEZbE{Bl1?2@0nK)S z)J&;kLrC00RJdsyqX#AV8Rlp^!QnOf#?aM$eiKuXQ+D)dzuP>1G>E&MC(W^Eyp!BF z%Bfx*mY10t_B^ye-%9rP3OpSUrbb?E)RbPnqs*Zk;`jxUcTq;vEu*czoUV*m*znvR$r7>ZsM58&jx7 zHo`*T8_RW65qVKVkP3LJo((VaB)o5?^u?B@$gLYI^aYC0gZjC^>PrF@*ho)M zsv*1*zRGKx+Hb!dve^T}gdzi#_ri!aSL}K4K`VlDc_?MC_}PeY)(oGc(8y2%id~{n zq(3d2QvY)LM51PL&I=RkWvzyF=Rr>#O3rHdNzIhge{5oOq%}NO!9*)Tk-d=%g=5F< z1?^{Io3huosl@uX$y=hdY^bu=n4jEU*xP4AW%sAfff2%$cv(&KtRU+hj643rB4JZY zkp&Y=169JaeBq1V_W|fkA2tf4A|J~G^(W3`>-uBG>cv9Lp)b)ccBTWhj3G$HhGnK9 zHCpNtiq-UBG8YNr=n#K`5HEh2IVbk4+{cqyvz!mWL~1-hc_gXfF&RXh|L8U7?hXXN z+6EN{J|^@$-fwGdR19Ga%{G@$DmRyp2kDIexK3XEOnkFx5@P!00kX1sYVKJXGBk;* zLt2-qfZ(3deB2_Xu~Fx9sLL;JWD2i(YT)kMMP^jO}KT`XoKEKaB?5S+*&99{nfo zCes-cQXcQOoHI>b!1j0*d@pz^6CbvZqu_y=EJl>>WIe`{c&ur~-RCrqF2f3ap&+q# z)8h{8LM|t!8j-oLeed6o0#m%{x~1UmGle}0do6G0DUN!p5l(vWdP6V$u%r`G3XK9V zMi2D}5}(QzbCI&p`^v34lBJFLkAA`w%NCf8M&*~Q@N#J=ex~r80nPGWs&6NO#$oPx zZ#9Y@`z!bk%nNHAbW%y}Yc#9255A2GFiDrHv6Ib(9a*ttp`g+m75hKu)D~7Yc@h&> zI18Sqg?ish9-mPOn(ZAN{ET`s^ud1kJ;iEy(etP*9%7n#A~;^kUSdgWbH8cJlh!tD za+hnCHuGcR+e&<2MvRCXupJF%hiOU1w@?cCHe*mYt-_og5#S1ZIJmssusTT^FLU-3hhCp`pnzTel(4SZ0+E#f$YVgA{%pt|^V9buzW z=B{_mnC@+$dqu&y=VN{FRk5l1yQwo0TE#TP5}66eAm1dEOJLz1-I`_SY$MDTGgN)` ztSH`I)(MCGA8gLLu15Q0xw)?QP+oSZ|IkWO(?iRxlQnl*tsJOnzqTIU6R}d!ng+}2 z1n`qpBQkh~eHlq|WWd0$M6tR2L@SEd`D^zx?V`)a_rbF=rMbLxiZidq<)!OEc{mrd zX&=(Noi?(hU*p?T8=dOXLh9CQ!Ztd(kRRGw0WOqLNRjsu83MG~F%v~SrRMz4eab^W z;yme;{M6BY4$Cd2DCL&HYR$HMP?6qoGodil`3hf#bAO^Z{5qB}Ff8+_;zMGt9j~&H zuMXT+96-h%EsnkM=9fga?uYb0II4eYbhRlp*T=HIEkNf0)h-;y=Qx;q^-1pu^pqN2 z)F_htktUTYj*CT7a#$>hGA0-2-e51G11-)jc;&rbRcV_bvtNi|@4)`3X*e4Vb*3hJ z)?Yk~3BxKy55uLH5?Y(i4&W`Op1k+PO+(`Iyy7j6Od!SAxR;D*1Bl~NUqrA&+uhcd zUr5=3-TWOq6bs?x7{1sXs>ZF_0v6YYi0bn{qxy~BR=$r~Rgq0mFS&8)v9UqG3r z@gxJ;arS(0{0Pzf1o(WUuHW)ItR>{qv!&L0m_vbANyWl*BCC5MoCm&jt+ME2kk-ts zjyX4D3ivo@Mt0oFVRm$<#)S*Ly-n03Q^j=*is(ku<(HG`@nL1VBITH~XKUhWD~Pg25WRe{vh^85 zXJkP)YHx%!l4nrFoSGYyKUe#U zwg0Qi+dZC~bfM4#C^+L83T6i1zF;fguoS5N4}{BngeyV+kyxBsH-1jH+Hbd%TCmg6 z!rJ4UFhP8c^95pd%5ddFo*XcABur?^MlN@~s9wMsBN@TA&8@YO%*QM%I;Q6bH5TPn z>2HmIXkT%kBYPE>gJZgAfRsSDGsl$65@yBdSHN8e)QC$nYJ0s`!H;)HonM^{HbYlg zb!xT)UVcwg^Ux^^b380a4@GjMBJLm_na#BzX8pEJ-eBX^I@T&S|1BJeFXa{Am7eEV$kU{DXTGtq=9f-~ zKt9(g*V=c;3P`a(iw6XGvc=)3U`JUN)%qqNYFBg?1(t7}7CrOy;*oJq_l?Z@Lxc5y zfN5v_$|c#CtmjQW4sv$1^(cOCPg5W}6W(&b<`CXW@_BP31_)0QEjbF?E`}hsdzk9Y 
zb1xQsI(8fkxByYDHfZu65gxqzoh8kCS02Ct93QBZ4Ha_eEWODf`RMFjk^R&( zgz!N`gE@yGx5E)MR>hS*)6htGSj zP_Z4#0i{jDjI-l3rc^7m#q`-lS|WOwTr=T+uuw6K^^3ns>KoV1C$L*LJT30Khe1cJ^YC>=&XLgK$MIvPNHjD- zIV?2i+_j{7W=>3F1#>&xVxzV^A6n0tLe9>8MJs&tfS@lQrV!%>b*Z%3GRc4KmIn=1 zS)U3y39P1D;>VhBSWCdACA%~SB!rD!!Ago26!V#%<29Ky_@1+4mbV3(C654?%=KZP zbIO_1M?L7E14tV$W&XU8B-3@fM3mOdro){bC;-gGuh-`hwlV6?r>5Aa;)8oYBrvqb za6SFP!}+ZFf{XDVFA(y&$6~J5F+v+#A=>vnj{EW1#jc#tWVw5R9<;yAiDY&j1%j04 ziAuZNsM@qD)24eST&fIB)(HayjKn~0Ch$$g`Hm({oi{fWt*EF`UMP)M9-HhDoXk^h zv0w#`w5?V^ZW-oQ(%~<{qc6gVR8kg8z@248X4I+XjFt=^RuNMHrZ{&vnP9zWJh15l z>SP1u>rwZR$M;p+-KOWd1kA)~r7PAgR%7om412IsPD+l|Y`&j&JGHQ8K}i6BGdhX= zEhDk^Mej*b)o^_59+6kc;aSPaB3h68$Eau_^T@X7%>8%n+JrLbOrv-B4hG1UQ`!1@ z1?|h+S);{Nj#GgSmyWFc>%Ic3Q%a~GRKjV4B!&ssRY`-hr+byp6m5!a`Q1$bt=;Ko zt!3sp3GP1c-=x3Ar5%km_TCzi20gy~>UpFz?7r!qgSq)S9+e^8%o<>KiO?5b07%@`rPqo_U(f8w+KO_%eXk~G-3Oin>n@_2{Cvg%6wP003Gx}r;d;YoHjULgdzNjV`{7jR`PuyYi8m3=$!Y!wd-MBVHq zPJLK$sS;6K2(FRm2h}-1$Pq``Nx#p?!Np>b%f4*C-*H{tR z>O9&B122DZm3(EK5IDG2m*ICNe0f%7`>{)T(MQ3?yDtO6@0w7E9VU>=_!ly{Y=P zH3FRuQ?rvM_2$B(xpPyZm}K5L;BDYlT(j*xegdst7how}#!skckHYpbQjN=s z*x=S$oIt*PxR%*C{809bA1nf1wS$qsrRO_v182znVybbC@3!Sw&Y04%i1$sb7pb!d z@NUZf;EY}Gao59J1VV*TuE!#Va_oV1CdX0vueWA`X7=aX!Y6(Fj1^5J#EUGf{kPj< zB0=dzigo7YqqT2lzi*JQ`C1ELq-nRwicz80)yz_s2&r>cHwo0Cz{$;2;Jn{fyyQfw zTY>k)47O}Gq%bj9X+VO}6q7s#sJ2cw@ z!v@1?xP=V~iT->JUO{o>%{L*ctb9C7Y9_rXP=+yJW%YMFdA13QZS#Fuq$=}->&=Wd z_Wi%jIb!LSA1g{i|ig${J^A3G_Zss6#quvwtTv|)K5M7+|Hk?ES z;wyqq5<+;nQfB_Fh)UEM->?qv*pci;p>_L@uDzS??_!V%&YSx*Jmub>3jo7?h6rY)tAc|uFyhg69@Ep{v|`? zgBX{t*9~8bIvd3qrnD6#B~h_w9KeXuHAYqBe_X{hL$naY9<$Q&&vkg{p8P$$$B$Wt z19y>H=<%+P;)dGXQ*vj1h2=2gsI@^O>ktyrIIp{$sB6F?gQT$+rds^}=M^eGcSTLL zskPJJeMuq4&_8MoroII>ttH!pAvOH-xrO@bVYGH3On*cCheq@k7MdA%wr(+h4*w;~ zRs358kHmCh{U~JCbc% z?Tq>`iAwn;Vhn?oO{lzGp@E$8Ttr)0m44J$pF;iXqnjbJm?XiO zKqudakqs{Ycxf!=9pK4#skejQ3W9tc^Ms`twpeHK z=K?Qes$CJ|L;RWJ`g*8cB&%%x>m&R;m0W&NDnbAFx5+}-R3{sO#Xb>bjHEv4fT^-J zzqpN4yqsgl+V5N5fo$yhjlPk~Ln*@i{2MxL73CmA({Ij#Ti(aNP0l4#U*&9VQ*z+( z&hZ5?eQ$p!?p{Dn*(v${U(p9Sq3k@y9BbRQLSn&y%Qffl12L1-aCEyZNaqhavXb^1 zt87$}PrXp%qNkP!Fs5Cx9%Z5KfSGPLKpY4WW5xf3hkZGFEQ0oJ?EzM5;KgMDv<&M= zjic~i_WMQ5jt%8WmoPq|{NLIex5}_r4#O6PYHnSJxGKpr)yCYDvJ5nw&B&YdAN;II zH0>%9QcVYqX!LoQ2Ss*VD)I{$1D}-K2KFOr^$=}kPUxYFgVJ)0$vtUKUzD%~^`huR z_hxsx;8ol*jh*%Y?r%p{C%k46hos(N8qOaxw57YbWPY0CgB~H#DGfKXBoJ=;Yg)^hD){>-zmW~Mu|90d&$`~Oa_Nj<2&-LF zn~26jdxK)keuvmu#MzPR+O?u{jv1|=FBc-KWb>^|U&PR}1+cW(S=G3(wz<9dDTM+z zNA^*Wiuo`5an6P$uGY+5BYgq?mTPet-h8momt7(if)_FtJkh)A4KS?$s2M$>-QmXh8V-8&E2?VW z(;*_;=bt|~Y#q@uk?SzsdFs>f)93=>yz14$Z}Ni36i=Gt_8?4-*b3_A@7b8{?a zGL6kHuLL?)FLLMKYtTBa%K=@@^8`BDhk@m8g~JD(GdcJWKueV?>I#zj}X&Rg=%IlRyRy)0|^`F=G+ zNw%|AtGTxOdg+WEmr!9# zSZ2&z!cE?88tgu%7Oi8`)fM!w7!8O?uWJS-tMpj%apu21LQWY2Hjj~a75P94 zC0}&No^fc{Tj6p%u9W}d+W&q%hkk?yJw8Dm0z>oc#utfNZb+RJ`bknurt_11@ zLtGPbl$_Hwi%YKoYW4biHQl}svG0XQJ=N;{iAvSwhYe9_+)uh5)%05tS7+qZxE#n@ zoi4B)(qqV=S8MOT7<#4FP*9bYdz92XpJD0nuMw_YgV(B671*g6RZzEM2Q7W9+lk~`JbG(mNm-|k$HG5 zb)+eumne}|F(R8#*F#0*WO{As_x z{ki;KrimT88xo@De;cLsM~|R3-Mv2)a!jz%g}B8spcLGkNVg?RWY3KqY83UC z>sX?e|N2j@_wN|Fg$FH=+ZWi{zN+;U_&r=IP^riV!q~1sUHdu2-m=Jpc z1wX>12fapnx4%8^yJpQJ#PJv^O0I{`HQxTCfm0oWDJ)#)YytIhU`tkQ?j7G!dN=s%C>*0dnTF-I=v@K4f z1)Y{exaNcJ?wvC9J`x6n9AYPWA+4>z?93xUYm1$Z$5WPOsY19N9RoL72 z=L>NW4`s+}Uz%MATxpy9wxBBiA&XDW?FLxtw4*1(PoO2_T_~idHillbb(VgEJ5ETS z#~vcve~o2vb@=hBp*E*5k)pa2(iIo&AgsfqaQ*=+cJD8t{Qa-}M^0=$5TP>C9b{Td z14+_iVH8D6qll~I!t{yxQ;hgRFyPk!qxQ$^7?ev=3$>>1ipw>7Fr%1E9^mT{d$cXsX-Imf%A#1Bus-AN&gBE z_0#{5qaKoGjjr^09aCtW`w{Url~N3swWH4Y)H$AR2b+sAYV`G%RY?2!AXVMG&yB~u 
z=U+lhyTD$LU3=Ev-ZqbWn}hH~CxO4>BCp6Q1eJrdlE$T{J~zn~#zsWeNy(G<3hu}5 zh2RA{18AzBPC|yaDFdiw-~OvK3Hi*&_&=Y?Aqk;Hk-#~rbxojMB4muf{!Bav1Q9k*r~8@-%sT4pWBiBu9*h@4K>;%<{I0n?(WB+-h3@&Z>!Z~n!X za$esg-UTnZccU21ny8~`w`TPDoXth05ja6~cOyOG2f+3;e5YdHnxVx3Gj_xbPZT3oBxF zbTRxl;BspOtFgcBzoks`?~ePoBCJHJkMB!`Sf>j;_NN53-qug@T0zvFtvI8Ooau{% zn6OYS?`yN|85AssC(+7^>*OIYmE}MORijCficM&Ue+G~Ep%#0aBP?6udaa$B>3*Kk zwsDG@8Zk{vJ?cRls*H0D2_C{BHm;nNQy=!AULZN7;4gKd1@9KA6 z7$HkIue*YtS?ceCUe*OR9+MN*pP!6+1)yMgTJEdXXBt{+jXNHKA$Yp$8?zXo@k~cE zyyM86HbW-2lF{1mp<28CZqq4xc2~SYTMd{ z6#)wZRuGV)NRjGBihvYF=}n|p6_8$}MM@~T0Tt;bQbRA&dkFy)0U^?B2u% z2qAD6`|N#=?f&}T|2y}8eB^;7EY=)zjCZ`_9dphVlTqd9Y%hY*uRC**ZNvJyw^Zf9 zny|J0YptM(lgH54RS?RMmlLLTH(^9%Izf~z6G^99c6Di9xjJQ0c(PHjE=_%K z*yD|cK;RL#PV1gd9u=0+F}CUR_1=m{Gorg0m&Q}4!ECQOG(O4Fe@Opnb$^`Vq(F-2 z#Uh)eTWY|OBSF}k0FlDpY>-HpTaU=R4^!(eJCx30A2t43$Efoga?OWBak+_p;VC4aVys8GWb!ajt)#AFh0z^5t8kg@BnXCNix3 zZ&u<+Ix@rbOa-3(N!L)4ojbUd{*7mju4U;=yhZcH4KWk0qX|8)J+SR*fdpku%|8;l z3mKazN8eQ&9?|9`cck(KKeb`LakG7Ck~fYa=LjX|Jx00pB{_L+sK@J1XXv9EUaJyk z69ZI{<6=No4!Q8B?;#M_wpNFeE_%Ycu2+!%uaXT|r#&9QF4@jwX{vI}C>4~#Dfw=n zGD|5X-4ptYG)uOrjv5i)^Gu1rMQ*_&YEvHLsX8r*NXrvu+Km8Fy#CC5?(~^`bpO z$Bc1EFH*jfJZmz+Lbz#H2YL%Y`I-m9u-BpKIJ=HVR5EI5VMZzMK8zBN0i=?*nv^M54X z&6d@FORd%~lRf$46FCh&H8tygU)+NW@@L!6`7=~&RnGB5{*Pn<&3_Q{+cV`Cc_~hs z+JoY%7yd$jj`EYX(^YN{MsMd}E>fm@{mJq30oNWbec@BEu1t~$y)YR2?q*BF z2wQcBxyY9Z8m zD*6(>WEiDC;50|Z{vk@?=gXIJ3-{hxoOpzF2N7?4NdPqa96X!JT7QM4Ue`!bf_JYBXAUAk+07@x>eKw&*KwoYum>--(<$gG^|j`YX5FhE_H5=9``CaVY&Y=Rp9 z77wm!2@oelvF^fhxa#B)Fg7R5YZWru?j70x80nE5ISPDw@=-#QjM7?pfPbT-B%ro- zRsLLU{oglzK19t~_Fui@(5RE-IkJ6ph=uu%HPE{Y+B%V59U9^G<$f)EzT#Nc&`Uoi zak6y9PON>w_;m}_QGdE%_weIvm3i4VXItjI<_OdpjKib#(y4# z0`*jz%X!`hKAL_KBiG6s+K9Vib}6btk5cN-Kvso(k2IfmcXSNyTu3CI)5_I|=dN>a z?|LD>>x{UySudf`XlE|lNH=@-46b`l2_4}Mg(2u))b2|#shjWpb|yYjB+!O2BAytS zN2wY@)GT9^1f|_> z;Z1>GbWP%#?TD0kYW|m))kkyknQYDk4S&u>5EF&|{kZ6>+95dCT;MQ>I|V`(mrTU{ z4{_ZtK7PJF)M}mqWJE;KTQ&t<1EF)P@z$%SRv?po2dwYJuCvf07XCO4Y`mScEv)b+ zEuBT()L1)&UfI=)+ZS&uj0F(8(xi90Y*}DgK~nMk1%1$TShx#T$ufXCImkh!~hv5 zaE8SHfyO>FH&r-m_KyL8M~b|Ba+*2g!$gL*6!d}-Ro2Ua$_mG`rn1eKOc456*?suC zVoIw@llR&UVjo^M+dM0fY!P(o$P4R&&EaiD%0 z67I=GT8btq&zrD0MNU)#f)roPdkNt;72fUzOtpu#rtfgC0B5tTcgAzirC#*HCJeAm zS}PVFva`&1gnzj?VbF8}Sc3gzHnF?AR}HO+@4{$S5+@#+`)pPx_bhnkDM^FSk+(3p z4qAZdgzuVNupBHp57bwdG7Wg7d$Zev`8+FAM8y4lFuS^vPZ|R`Zd6X675)7uLQXZ`%unr!vYj@J1|N9Oc!9X;n>` zpZde0eRqt-AV3I{0Dtbj0-e+qYO{=p4w|E*F%btP*W}h7dcDTOO_A}(y9oMD?dvvl ztlR~`!Hc-ZA|K_>W-pLa7)wIqF;!qs2b*?|we-iFFQxmz&RiRRbr6ENRV(IaCEd>3 z#Q_TY<+SU1F)Yj0?<1~SPni3vDLR|O;=8S!=gT_uskRksGHvhAJjbr6$=!MT=;LXv zf*W5r9O8$T-^MaLx8{ZNz<87p*8C0njDn)kJrBAEUX%>qzJyKlspW)13SIPB>>{Vs z`+;8eO*A&&7AYok@MPa+7p3Ewv->{zZ59!rXwUJg^sh6#zW#5u{+1F)_LZ(K7Cwc% z)XRlvIc~iin(W-y>LWtnLtSW*C26aM>>TvD+o62Qobp|usHi2q?vYc-x&@jQRO5k! 
zZdzAw6fao`Ds-rwjnew6g^{r>r|ZzA+Z=i$^E6adkil%oW$KcNM8Y`) zYUri9&2c+x6MJ{hz37o>nS!zxwSNAXOL;k+?^C^E8fPC|W|qAvg1?76KbO{hL%D-r z>qPU^`G4w!Be3ra9N3AVwDTUYGUG1eAI4lGvwtdBExf@l{4ZlXdv<_lQaTKm@i8FeY%Rx`<5@EQe6+9LZ_;ot_(eMq7JgE8|^tSqWhYKd48~2?(%fw zMy{ytoVjw_xl_z?>vYd@>QNEZD(6xQOnUa7-ex;(NTW~LzLoY?tvuDTXM^jpywr(p?b+ii8m{}R(kvr*^Wl<97(R#+8V zC1*yQ^%|u)_gQWM8$j(uGB^IMt)f%rudW1&lpKukj^@?2G?|N7LEXFI@z{- zbTk;>G@LEk+@qy7f4OOjz<*idyth#D-NF=o7FNWT8;hjIX1LzRYc_=?-KtNwYX@fC zRpw{C?ct{R^uYsj8rrJ`_Q|k&C7dE0CzodKoOs3J8hdH!m|M<#QBQV- zY`0ceb!wH60Aml&JQwXj$`kSj{_`bB`mF0g8|i44rj?2c&Rk8gw=|I^cX^W&uHxI{!i@j>)UQ$Msxh3ICPI-8%i&*65nmiK8q%jPh_&nbIN z{RJQ}x&kBRK_I!!y~QuPt295h%E>BJXs08mtjK&B+-@B1Z1g&lqM6QfG>CBWJ&vY< zv_1c>QjSYaq69JAOy;jAjK2_A{yfejyYce;HL$Vcv(+N%H1A*vpYvJ%H5NuD#xVt9 zjMg4|2jQ-5As@sGKfmHmp~u`~`_IV!{oRZvrD+Q?RiHW6m^bVI#gVG?19_GHOoTlu(REZ(u*8=U-1< z@UNv_#r1QL3HK^pa(o6?$bnP-^dYjXz zj>ri|N%BCZNHmEJUdlKx|63tE5vm5DW|oI06Mdx(f2?|4{DZ;PeE4Ifi=n>mZPp55V~tJOJH*gts^(Sx!4agZHKuqC$FHG^$Lv(;rn!PupW=aMRR$+Z z%x5vCK@g>lybfVu712fz795;kiOuKMcE`|i4J-xh0FtF8%{i$3^_wZzWnGGMgSWML zWu_t@H*L9HZ?+&1cZTv9lR}7Zi8qOJOY;dw0SkZGej#XJ+_S9ChZ)?dGDN9h-nxDC z(=i`N(&kvwsKn{563ohvE-js+IKmMKSn;nfy4)uwKc);kXRCAkB;x!`%{MpGQHD@d zk#u?A(4t|1X(^pcmG@WAYrMWH$XcLVx&G3>Dns7G02@LAW7uW5JD@vgsdt!IH%yis zB%VlbWw(%Gn$mpnsbz=f*yvReQ6IyGN!DJUw?=z(Fav(5`Vf6mDrdD#!T4m77xQG= z;B^otg*wS1ex8U(itF!vpc=9X*5{4{>xW*DDF_CNgBNu#p&E+SxUvAxR(#zYws~_PlStm#~E9kKZe*Yx$F;6g0kvw}d^5ua~Uf zA)wr_p>k0<$)lcHgBgs)>xhq90s0X#gt~P)dYCj6^i+uyz&Cc2(hCw89vEw8;m$IN z97H$WT@^Z5!|?BL2OQJ*P775EY~X3dYn~8yq*$kwzDIsz;G zL8G^8ODcV}b-bd(o5uIz8d2)IcFzKL7m%fC)E3gPZR_508>`2?cJ*!7b~!} z>Y*}R=j@8)5{uL&&e*(l=f$jnL5a*@fwAn~$2I(yR)+cB7>rs>+LC4a3xdju{PPUc zU>9fx(bj|oUuN5;*U54H-JPn_i{uJ47CI70dYT018P^olN|(%>v7nr=xgmGDhH-Ha z=D`l2&S(N82V`~2&7rU&xA(C*G;QPtg($~;(h;(X(E(!%NtV~svXO6^T6d@BH+hA$ za`w&?@|ZIAkZNKEcBJG46FWpcS=a=uS$gl8Oh@%(oLs`PM}0Y+_vv$1BllinodbcMx6eUIn-b+s7p%lG~f#Yazh0GMZe_ zu}<~%6rH$okkNB#g{{wBhr!i+nBUGo3`wWreF5-$CysPjWePPeGq93+uOW23N0qfL zh>^nc#50O~=7s(}Uur^U#?|IePl{@V{45qqn|U#XcPp-JG|^n&G)Hn~k$M)jQ5|BR zufJ~2GZT6^)cD4R#$xI42dWw!;_fo5WTIXowCVN14f^%^VVr|2*u2@I2;+10+NC+A zwD{+o?2wYQJDYS{(vqz+hOAS}N%HALD$%l;C%3kl)N%#~3grl@IlkGOeJ7XLlKax} zaaIpSKQ}g&WbvqDltj45%yHO0X=o)A+?9*{VGS(61T)_{*WV`Hg!3AOt{RMr^)-pe z38x-X01i9KKJWkOPGTd;d5QHru!?Ae?0S7}_|?TgdH$n@>PUhm2BGHD?WljKhZ8HU z&+l{yeEVzIoxD6l(Nj=j`-)rof+Noh6Ku*5)(~5bh>elTwahG?Kq=`%kuy~qcAC~sB=}oJ!mRU&3ct?zV>R6}6+J-KTcLF@3ynF#G%33hkeD(CUY_n8b;$M&COXjrGNw}qZumFu#vxRJOL z;Y@P{6N$a%Xb*C5`9yLdy^M2qiFC&0p#3=v{C85;*&Ea}BPXMa z>!n~M(K^Q)rKF7#Y)7`u=X<*GWXzLp%0Fx5Un!*jDIe5ZLmJy8ya?CwnCl*0FPWC~ zHh^yU+4)7TjH2Zt+3VFpxOqHp9b|2o0{g2~4Mq(+^P5H?lcxKmD-F;PE5f!I_{u()P^0b#BQwv&~m9ku~!p z4|Nr(`RE?%*?x7v{Hxr?5h!0tbL${_>Jcc$2JCr5%R_6)`yT6)$kXE8yHovgfFpCI z-w=eU@m1Cc@T*A;^9lod)HOC z+oNI~QfKpoW7*1}OS1ia(lx`8?#SC7ko)W4dkSueVx41=REdm&@BMB!Uo{U-&;^{xc`8R?dHd#&(=%NB!Qje@Jv$}%7A8kI( zPWvh8JDbLO(~b-IZ6>WBkL>!qd&v3y5}ZeKdHR;GVq?uGIb3ph26e0~Co75+UI;H$7KN^tpo_tfQaEAIt@i3vY_{8%>(zfF{q8$_8)S}1uINSQ z$-?_Y0eoldV%HL`RR?N{$Wa5w`T;k68yzF)LSEarrZHmza5rl)BLWl+k%JNr*7v7Y zQR4au*%q50`-8=5lKC40oD&4o*}tCXGG8jwmioMa$n3*(bpCSsU9`Cr$a|qpJ}SNd>Yxn&U2Av%RYs)_Y{ra zW~p6A=N1(U*nalD{q#fFw#<#t*P~r4^^4>z8_|}XsHtkO&bDHNt$gFO&k6VuQQxDv zM|AaC_E<54(lczt>glLCC^5yUF%+8Z;)FycuUk{!9{F&`Ey^U$8x zJ|i7Q*B>M!L;%IJsHeMYd-jQyvw3_n8XF(0z$xWXht(>wV189Y(-}g*#_zT&S$|oe zO>A&A=Z<6>KH#xFLFy)8uEc(dj$^>7N_x*)zy7*pNxr1MY+tjfA~TD78Hxl?7_HJ)n39K~Q(2!FqYp+s}CSQDvEZmYigq zq$=(W(9CHt(;lN}Ezo53*T*T=z+&8)4QuYRkcZMdom;I+G5H4)J!!X!x-s}(MzK;^ zr9P@m5h=3~anY7ZcX^Vx;&zAT5gUYs->MYE!8ba6iBuaew?CV<_v 
zlku8rdAoHqx*pY3k^%=F7@f{~kWfS_w|!6M?R|K~B+FD1jxu(5Rx_bdB%9jrStv!h zg_+(7`H;HzUJ$*oK3kS?pGgKEwL%^!T`rrUSA){*g7k9PV+Gi$Ax=*{UHXKkk%$0avU_yxyx30gQe-CUip^T|Rr5ANP= zQl)5!MHCfx76YdsZWyt`Kfbd|91075yieZ#6ZFL#06X!z*x<7MDy?V)ow0Ouk+|e8 zlCR+9oHlG1ESM1qd{&+}-KO2?RZsQ25$pD5qcoFqZ>+{9@^cfR3tAn)u*DeEPP3)_ zf(le6!OEbJW=$Q9?!gVT@BWQ{K(r{7@fDJr~(%WW|nF*MXHey#{x567HJ= zT70#BoON_&O_rQEH2?I@0TjlGFnMbI>OyU2sm+T66k=yxK`l!*F7i;6qg*JvuW5o! zuID9AR&sit0hFecG0whBvhW7Fi0*!|0p$JL;lj19(dGp#AKOUAYm#C|wu_^>9A4b* zrZJR#y0rHJbcjYHQ~Fu0?8q)|F~s~;MJ|a0e&zNa!k&yV-b~uHsx(4of!INyyex7A z1GX=t>@M>dc7;6(3qCwinP}~OgF9`zN?ms6nYyj3_3YN9$h}spP~p)OGv!>;wN=Dc zf$t|%EvUd$9Tppf9kjWAx{Brvpzs9gpH^NvLPsOv(zP>;aK81TKX!Am$uAOheA2gO ze4)?cYZ^3WO(8Vnh?74>S8l(ohiFK5N(i8@8HEiO*do7g0HG0`Uj;!3g5 z756zXZJ*iT`n}%_f}0A*}j?6+l)onf9_ZSjz8;A}%2?dZftd)4470;y$IPz|UKJt3T~zf$(NH;3TJGYs z3V9bgV8acvmj{{RUiz=AS&WcxmO5IkWS&7uEcJ9^bw*C$+9al;fW7`g$Z9>v!#C&; zF?5}!y&T7+?{Tp1twTHWsbPXw*qUCD z;Zwf7b`nK|wGS^%eKyWA_1~`{ts9L<`d-rZ7iJ#OD=kFiV3hm&gMCNeIDZNv^W-mqxdiIs{kqDK+xx_^-TFkK)lS?DWisY^&@NN)5n|cmX~Da0o4~OmRD#X84`kE`(qC12VVl7@ zwD*&6CzG_BE}b0dwl_{LK+m}mpKB`XXNdJ*o~zN5b?D9;(YYQ40kW&||3G$qe2}E+ zHMe3^zqUq$y+aub-FkVxujAO~n_m@#fbenq-qjgLRyOJNKLw_I)D~PC1+I=-K+55H z*7rW8r%4-wc>Im6vle+Mj;y#H1nJ1+k;*|ECA}uhx70e*=Hyn>Twva_V94omVlnT% zgXb`{B&DqLcUpLrJ4-8I8KINCtWG&2I18aG)ky6#N7gX=wW>SKB`PN~`O$`AsJk!W zOSF=xVUZkiSP$H&*-kE6V(ki(aITY%9<-dpQpD?8Ogpi__B!(}ZAdrgDXd^v2OE5s zhooQm9zL8i8nQ4V9pPl_vgrnk+wtcbPpqG#VQlFcwFg(jc3w19(eBmmyhEqOhXs&W zjrwi0ExcFtkcF*}=vR@2Z^Zo^N&VixhWSkeYlLtp=CKm3mTG#Wixs6l?XxQQ<}oCF*w1XWh;q z1KnM9Q1r6lU&@=I{HFC~)*5%hLZ}<2tFQrQ9Q_dtjq4}V*GCZDCup%;-JGsy3Wisg zb;JDW*6y=HY#a)-gNlucJ2hGdCxdqu~s3 ziN`%?^@bSnQb(d*Ip&bzyssy%v)_ z+AM%_Ssl14ykpkJpxd)&cw*dtqQWUq&it7cp6Iu`<$+&@@Nw43hmUyAUWLX!=aId@ z1m`3zV@ltL%I!u!^YsX1U9jbVcdhE|lSOTrAX<1`vzicmBe!vH5~uvo?oLR=`B;+w zM@TlTS#JC=N*2QS6*mFEgMtFpUQ~Jd7Hfes60{cPqm(Xt$cdA`R!FW?t`v596npC~ zjpG@!TL*$BFW);@okMhR+i1mM@X1cRPGIp*GJt?O$8%np3v_2yQ>BI(u(bR=gJYC$ zK7G!N&tx!eM{6XF@Xca&b1~&=(_^o$Zx!wpznt_{#Mw_px%YxCdYab;7RuF@PO`ml zX64@pRpz6%W%Y$7&S@m&DI&rJbHu$+z^H(FJvi2Hi%F9mnatB75ru#+D<{d zGbzotTcCAISXRiP&E1&w<7}=9b{2AbDJ*GT_`nXV*UaBiv6(>Dn}zwdWVel#O;4#D zbnR6UpBbjaI?P?7dBwDSLT#A;zl3M)vq3wl0lqkCP8)xs!G`33Ld>w-CT?#PG(YZ( zTLvA3UG4MVqSHR>!gu395Z}=;&Oj+^?6{scx^(xZJ|kmxVWFDlUY2m@N1n7I9TwfE z-#FQj0mrdxp;@BH=PWRjdt3P{&1f~|cSg}4wqKhPXJ8`kWEoHwH?ujci8V(Iuxol$ zpiCta>)5oj3$lJhcum)HMfi~A=JI5Bb2^Xvx=|eVVxp^(K|c2YV8;E4NYA01n0_4n zlwOX{#-H3A8^=~#R9y7lWP0~%{Tjh!TZY3Y`@ju!f{($H0EH9Z3A3Gnf;aQ^4x^to zT>M+9Qr#;D-BxS@ZbbNt1Y|h+H;?2Vab)<~>{SKwXoIG;_y94{YnRL>%}kPCHJhY^ z76WBBJ#-VC;n@HM6zDMNm{36eetAwCCh&2p|;Gw{w(dOvgR+_sp$$hhVzmhhEm&y z=SL92@3+U3)S+@bX5>R6q8S}R%rno0==S=}L7+=$!5eTLJ|S|y$yL_eqI_NzVU((34NihGI7r)>849Inb*UK_ zQ)S9mhQGJMwFyp2z&9WP7b+pbo$V7Zfd{S!2+;d4A*9|;aC8V&`}60s?;HQ0g!KC2 zy-3@%)YM6R1Gg5Wb@|gwI=I?r=cQ#Cj<8zuxRvotfe+@7K-WQeq#S;k^n;6dIZRhIVK`` z^oTO)QvdqXN$T=@BdDMVs(2o)Nly4-Iz%`+;8GzCjYv+fl9`3YsAbZlrT&>plKDNx zVyC8M_7{Z#VaY5eZn-01SnM_~+p&|j^k#_tX74CE4>ZtbKlF%a71vW?LSmv3&C|JT zWMLE&Rto2HaVYctIDR&|&0BHq#J4ZQ7p8IY{&qSz{u!L}Prh=_SjOjxaDcGRY zp9F~Y+TzuQ$N_R`OzMH)w z?XAPkD!S6dJ*MYapJIM38F4s7Jj*Tls?*1}iU8ynQ>~GrcHTK#Y9fP)J^~LyIUNuP zH~fTT{pVR4kt~ozX>YHpje#oLHhM2MWqSfWd4%@kZs7<#$NwwGI3YiyNmYR?N|6ij zHxc8a=t&rjNXm>ALf|3u%cy{&UZ(A|G|1A>(|k7g-FuP8!>0wGW`*Xsxau?vf_+-z zyxy=$+g7YG{JF3XYjON1%#iG~kmBfvli^le4!u^UW_<_nXN_LfXCFjwdQ=79UK%ru znfmBE-vVyt<0P@YOyU?b6%r%4Ej1CbnHd9RILH~{9^|9LsCC0uQMjDf+@?grE9@dp2xGEyJST`%8tZ5A` zt;1kgW68)8tRfuIZDkte{RlYzTG%Y6fCg>73j>ZuT}R>x z7spJS#3cU)uz#VLBqiVRYiMbxoOdij0I@y~S#MdAWPM1Yf)BcC%=J(DWInt#0Ou6i 
znvnC;KGZAYGwi&Z2Qzp&`0;8cIQ8Jmwkq-;nHpq%^-Vi}@R`GrT|`gW?u4*;Jkcp> zNU?5eA$I%WlFBM^{pKo(AYic*Usa;b(o|%@?$Qg0jnL8DrtqyH^p~YCjs9?rIku*k zQ118*Yd`Pg5Mh3dMborwH-t7XL83s$LMdaTN!@Q5K@@kATnTkSp@EKS1$5h|)jD}| zes11xx6I691VRG=H=$YuVD@lGaaG{{dUnMd^t(iK72asUO6_nPc zZr(_b{_q$T#eEJN zpiA4c_o>6}Bd&?Z-6!wkS}?tJ7EGxFK5WTIE0m|CnC*K{xnW(hck=1enB@^m#xMz! z56*d|k0qN|@>Dj6p5oP`n&W`0NLoM&^WVv}tudIO3k7*1dremD#=NdsgnJdq`VwC` zm2YyUjY8bduFODHYcpcp?lVpAJvJcMkUW~t*N*AST0FX5>f&MoBqt5eU}`HDSbJ;j zJc8)cnwgAmithe%^=eZQ)Y-z9q?6&7ymSXD;c>2hBj=cO?Qxl|d72gevPi&m z5JoSIcspd`Gm2;NvBtfe%Bjl)$zdWjo%}f=k8)PGKCsu3=Gy(LWs$-SOAB92n_h8+ z%$tI5B<(K)s1TmR;O`=AgdtjckNzm+wZl(8Ms}Yiy46ep8785l7d6CCz}Mf4L(4la5L7su3kL)=Jm3B#p$e3J$gm4~W;)-ElmF*e4QB1Y=3 z6Frl?!?LFnc$CY0iklvj{gI{kyZp;&^3cFr2(Y+g!`lfS(1xn?Ko|2nnl%d5n(sY_ z;E9gl9ld-RXz%DeBzBokH&hsd_!GRjf%I^P_k^P6@mm}Br?wg`YPxoC4|Ket3;5|N zCDsVdP>fcNINOs`XyWT0p8#_9pAv=veH#8o+Kg z5c5MAJVSEGY8&SGs1fUY(WX-yiM+i7s)OP`{?bW;< z@p8Ls-B6{&_&qLG{B+`te^}37VPveFZhz3Gw!1u5BB3VQBz7a-+sIGGAs|?g@~xsje}EFSaVeZH;vO zOHF1%?d}G#MkllW=Fa$Mi!kvVbn66t9`rw!@yJHaS=sf&5_4I%i?d=uinyhb&UV` znw7r3mFN8%`VVcay?s>ku2vMaP8%D0fhR>63oG79IrKv4(Q0ShEf&G`54=G-HswBD zW(MJe*LRfa$;CEJ`Y0hxDSTy9z;3Ge_y~kVdR5wEz3EA+n0jA}MwQvUl1S?<1srf0 z|0h0aA*OYw>1FW|G#DySJSJtpONex^;@8;Zzs$;YL8ea_-rBl-^&;8Q5@@{3JXJ`N zEsHlmU-Lq?zoGa#Y^%xuh9&7!=Jxo59+rCf=&Sr!Q({qani%Dr4awK1v3V2r??VBn zT)D*|b;pv+gWW@k=Lu+l&-VvA36x+f!Yjr3Qqr!+jyWqr1qDq{e!9QEB!yQ^9;E5N_sNSV zCI50*xcc9gj^Zs7oXP4C+;s@Co~jWl7k$28QQr^x{DeW;;}PkM=JPYumbBlDFJ68g zbdB-B7z0=+?zso0>d~>KW*_#{QCPQ6twC2K(Ce@1KS^N{m<~Y{k=4jcR+9MAc!SMi zrN+*%eGP>O2zfhC;aOh;W|bDejKA1B?Sd+876fA+0|&KVKXv!(gf$ipXeizi&XTB! zPZVO(WCJT0X7V8%#b2>h^dFrnKP`}6b+xJ;AWW-&5hky+(vaa)fM;9dK7o&-XwhoJ ziE*QOOCG5HaIIP?awC+%zN2*WE>zf*fcH=8tweeWfGeF4=uu)pY4cc>_SDU#w&e%= zLco8N%)WMt0`Vzv@$c6=ev@JUt`D!)HzNJ2-PB;7?B>d{;zPcpvZdK)hgJ>^*5K{bWWB&U>Utc^uWZ%MBA%fc9 zlovn0w7khNptl}TGa)X1>?gziZUn%kCq9}gn7&s0?u{RR95)KIxmJsZA!@$u`25U% zVrT9tuQJc-{Q7?TdMJ(;O!nY}P$9oG*^qXi^lwku=i|R`-T%*ykj_TIvSFJo&l2tk zlZS}MCfBJ2m(aYXTRvN1DS^FvFUH#rZ5j z_D6K_mD(z1@o_5SSo`5vT?xA*qQ>bH(>OQ0n2#@?Q!_KlAY6)XHGmay`=AZG?GzV_ znUY1)f1>)Q?h4${xIUouoVImFNxY){jiBKwBh#Uct8c=8=1bqb=EPG*X6kXO^>#e* zP3V6cU4O@hd@<#bYon?OK3`W@Ke{$x&LD9{j+u!Cmn+Hp`QqoH_0Jnas)2#-#a0E3 zzl<+i_k$EYPP^2V-rrwx$n;*;g5VkF^k^qWtX@@aB1!)>PTFXCaaCZ%zueQG1<^T& zAMqXk04}V(Er=?FxWmT_a?W~kYSRYU`!nzr6vl|Cpv;PWVwSmSgA{4vNF+1oCPf#m zoK*u$5`gzPC66yXfKo13r+zwoIv zRa9v^K`$#pm1NQ3MwxBH<$)R&C*!I^FKJw$-?iChk7H>~se~Se`d6^|H6SJ$Xb|Aw3Bith)RsE;3n2!Xm3SZ-7u(4j#_W8rU5zKYtJ{^!#kAzL3;*W1ele?320RGEQz#Vptj(l0*?bb+~B zSbK^I5&hrvk$KmP_UQiWVs>n!)3k)6QIWqq&|J$ zjz!^LU6rHn(thr{rbx zGWiF7{R2J+d7T32q9%?T_2ipO@!gF7_2bYT!2fYEUs^GXs=vXBP&5E~qng_sIer-} z+ARAT1iN8(YESi@T>bBzYTPJ28hx4>=~eI-pZlNZm-VZlYor!;R%!Vk$JIAW5q=%o z2e=5_>AzEwIMH7lEx-R8)yN53CxC_*lXv@lzc&%HaNqm;vw(>%6aR1eldW2iBHiiH zed8qlcZ_rLw;3UjAX29^YQEj4^Mk_!lpsHAAHKNeLPz`Ke z9He|L0FY>j*QWf#zb4#As^AAe*J{ZoAEeF!fYF%K0IXe>B)C)sf%7D#wf| zi-G;G3EsFlP}lx0@&FO_ySEDH782wBzCUq<{2F=AULMhkRh@|cP8a~PMRPf;r(@!e z4h!r&VD^^H0CI`XW>URN&URDmM)*|EgH=WobZzS=&>i;|t0`sn)E(cGN(iIqfmvDp zF6uk7>%dkeAiv?*C-M_o2p6gg_4qe_mT$g$4e&q(```9Jpm-Ut)8YSB#BlTx_G=e{ z%O5Pyei>dviu=0VZ-zHeH~cne0Mg~}IyO)?{B98bzc2w%xctln^5InC_ND>l%PdH* za(`sQehtr?Ei_w%JE?9|2Yfbe}>Nj`i&5BOsZkvHzX$l&@$iU)>T-LBhxR@Dp1~u z5(t;yBm)O}{~HtosIGq-*nXc4dDIK8{6JjN$0=A?jI-F($G-g84f;ecu+wPbVk9bK zC%d9`J+c^Y?I>&Nf{^ys(-{Gp&6{sVPeP5;42{j(MkaRGyj?6fljr@|w}+72}Ba!nvSpq{x#Z+7&`cNEOlD~Nge zW6|Nr{=IIj{2m~SQc?JgTfy&pko35~us zQEmnn1x^9W%xj==vXlIB>|SpUr`J-gwH^;%n7nZ_Bc4{%WuZwgHurB`zww5ox8IbH z7SNd5sL8kX-Eb>0kgl%Ebf$cLLA$&6CYPt44!0ewP){qPVM9G%>+hf+(x`L^1Xdu{ad-VkCMy*B3wm$ 
zrtH)2UCsQa*4i;&>{M3YVM;tC&1-QGs4@JbU)_0OlYDxWdBIXAmwHdb-3Q)b#g!F= z7n`klNYSd=dHd$KHHak#9nyW)kK0$^x|1KbU8k05Fh?<)_?!`vU%ciV`OC(Z7ug5e z6l=Ew5-dUhYi2A;1wpxk16Qx{$|$tPjM(jM)g5P-x(IG<9gr3|m8$Tb(X};R$zmOR zHaJh;yRr%3io>D~e>VkOw12a?c6w-4sAbSn1Qe9!Us!VE^bLm(7Xw?I8C)e!Trxdk zTA*_FP!EIc$Z4AUXz_{|Ba^u5d6O6;isLIii^B1X~!gUW3$MwGn^T zsLo$QfBk`lw5-YCyr&EsqQqumyZdxj^_X!${DX`JwPIo260<>v)jOE&*eQq`NV~`x$KSv0CJ8%qGeF!U z@BO}y16$1xqIJ8x{NXXnRRvV3N}e5$)k6r^FOfNwa%}FI4pL_(t#mQ^o_ri}e(PTi zzaSAx@#bEKsh4T*>b4g!Ygmar-G|fu3JJm~@w8-E67#v3v~rcD=}Lo!?d^#-DmE^q z$;<2*5#tLqz)BQnVAH|@uvvu^-`nlAik{N<%O73}Ox6iWnD3OEl9F9id>%m8SsT=_ z+Xc|*Sg5Z)H8&4y;!p8*c^>PCS<_bjCH8*U4+Za;)oI;kT@zwUuf*WX=IGQNGketpi9o4znyvk;k9QR(u!lBi*N&n1(D%cE2387@&fJ zcAjPVEk)4F?YPbbn#)X}Mqx(xkyjbUIuCogC7w-e+KKsKPAsQRoE|E4DRJm#ZYOR2hqShK-9r~VgfZ}}HxqqTub zBdyY{C?VZaLkUWkbc0BTbPXx+NQ!h09n#%HcXu~KGxWeP#2KHx_xS_Pdp_}nf%`Wr zueH{7-McHkpFSy9OjK>&j18O_wJ~f>#6h8!+Lo36?`JWO{!@Y$Mo|I3YGOx+@{^2z z6Fwd;J3pL0%2vZP+e;MgV8<>VRMB)I9NS@p3Nc0`%ZsaJ48L3tVw791Dd+CFJ00A= zE+eybtprEhtcWw;zMc4@qU-$FRV|X3!!(`5F|d`uT3ybhE|^I6^>&6r=sE`cwjXd_ zn}>y&^NMZ`9g#LQvGd=743dL))^9qeCPLZ*8$&ARAo)LKNyrzxI<~%{@>Z*W9MDe2 z(q4amTkIM`MN?2C!pw;2Rur4p$$coOkVUsOkq{L)`_?2#Sm8P#hi|eQD*3+!1 z3rd^B)girNUv}Da3#Ie?0s}neQTyLb^Z4C6G~hcteRYwEZmVuy7xsU=ueXmz9m7J> z_&k`7_vwYtxt)h`WKPSeZBOchbf$H_#My*GXsNPO|4E>NLq@$o=usH zs%#-X**)aCax6ovD}rcG)w0)@Je#+|Nr=gx_woJW_4RzG0! z{O*aSouO|VE7AS-@SU~#qm!YkYt8DdH}dQhJXUGv3jnu)BUKt8>ZD*X@u&vV1K7p& zM+N;S()W8Vx7T1=kb}nhcwTYYjm1N&5>UL$4N@M{U=BGZnWMva%U4ttC7315505QPqweUwvfn1S~_Z~L185%5v*hvrrU z;>oDGDc$wzkW0aP_9!rnHWuwhR=d^R+N6nLwwOuG`7YP}E+)ale{b9WF#B)g@EiT= zsIBF8r2u9;6peL-nV~4FbzQ#4LeuxO#mYHccC(gneDQnWB8U2ZYkau7{gH~tdB1Y_ z=tkcm(Eoo8c_1nA!aX3jUqevb(-?5qaykeXM>LnNv@+5>NKf%|6*06OC!Dva4D3rr zt?pZC7>ZHqPiDC!A*upguEhRe2!c2m0wglf?Tq8<=r4!<^r_{5@)w2LzTbwA__}H( zI;=W%TDEL7Je_e1ohumseg%41*`*mzL;U&iNJ{WQ+cWI)Zm8DaX@m1B>x`2RyEaFp zsrCwX8XGHiGqAC5+GeZgIIDbP>+LCcIk(8{;rBi}H{|yU$CoFT!gR}nsI%q=t?%#tj#~ShGu5pB@pI%W#@6w{&1DGc5PI%k??y9F19<)orxu<2l z+342lZ;I?AihboC!jm|64(|${7VbR7&QH#UO~Lk;@DMZmkO!RQ=)x8^J>%bn{N@=^ zKG3{qPosAGhJ0HotGM-^QM+@9I=XY9iBNddX^?l}pOC1<+xz6-fX} zNI5zksKx}jH1X3+@-nq`-wc2?+KU4B!VDgjHmrXBZ+R(!aUa-yUuS!9ar*G7>7uPY zF&d0=K$Z5Z?It}jm{63yTiDnAk3#SWyCs%}lfRa4!d{SIt zy6~3UoU>4|xSaJlAU=-Cs9HB`IYnV52QYs zC$Kk>mEF<}dE0O8r&oHC=VsdE1YmJApSq`ISa0?_@Q{`}bX5YWcs_I!?&L%_a^<4z zM;Zl+KX!CG?<%yvN7CR|Xg=OPXq(?Dm;4XfNkC8^eMSzuT3fFR*MGT?T4gpLaNH>! 
zWsD$P|IrtrrJ|;a1Dm$h=zdDTQ6X9;T7eo^LkzRtBI_b4iU_$YXUmGVF&uJyn96nl zu-fdyZ-POWouxnN=pa=-EG*f} z-alu#X&t~P^UMT4mX8?tXq-kSPl{wGxqdP+c>Q8`A+>eUN;@Ze!9*ndvN%0f)C_KhOCOdX9;+LS=;3W5@Q*y zxSUq^k}tctp0Q1x0p-FjX$VlpN( zjvwe?st2O+KTPsBgu=U26dDLlPH)Udr~SSE-<1_H+1WZ$O3NsA{_%)z0Pgts)AjJz zq9O2sCtG@vcFw$NuqFC1V`LTDqU^fqctvCFGrB3*>g4BBUgnr;N-KU^7&3J0io9}P zkCB3~sZ_Os)5p!>55sgWeaoNUzZ$o7yY&Two%D*@xMD^T!oLduj_S*P85P|HB7O6s-Q}q0U$1NYa*%;f5Cj87|)|J9meBBeYfj5aiHB~nLlo=7(b1$m3UM;4{)+P%4Ph@UFP ze@X=V`wbOm#e*|B@%#2CLfx4-8uy^>Gjo-J14{q{+V(3bXF-e8%)?~V% zwn|JSjc92nM3Uy>HR^~>d# z;Db+y3dkLOUUI{Tkf$HwCR2tK_A zv6%7G!sVm2OhLWsbTD*z-ZA0EMb4{XNT$yINznhsVyj`KDDPDAOM`K^-pU^lJ=lpO z1b?qSX?2Jy-*J|NW4X0$>mjm%468;(W0E(X2o?7v7%3p={`@Ty9h8 zW$&d08eA!s+{J=)tb(TY_)aIcgk9@bCy!C3Qr8RqZEy9SHv*$7TBVz((I`FFD!Sn@ z2m6GA*G=fG-likV!aM%TDw&{(tQzRvF$X(Li8hMmuh(^OygFkN()v~afv?|r@LZ1R zPvv{y&_47`J8x}+oV2T!P(HbyRX?=c!}3p@Hk+N+_T=UNqn)XK1QfeYWxQr$VP69| zU-S%r@o7?}!^!Voj`{zfQfm+9JF0rzzj1FlTvKi>xK8g~$&dJba-5hlZVXF}lU4vPPjoV9w+cY_4Mc>r?Ks@Ji+pRbcw=qgajy z@~S()=*&zKVnj11zRLS%IO1*sfgFFN`YC~!+CQDT0?H9A;L^Iwif8}1cUJw_5hQr$ z!PNlfaP)mpalNgtkdr^97QMfXb3W7nCD!7rwRi>Peir-pl>YFI$@;Qe`)skKB|eu2 zA>QB!JmziO;J`7~>EObFkA*}NF z7;N}&zv+_+FI{`f3q=(qJ(B|yf8VoL|0{3DY)nt*Tkj%UOhA#E9RGMOM*vh?Z&k+9Z=Frpeo-?y_k*t)r6^TYD&Uv9e)b& zl7BGY$6`GNv09bVYw{a)Oc>KWHc6)TI#0tLD0yo4SW(U6Z^U|(DE2Ay_h2yLK`VfF zaCLEHtn!~XCCE8nCMx_Dq~4K|96*nC1;22QjTGy797=nVOWfmVd`Kl!oUgI1{m*7c zA{YHwqQgjV-QU>giLm{hZd;t!&VX9eQ-t_-=;WI4pju;CFZev{sLRV(ADOfF3iB`G z^-hn9=lTBa2_OIOJ>hiW!HthkL2E|Zhl>em=lcW8tMV*B6TB;Cr_9mUP~gf3+>*?< zG|K<5J0bbsw7>8U^=|d;U3B&9(|bt6&dU6gsyd~~+S7)#^F^FW%Zq{`A%f+1dUlLR zO^A>H8nMFx&&5c*gWM=6qKj-f+@L7G+Ng>l?gdG{K|&-KC&k zk;p)8B%SAHMuN2pd-ubHKKLF#nSD^g-(e{ly6aQIaQX1W!iz=!d)~VdIIXT>YaZG7 zefHXUtI7JUz-g$1#3@oC%FI|7|7DHV^g?Yng8PBH0_HGzY9cv zHLMtn`ae$A#0)CbvF(Y<(f`s7yil^?fyS!cYE@+uQGU6u!~wnyYK70PlTP+iM6CkV zp%2G3P~BokB$wuKORQt7p4U)Wg^o`>hk>!xS_6r4cdMy4lJu{VedRn9>SYHo(k_O0 z!krV##ys1QO~2s>y4~VkU$Ldi$GH+3N0-8r1by&|FWC6HvM5gl?60O9QTBCJD0do! zXvL%bz}y6_ZN1Zf(sIanP+cFf`fP99r-Hk-dF(NDDYzb2>|*>%4u5DM6Y=$FG6tE; z?B*B*wTmnM`CDryTv0jp(-Y1gZid!!RgQ}A+Pu$4TzK5hPt(dvT(JX}a2F(J=Emnb zQ)rSy#JxYqi(4&^7?EEmVQlifOIX{R zFXq@EKdC6GXlK+0-^@>COIy=DF7zV_@!W%@6;0I*dyL2vmnGc}7-IssODuNwMa3`@ z8p(O#)SV1Fl*hRLbXchWqMc;)aj96^ig4(sim&?Jb<-G3bkF1IKJB-zkvl>Wp=?g$ zj8LTsT@KxXA*M%xhuDjQaS=JUr`6X4Lmz-dGcw8 z3lWY1KU&LxsTR&PE;cpPYb**E!Gm{=JiH8!kDGL#k>ZY+H#EPn{9t=|9#)OV{S>RP zRpxB0*`e(c4o%~#>;J#Zsc+$SWh=HuDdMR*w!iP$X8x+$cRRHe&gB0VmK>0`%^F30 zbX1>VQ!BkPP&OP@<6oRm{#oV8bkSozPMFWWy(L4>_VR@HZ=O!oph$^#Ov>2hn z?>)re++H5vCZQgmKB)IMRbV(56nxGRa&qE?? zWxA@xEry)4ljm;;r=vEv4;@^li?n!Oe}6_eJasGtqdE7&40m|MO0AE$55&npz=AVI$XUxQ$H<|o71&zJe#BQ$2W5nK}ZErhxzHOoE zSbbsj5DDmT;5eXLzsy5ApRFt6+}^JKu?L|&4*8#MVtw7p1!R|XW-OyXu=zpcLF<%2FD)Z6wnb&IyZtPp`gWwI+VkPh zi2p6P^x!f>#K(IDd{Pa)G z`t2*jV8td*zDbX_8JsX0E1!34uLF=X8OeY}Am&L;gQh-L$$CQKmB@6epDZubbA5I7 zIQn}nLB^6YWApZ;tnmKKHYL@GL}A7%rB3y!f>=oVHSs+`!z|e@vT12XO(Savqw{|9 z5CRE!K&lOy;rFG8;soa!zWZak{2a)?rE=yJDh+bEBJ{x&(MnY8! 
z*vb4VKE99gx{tnKZa|HA`9+CaMUtU$ygnW9Q+p#M&(RYUdGM@py0hlx`~8V0Z_(VlJ{=nZafHNlNjZe(tOsy<8prF-M@LVi0sZEvgN zr{eyw!pFT`1OIV#l~X?bQ9o|od>a31%o?sJTXA9BTmZVJqWt2eCF~s0EXt|vcai>; zH1X@%gkkm5jP+3#Vr99P3F_iF{1RWh>Dt}?dUncg@YJvVV#cw2@}$3^cJc1jsj*m+ zhmD%Uqw^)n(%ciMO%Ol&ZPOS!-NhLC%NzVJB!}SUKX<#6tD>oPYg@-wJrvefB^gz# zRdsb4G*DB4`fs-8BHs|^+TUuXY`vV)!y+N z+I_0F)^O|w-Q(xqJUp{rD0j-hAni}IeH>1Rn6@-8(Q$cmI<0qJ_mZ>yp{Z#HGrS=6 zZC^t4Ce{WA6IL#P9Nm02Em0X|fZa}*69E^v<3vhy5E4N`X`QFvos!j z>y7ss`Q7V|CNCFac~?midKBhNijE#QMKoc~6g_Ir1SPusMBtKU;2rXNSR;Af#Gxjr zNr@-AX;jp2`V6Ng;QR%i4R;~;xT7=IlC))3SJ<#t)76SkPfb!|% zU_IyQ_amTHceN|`Qb=J0{P`_!?9NiPsKGprN zm<|^l);~;4mF%C=DMNbapq9!o7i=HpH1#09!pB~Cy6Y*%`~)H zf1DjAZgI=Fcghy~!5+16BrlM##<}9K~NHK0# z?miWj0hIa*(0Td4D4>T1OtNr{uKmzFN9=#>o|yxpmc#Y90^(*z8S{43k$=PHISQUY zn`&0wO=f`YNm_F0Sm)^45wZ1RA+aqvFo&;T(Vjg)$#azkyXcgcjO6rSsU$V{jWs6D zu8e%2EDOM&q$B`pX2u)rC{Xe3+wju|OpMk~w&d%sq!Us_I4F4jM?%@5n2iZ`*hTu8 zA1!#SaQ~4m(J>t~(=mG9Hq>Rr++sqpvJsUvdKPZkn2l54{V+qsUr*ah`|-g^VHU5K zOiSziI7)#H_oV|)qJm_W1JT53qa5>ZWsZ*AKJuj~!ak+2$Q-H8(j^8epRZYL1$-e4 z?C~~YXq(bZcppqQHup$|Za0swF_@|C)o5`d^O{yEhX_e0Z#QLqHYVa!m)^c^{G9W! zh1bc3Yu;pRWk-`Wo6zzDiMD6NeS_|lPbKwX8s;9nOG0Wq{IU_}9?UNRQYgk$a9~*_ zl#s;;AHwTT1tCH@ruWAHpEz~>+a<2Ipius@sGHF0WP}v7U*}vzTu6r1$$zoRl4uC@ ztdHU-y^?~af;zmmUhN1)0MAgfoFiD31FMpfWnrP{N85|Wj;oHL%(fcOHdUVBm)i1~ zBREHo+x^A%S#(2x@ZN2oQ3v{J)9dUS3qQ)?NYq!kUh|X%F*%ZRG{lx;4cT$77&_}E zJwArT_Y58SVSmYS0xr(1;y*Thb;d>w>X>`c!HkKoRL7#xVwPWZaa;W|xMW2Qg2Gtk z^lHe$Gk|;4zA?7C^!JP?=0nHQN~kusxly+ZE%hSXE{`s=d)8a2{va!O6aKDjNeOGE z%I55tQr`-uY<-{;;aw)$Elt5QQPDf;eUG5B0n631kKAKF({6Y5h0swXJ7`ev(#B7W z{WZ0##P{#dxY+y8R>j=(-u0O3l4q2eQAHg)qwbMhl)o){xayViogrb4GK4I z`jMFv=qtKr6;9dbp!2*9-SK@A=mCJCzg$ekdY>aEHnXs3AtEqQf>ill$%#!eev8ww zy_L-{L}MQvb>iB&Dp@e#Wv9Ns@pf1;_mVl$D0{EjGnJ0F46e6l0U+}kt5xPx-b_ zm%u^C&z~KTU!??5kq!Dp!}>Sj; zaqn2h$+i7YJ*~~L0ih|6OhoObnR?5R!&#~NkqKcPfk`i4VZSrM71^}54dZq2Yz+#1pp;&=1X_2DmGkq* zXJVdeEf||U_I1_!BjWcj-Hz8mdqEneD!3mePH%%ygs($5O(M56!oRHr>z<59@IO;9Wgzd2$F8K!azl*NC_A}$wnmaP^63PAJ`HyyCyJ?; zw!E{)FWS$&EAZUyTd-Ii4<)A00mRE9J~O!;Z%wNTry0W;RiNmkcjenZslw5w0q32T0^cHP>mX>CDl_q@AlRyV(_8F(OmwzF+Y@5qKD#`Wvpr>hS#wM}F~!^VKP9@cTd+7E$ZR z<88|31TIklS2}Symow(7AAm7$a-2P1=JD=Em9c_X$wWrIQJ^j&GmH&0*){_2k5)=D zP>g@$^f@|xYZp2<^+`^?mk~r1WB7nudeh+5J#JhJ7K-mL_)+E)w7TT@tJxA!pJGPt z!j3*sbXN#9aS#4N8c6}1sI;==h}o~Qhp=wPUoBk-FR`iAU^hYH_Iq>R+kcoW?kJN) z7u6}YISFxQc7~m*hKKg>h{stP_msDj^o#<0T69Zu3_5^{N=OY_)Yshp&pD1ZmvEpB z3}yY5@AvVvbZG-N8gT8*-l5{A%mkn%`Lc)6s~%x*0F}Gz%*`I_onvS~Vlh>GcoLQ6 z$^L`4T<^#@)ZZ@QvUB<1n!UqVd$>OuKhdO<_o^1a7`{Qf%kI{#(@e7*nr*|`8~c%# z@m=eLuC(f<@B($VlkYQoD$KV|Z5e|``5!MB@wAAb>(cC#VJsV`TuY_$GLOjmB+*9S zVWaF?Lk$W+`K1O)V_2#zC;wWC;X5B-fc2YWVQ5&NtXNY==laXC>tZZG2bficE++Yu@i~Wem{l2l%4{(OFZj zlf(4|1f9exxR>g?(#k(PDBVjIKa6Fm z#6RKYCNHBe7ZqRPX{iDfW51Cvc9p~#!AB0LWO#3w>OX(ZqOUx9@wv~e$A_B`1dQr4 z0;X&=j&1rJ1!j{=Hi=#z2^Kx=$c+?o8A(v>)wd0;+whsM{g#fxzZI!is(anWg~>M; z14iL*kiY)1eU=as33+orZH^Y5f5DeiL&9m%;eytOHHv_-o-)aKDx|de27ViFQ)&Ru ze>`DQnf~d2uaX_G`2fZ1<98_kG^Us!;n%wrfSr=`rp2{oGKNSG7=5ge;RG zTbtB4oPv(h23bCrvX*We7F-J4bH3vU?rntjAq%MJQJWB9!+bg>9VOGY!$XA-;pMfk zj910`SS%NVVqZRC{n2rI-v-yz3V9PX9F87>R~0>5 zi!cqr1h#qph3uNyb<1#e6teu4(z zDrbsvkf$-E^++)D_{^W5PQ!|#8)G}OB9_S@hdfNO#cwo@)4ellFhh@f^>a3(C;A53 z^RJZ`0T_HNrH2N(seMCZYI@Pq$#~s$t~`#LS3;$j`vp$JC^iQ^k*O`pSu|1- zX)4`TKIy^KK@w4~JEE|qg4tjM)Ghb+dgT82_CMJHyyPO??B zsfIke5w@jLy8=2;_pIeQvGU{UdfN?28wa{B8?c<`W25wWh^d9n@3HC^gYRr?L__Px z>;b_!yKvU@QsB3j?~(9FGps`^wK}@QhaFgfTikVS30pI4zPJRLmT=fz2R;yMbBCIC zemu$q&baL`yW!zyJDP^X@?$W3ZpxVXnV1{%QQ$oj3^R9{*D}g48e`!FbDE0(W(09s 
zW`R(ta%!`Cf|)r(yiDuQc|W-*r$wQ&fn2XK)rtEXlPJw5WIK(5hT=fwyVFlp;*>(O z-88yTVOr!7GWAm~GR^a`d)e4jKi27$XoG99ml9FQPp@-as^s;BQ_{wEg$MSmo6|UYb@30gRjPGS({vwsCNgJA=#@<=E^2W*6 zK(;xBE;j=WSEBGzG1ag59SPK@76avwN~|}p=wI8PlsE{@cn0W)-L!1n7|O`L^cCp+ zMD1?A7nTN<6&;ka&NrOtC9B+9>calPXKs_<;cWMjqPn+W+`M;!#iny8z-rHwxi5Dl zXG=E&_ad8lCkCl)j=i(4y(U6DrBpl3LjLtQ--@?B*Si1Ff1xXI8N0=ZuR#uhn!J=K z)>;4K;=L7VZK&zzsVOfaBi!BB*kCkHdXGwP=8vPnl>TL@|Tbh2n@Lo20TbnQ^X zbohU?$qG|Vuatu)UmI%dqBLF}2emZT2TD@*`2BX#3FP-8Sx!-m)Y$8I2HE|_f_sy) zhAk`UeQ$=hdjSMsiR`$De|`A|dxiwnHhkWf>T|_NQPb#Fj}Wu%Sgvuhlox*P9DSsv zS1%XUfeaVp=83IGy)4OWq3dWQT%(K^ zWO`d5!he89ymeuj#PI7$fCqY_C{i+h%~PYyOV%UFSUrRG&^>%-+c z0ReX(r5AfyL9?683k!!myN|c>Kl>0pZ_JI@JBBRX-N>w=*#_&+OWiXAO}>SbS~^I< zMgI^xyWC_3zj?4=PN(bltETIK|3m4>yf*taz5+U~AeJN^)S*nDwWjc({HI}c!;A#s zV?kQ~?JD=0*oWbY2hi>)s<|#qeUd zOa5|^heK$>so}7(#>kmZ=8{*RHD?&*EPW(N*Ms`{6~|c0;H$p?Qv@CAh*;5ouSq~# zg&25H5zUVuTE!yldI^3d{|zd*ApNewb5~Qo*}sg#X#w{msW{;+em4o@DSRQAhmDiX zOvXH3D=PrjA=606ns|S5vSXp01?NuI;w#~%B^&;3r)6kC>VO_tO&@?+zAV852o%Dv zU#58CC$4XW4B%vXZA1gY;dqgj%QxmLGZ9|OVa~B-7oU7-^^4A*V=dH!*kM%l)^Ci< zcDh{Fs8XXa(Kf%0*n6`q`%u%G7HnYp{vb6aPghhdF2i1=xm`Kq%<>FnDuhmdpz9PJ zT@W`KzZ?oGd5a94XQ)(QA8&#XSkF3QWb2Iu*w_kL1XBRNd$zVS^!m!HQ7*v1po{ELVnAlZHY?=gYHB|AdHO2-R8rI^4xZvo! zp24U~lTAG~3}k1=7hsZ_T2`i?kgyIt5EYx5A)R!MHhM3ZDkciY(l2x@X0^9e>Yu(Y z9*FWX<5u6-9p!(2Q%~~Qp-7DO38II3FUwAA=oxqu;2hzitt?9**@2I$_beNoM@P8v zgByhp^<(^2&1olgPzd*H&lyOX} zk)DvR8Vcfmz?nH0Vw^peR8X^Q#PodSA5+@$E5GL`Gqz53z~90*{x8exVZU)iTh&KgKXQfl!cd?gIquvDL}4lG>gy=Ixk2f0L>Va%-GkRI~)U=QM=2 zM41=x8tYH9x*FSvDo=Bi6HM<(c`b|a_VXGqPdW(WJut9Fi-|G@5_xILnMHl!qLcu2 zlo!LnfiSVF%9kS-o=mt6OFanR;&Xu~H8R{*>VomNT8Vj7MIi49&(lqHFZmA?qLAJ5*Pyrabz%YYut z!_gB2^OC@4nukJW0soj?rp;@U$ND8Rt6gY}3Ew@g&9Ub%;V?kErL=!Ph+D5z@COZ0=!2(56+8(fOtMj5n9oHz~A z*x3ZFg$$Pv@rJYe{|D%@php^UKZ_yw!%%zX*WQ)l>_ibK@hPQ~_PwcVx)ZPM2r0W7 zMC)Ilrj&k$W|Vn+zShtx93NvBifD0-BKf#bW{rNE7Va^r%4eD2({!-RX(C=}ya`n7 zY8FH%YmlCqa*IzD3EeoXmu{;WngU_a-GeW7qWovwotox6&2TNuw+5i#Tzj7c+%w4I z^pM4=sFp}6TU2-1nvV|68Nk3QqaYdx%1g|AMWje|;$olEoLFFhjhZ0Y!A2gb)NQe~ zocMlu%-@`8+#q^a6=DK}m8o-l zK^RDboKH4dgU>lq$`4!b7W8tg+&rfS^2?V;cpqFtm5O+ysT+{S)0n*oH4{4DAa;QC zvl}*TeHYH(wT5Ib0E^-Al;c0?GBN=olR_bB+bIP>TWyv8W#SrvW;Hj9W!1GC? z#)L1Z<_XxRZjFAG*DY{e&1mDf#kwZ5tH|JX{JuZKzT`J)PPk)3)qGkadu9=?XvyhM zx$C*$asQvuzK?=go&cFOddJmnvxMsekZkycHc=4}{mN(5g=+1Ojl&g9D)`#G^tR=P z$6AD!M=p!v}-{>R7Epb*{8oJzMB_ zHjb)xP=Qt%WpibZ#4TP&^Zd)O;-e1U1*wh7aPy5Gma-d`I%9-7s-(%oa-Sn?=w5Wy z7oJ2M_UMYx^HrI0F*Hw2-6F5@IZL zPlu*}PzHcFmQdyzYi5%f982c(Di55n*|i=gviU}0L!?vU&_4^QnGYiwYdvDCQ2bmp zYlZPd2(JCFLO#;-2t(QPP@W^sdu5YX!t+g>C9UC z%%~%3*#ii6z_P|^W5*DJ@>*V@#-8D-PAxQcj)&Bj!}CS?=P$wqx-?Mmw6r(t0t*Qt zfaHAp&-GsgN}@o&an(crN$c%mZ_^hpeA9?T0b?wnvd~l4$MRcC3l{X=ks~(r0YM|5 zguXRlh0a~A#@LgbIj~=xnU|PMF6V~MwR7@N_1x>u2afo&#V_Hc<%X|SEX9>Ffg#`4 zeIhV3ABkxmR>+EgPbk;+#bm^J)IrbuWAW9pCxh8X4GIUT!~duM*`DO-ssraw7QGX0=X8cKD>UrWQu8OCN!^`H-S#Q0N;p^MI+h1s5*uF1FW`k57{lFee5~xfb*Ex|JT?FNhbI;)nNCQE zaFNS~+M)OrF&pGY+VoWbJ;P~V6qDthwk>4t{^CAjI6yjS7?;dDsto8v0aiKLP8`i+ z4hF7jnKf8_Kei}M*XuO;6l@DUl{qinh!bCqe%2t^7?If|d$We1G>`;Dc#tl0p^I$t zw9p~&VdiQz!PQ>NUL`?zUqp=ZRrByqJjCNKtQrR2mEP)RLuT;hO;v2$8)R5ANPcN{ zR9Djf-K2gBNCTXTB^wzPEIYQ#kold7=~07A;!9(pMu&ZmA)8h%MkY4Crwy65QTZJ% zcqTK?d!37FaQj@jL+0=EFZu3{1pfTr%te9TJS%d4iIw*+-yqF*(p?+LX2@i-o; z;qyLt3ZXtH=#w*z+`Lg;e=IDrT+jW*3S%)RQS_epv@_%Td9>-$B^dwt{1CpUwuMB? 
zIApj-pYGgUsx^Fh+crq@oM%_yd)<=P_X}z+tFFh>wuv?hMB|^#>O}6Mgyxrk1}u~~ zUvpE}A8QXK71O=#x3BcN>MH7jDOo?IL>^eVzlj^Gg}h2$qFOxu1)= z@=S?VFie;q9bKWS3vQ%Z>V%^d3uhS8?AWOMKc0=_$DMidUqJO|ZhopbZmg(&*hbqx zixtjt%A^BQp|0Bw{zPkSU=nMInzd zx=xpGvsv%OfJ$F-8~Ms-LeLX#DjhsT4`b(A(44Sld&c98VP-8NArMT|wOALQoi!(q zJmA-p^Q|(Gd9H2pB8r5d)KjY~MGM<9%W6DN=R;hd&7RA`_}zPquD0YKrZGCvx!+k> z%zwVcCjJ!AcCWbZ^;4GPp5k2+w0Kr%5{Aa-bZcEl<`wLmZ77=IUAO@okl(XH7#;U7 zpkyDuw9wj8{h%^^G317W0qc19fY+o5t;3I1F#Q@nN$g|v;rc?nZo8jjB(;W$(l*_B zUF2`A99@^UY-^Z0@=+=r7hXBAL$-J#lltAOecaP26=eq`&7AzSh1O)+VFxbRttXi& z$m)4wg&T6dCzv!UDa$3FWPFRU3ZGSQ3Qp1Dn1T84Y`bo0jH}k>;x@ls>j$z5k!uj<91EnPbZenfOrnXjkX>UG0slzmyGaD;#6)t&Oy{$yN^ON4)5umQm91 zEcwI6t_)V`K7DGsSolcX(byK;3=>s;z{>0}8V%lmDQeSNv3}E>W}6#q!1}6BR1|tH zc}{Il6=1qX8xKrSFZERBcvdo>oo?LmSO;{fTeces`|9BG#LD=(F7Vmf1KaNL&6;np z-@~kUfOd@bW+l7SCd^$ppsMpcMqDX&uD*3IZdFu(>(4YDff{0i)>C8=nHtuqL8*BO zAzqX|d-gaU?&=DTj`lMxPAo`o3zDEk(fYtdYF>|HDLovCl1u$>_BqBx&udx6uQQOb z7{Pr}onn#NeEmjg{k~kU_vl=<__=&(G+2X3_Nt=HD)h65p_Q6U?}C69mzqUBFJ0$n zk!u1V5wwS}lVnvU71a8V=1Y%*vE?U0E!+o2x05Gw^bf7#*mNtuEw=t-{St>rKa=W8 z@u`>?fQtwD)}n=e&KSCs@HypvC(0LJ2pKM)(=gM1I)0ajzxInV&Bq$^#Qocq61IBw z4}z+I=Z_7OegE#-^<Lza#weYhE1d@Cb!gSPG_)bmuO;0&37qIVSB-S2ZMYP zoV6>kCV|z;N0VC~IK7bD^hzd}2S>KnhPYC{4;AiA>5_&S{;|dSY@mO3<=f%fGSSdl z@aC98yiQ}F*QjMNa2$&}8V7|nWq!s$e*uDLONeE@H$HjvwCn%c2AcH@Z`?Z?l-EWm zj!GExU_(G}9(+yqE{XpuExQ=d^{@sMthPjC_|$v3keHXEcXt;l6gQ~qAy#1U{NEw= zDE5M_E?!6tCB_haVQPtr=PTr_^eJrdNyVr8 z;z9XWeqZ}O3^rQe7AbrYK|g!suOpCB^~Nq4mzoJfMteOlQNV`mds1j|{ct8uWdX~I zd>eu;YU$aS75~)5msICLQ30!Jwn7efVg?pitIi$7&*3ke<2U}l!Z}hL^G@fy zvZ%XC9>J&tMW7{ZUNGG~5sb#JSKe~1A|s@z#p!+vJ2eo)r}X>|ZN}sN1Hb9@)YJ^I zo2$F3vv1_RQ`0E40qIUQn1QX;DSK>}=8n@l*QXOWqMaRIzjz5BQ2XM!q?tRBMcD^7 zs&s6a*h3sgiK`VOQ+EZ9b<0TlYB#OOlvcdrNDIqN5-#7a#5@U!mKz334q3({zB5=t zi}N|zHP}D%kVs^aglZbbh)MoTp%A{^`BZc>pmy=;P2ZnyYMFXA^x^^?jX&|nLutLq zaS4}9X7;iZGmSBWjFTA75i(>qUZNZv7@)rUsxM&ztR@1r zDBp|$qkdPle9SCzmRuG=f2Y+C7qYFc ztJ2zYAo-Yjsen%&Te&m-7arU!K12}6Tx+!32^#&o1vPYfV%^aZHFAAh0%|zA7N6kz zZY+zawSXJNr6QN>EEY;xUZP}I+8r4=wt|&^N!7LI-h`SK94?9*bewYUD;r z%_dHLnBKvMnnt68E8ohtE>2SRd{7!EpyIc0`x0sRdNJz`-fD$C{no17b9?&fFF!f< z>YoB1n*)t`uK0*Q9|NM~c&f;~UVaz8t13(g(k-*;6Di%HIy6kYS*X7Vpozh#?kIAx z`aGJ3XHi2_M@xsD=)V6s8ClQ2prOJZ;;LBNpxu80RW>F6;^Th<8p+ksBRQi>6IJQn zdrbd*E;m3JG_wNu=o1aZ9NK*?+WSct*U_fFbh*dV=tH-BNkO9I67GneNQ^`gq;Tc} zHOlB7bCjSx=qDw*9%aA1nauCici5Fu^!&;tthvAP8H-<>8gi{TG|FghTOuT{I3)WG zxs^_|WF)i~Hf*y@lEQq|w3_!h2z`*M8EQqyXEI59QRx@~3qp}Tpzb6kLkvcUh2ziv zI$|j`rcgZc0i;!|63b05Pm+3T5qR5Dy2BSkbhpdadFx&qe8YpMYV%rDI&NQN=*KJM z6QNT+=*KW?2T8cm{{mH%?oRRfoqXa8Nf>QniyzIzv^f%co-{e$A%X{czdJ9YqE8d4 z{esMD-7dMmqJ&n;3+M*ZmFK*C&f0>HGTSka zo3ZP-u(}+ttCz#9CDZ^T-08n_@~MT*8W*k~CtQozwyoQE_iimiED8S3jht^rveYx7 zID8&!)j#_291uUGylerh6OH-o@ucFs;vUU&UMb_>^jsR@OF#QqR+J2Q@sS^U@9r3N zx|=y&TVNY)>GusRFdcu{j9jy{BiAhEHM93CeI5#h@IUCJ`Dc0X?MCpv0_NqKpN<{h1p!*f%?w6P$y zyqtEIUY&R74oLQi;wMJXOkcNgUY1AaGplgX^^j2D4@>sG!ioJRkh6nCj3OzemgZl2 zQA}>VqB&hi?PkRtQ=5B{f8hzY5^L^MZC5U|@#tGBjX=Z!V6(F_yH>58q3pBb7u2W6 z$dLlG9Md!9M*&!*<9j3BwRv~Ub>Km^>P)bQ4=5KE-b74cVYT)bwEHrPY%oD&o~1qw z^Bqo_6glIP1^R6ZTh-KXlk+DG5b+F%Y6+9Gj_L2wcSfdO_8J~po;!qcU(=-m&||*Ra1KA zv~A(=+{337J*U-1^`V_C7Jhjg^mJHP z-{$mUlY>;^K$@^QCB&_Cq1n9+a$%%BRz>``90 zk3XqNM$Y0i$!(l1aRIs}<%`dOZC zN3!T|c;PtVT|M`P6h-j=Ad(DZICrYixSEU<3C(}j+E4qJFXo`(j2$SD;{am0s395E zub+BmnQqaY?s+rAXCt^}_5xatR6l*8jnIZ!opfvU#_a(9Rp6Ld1v2ERVwzq0`3Stp zwN#h0s>oP#V1>wCz?$Zuluqk+6)ga=*V~Zw*3u*fuBq`BI{H|*Qw)vRkC z-L>VLbWA{Im) zhFEr9gHxL&cHB$RpXgCIqU%xIP{47s^RE2n=Dq4j1+fuUF;f;Ek;U zUj8XPl>27-%m??&h+=Zl7ILPZmC?gED#x3^gUZm=oPPKLULYdW_lU|QR*&6f;Yu-i 
z5UZS-kMdtM%9LrWC(k^$;ARj8%uS9>57HqEylC*|{vLhzuB3uu9pxx!^ZEE87yIly zy<>mYEW{~PT5XDxmuM(`l~=*|C4;Ad+W8Q@fuq72^|S8mwhVtWW+_vP_u-rWozJ`Y zs$f$0hCJ$^+LVLO#Nyt-_bpSl!0+0%Uo(8Ce}r5OD@%}Kk}tchs+mPse0c~#Fj^z} zW8+hzHpS(B^vvvalTyp<5c$$YY94++?{(!;?^`-*(qPL2Ti{Sz%?#DBOd*jUjC!7D zn~AKQvz{V_{?4hX`uF)<$Tx0GM*|y>^H4SHe7yZb0~jtAPKkGh!N%nrG9_s?YL?J;$U;}pNBZM3N#`V^T`aZ*C z@bXnPxoIS&Wc5X(O5`!l@-zmwcJb-Ze!n60xFuEs6*|g5U##X`$a-~bw+EWz=iUM~ zLyP5HagGwC!NCtcW0nMBdS=e3?p!MCI;KjC7Rl)5!R-D#iVz)EP;XKyK$KVVTiZbF zUGq;FNJhz3y*q4gQwi_RWKotcXu*yoQ&-{FZnSrE> zKU{COAI#u~_n~d5gf*fBTthJotDBn`>EL6B(*g>L-Y?1Yc0=b-o1l|*-QzH~58}Gj zF94dS0PRyZBlw#1uy4H1`X>zGRR25dS}}t=yx$iB$MMjEFK20)__8H(%>9Z2m6Dxp zsRLu4QXZ}x8C|c!@!B+@_+aO12PbNJ-~H@(zwr)E6B#j_Xq@sV!SD=KO_g|q5>qFO4&1NK?aS1dZkyD_+g3+FwnL!*n%C^Zr2)FBIhv6JT%|QFyobw( zyrKzA8Zd%RVdbrdp6_bva!K{*n2N3hV|-i3!{Il4-3NHZ1CQxeho+AU*0o0mshSc* z1l-zrs1-9W^P~o)y-&1bt8c8oK1rh^ike+sGR?wgfwxG+7ywZ8mJokwNAK&L62dI# zoC*H%5H_JU%nKT$G2K4S%Z5RX_xcBIpXkD-pvqv2MD}?MWntPBsV>mcxnXM zFMD+hP{79e@_Da2K19^qo~yq5!ivb92JT#v@dyX=zChjs9!3^3A=TLM`>+8*8k$F0 z8!t4+?4Gf>FW!x;K7q6yx8L)&?QR#A1!wma6R`jBL6&R~q=7fiCL~AyZKTjECJiR2 zWPzNI9TPH8^^sz7X@@y^cZP=~n)SIfBjW||?FYzgu2fl9$ z>^Kw0f5~JnG+fydIU7CVJ2hRn!5Xj8mR6*3@Wk})T}Kz^6f9Pil^XvwMd`HJ04MQT zt$l^>)!p9fqGt2wF8YI*-GTJLB&G29Z7lK>O*<0yqI#RgkBh02x;Ck{6Z^a#0#jgX z@utFY(iW$)VZq@wza?Yxv5z6^xaX{qN#h{IRx#;Sbj2>+(#Ro|fv{)wF#2`EbL{vG zWZ;S&i*W=3Gp4%9wN`ghnlFl?rnbn^m4`W%&v}MrhNg!RNV#W*GaNMH$+hS@|yJ5Ynnm?q}S-tp4ox^5sROR z%Fw4}#Kj&lL78;0n*q}mJP=_YzI^OUE@qI>!C&*BBe+curts>E4Pk|97IN>jxO98;kHy#wK#;C`DVJM%Vx2eO;v4FE2(1izzc@jymqATE-=ymF$|aQZN;W1NSM*qc z8EQe}-%!`;QnuQ#<@5%147M%*yG!ZC%cOof@q{Ox>JiMPCcW@}2y+M4x z)d0p}5B9;*(#b#@pO~L6Hu8-&eoeghbEvHxye|LXtl)>Ir*cp&uz?AkfYcdNzu-g4~PeWtylWH<3xnv|tP+{mVXaZ^)ztov9xg}Hl z_WA|$*~|a?lr;`h+_zf@xP4A@mD9Qhynvk$qvXa{M)Wr%W;G)VaJ zkDw6TIub0ip6tc^I8UgMci2mov4M?K^t7`0oM6}LX3kuYGJI{$w}MLV;I>qo$#7-3 zlI|b^ITP4QaBT-!yG4c?7eWi&OHv#&86C7OUGbEuNE0pklO>L_!(ieJO zr_WphG-)uixw^k z4B1hyig&RXk|mS3s3qhaqwvk#LIMUiBF)56jU4Lge*P;uiT}%@GgB^LieqZ|3vhtx zDfmO_7X)rf8uv{i^w&#ds+8#&#n_|_CbexC#`LA7E9RAH$g*PC_j+H}6B=|S> zpI8X09dZt^yB#q%J2o@ji*$rwzvYNZM>wBk1n|heyd4+HmO^#vh*)up78ZH&{;K8C zNOq4hK5z4%JD^z2Nw*Y!n(Jz_mA8hh-GcY^wH3sxUwxYz?VZyKi6FUoG8bY_lfKD%(FfI3;3+j-jt zyHUq|3k&zabIFJSv*o@y{U$u9ouQi&Z_?w1;YWqD@HI0gyteetuZ0>=e!taGnm9iq& zNlzo){(0e`qr9eD!p<4puk}jYT|a%ANwE^*ZhLMdwAxLS%7`fm;7_?kYtEDiR|Z-+ zOANoCH&TLIrS?OTGY!c(y;%RDWl=}&i{0s4;j@%f6Dz+6NKkNe@lbTd)? z`PAM*FDu~9zY5|x3m)w>1HquW%M!`iCP5A7)XSrX+gm77SaBwHclSKXAcC~Y3<5|H zwWgs7(f$GHiVK>kX?pXA4f4TD0zAG(xm!7sVn`gNkPY&5|M^8b*2Z9No*9{+d$GgL zEM!{oiv%!Lnslzo+zqq87|-uTF2wkv4Nh=5fkEfKJMdoT7}pcKnRFdrX`fYDJ|t1M zT3)WO>O-QGc+-Qtkwd;;c>`C;-;lqNZBuBYe8eC%{|tUMs!@;i#E8J zsX`eu0IOxPgnl7O9Q^M{&H4F?ZbuKm6wd$iW^tSLdV9g2!Gqtwo0 z&MoR~;!3i5y$y?K)&ckWHvu9GWFEx}s4@;CPbsPLmp1{`o-9XiL}+pzgO$t@-d@Vt z*kVZoad#F*S}tWy2zTYu>uglr?qlPqR+%{+5ZOcTBSA+uljgFXRYLdv*FGtHitM*4 z88RQs#adXn2CDw zPQ%}#9Ze`S(uI;)_6%Eh&;6U@W_ozr3PfQsVZ-i1F)_le(B8U`Ya1W}GTOvJ`4NMS zeFn8dvdmIuBnOw@OhfY|5E5XKZ4!zj%na9DG`$nko~P7#(@ZF-m+LEaOtN-8MF8A{ z!8ob?-T=i$7NQ2nz0iT9*yMt^;GYk9`-*! 
[... base85 binary patch data omitted ...]

literal 0
HcmV?d00001

diff --git a/images/model_quan_fig.png b/images/model_quan_fig.png
new file mode 100644
index 0000000000000000000000000000000000000000..ea6571509e4b8f1fa00ee8f9ffb0a6870b740d0f
GIT binary patch
literal 315888

[... base85 binary patch data omitted ...]
zyvE_{hR)-+DU1vZwFJx?b2=O~?yx%ETQSyQB0nrqD?ODKFgi2x2xm4d4d3`=ERi~y zc+FZ%Qu=y?=1BKQms_7m+R(nyZ5u}GG7%<$E7HNmXP~Flw{tH9E}npV<-`R4PR)z> z0077z;CFy4P0yP|VNWSC7}&u;V9B`Q6`LfPg7p@0`}I1|Mj$`c&P z_ZfZfJvTZaua&)vC+MXVH(<}@9EtMC_Frt3YC@L8>o;nNCt$$B2S8Pz0>u_YiwQ=5 z3yfHbQxGuZH&VY56UD5adh>&W|1iMhgdZ@#ph@WG(!q;?KTw#9ziAuF-uAdy*M!et5BN7_RjH9`ZIKm z^%-K3{05KN{F-8JQADGhaR&0hd&Es@_$#$^F!2-GjMJLxke4u)Zgp97v@;kUuHrPH z7VTPq6`ttNZslXU=VWW#^}S-CJnmVG(hhIBQ{8HqpJt8(|8#_ad-iQ}tGmla@)e8b zB;@=h3zqX~>@+O#X`$PP8{xEtQlD`mi;B?hXYtr7W#%)_j{+d1UNVcpxT%c*MoVlM zWR~?4LSpeDy}#llLRu1l%7Urek-Vt~!G0`^HZ7ulO?Fs{qSMq|XkyczGB$ zX{KxFgzJjU+~?|oIHzz_F$_CnK|bP%k-V;Dvap4(N85vc5_XIC0=C);7b|ZOT9&s_ z<@{$6(3aVGe-lx>z=6UD(wF!2HrLK4t)*$b5kEq2(wn5$OwhQPPEilc&!sDt*keCX zkdaUH>*l_E*1X9)M?a5gPr18Ca3qM$jTd~& zZ^(b|_M`ja+6=b=_u`9quLxOb@2F5XQ#@mkdP1o(pvPys3V#~H{V~^p&r0J3E7cP| zZ5$jG8j$AWESJG!nVJjZdr~C;WKRN2%>{EaPvB@Mj@Hv?X-ncfJ~Z+i+zc0jxiG3I zUvRf~GO-MB$0yh?(CXtgpld`W>51mpQ%j9AAaZE?u7l$P1Ma(QP@7N8srTIyqqu5< zjfjysWcdMENWSkH_Ji(uE0%d@I3q3`^*W7O2~_S$JW+BP!HgbPG_N1fZe+H=!+BDD z@S&pmulQxHRu8l8huH*YAK{b(^h;P3pT~$ih!Tc*OHFoui+b*R{aa93bbFjsyO_D< zyOrXPHDM3h*%Y?P?7St+$6Bk3T)UE-0A}H?(f0kR6$JxIoz*`t8QSsd>4}9Ke(6BOHtE!}01XAVe;Y^!dVvJaQchI@NDToy)YGKj8D-pIay7;RVOT7T15 zze*qK_Hsnpv7LqJ`Y(9(!kd0>Xq)3eB=s=XR{&BRmY-Knx<}*RDfeFV*>qW?B->Tjbx6GcJX-D zDGmkhmQ$4~5baJgmG1!7RA15!Ul(&EXv?Z-naD9^7jY`I%0APrPMk*Q*sXEeFxuUP zhAX)tjYB8q%^b64=2t(l{}mF++yQ`t!94d%f^R%heH8tk!FRxEuaP`iTKwI%>&`BH@aaHiLs^56u_O6Uuw>_ zl)vsI8tzT8v*d9Gp*dn&-(A+I035w+293|%L%P4MbYgF8+*J*s>$)gVlC|MX+ZW$G zc5hT>M~%$7yWG?Rbkd?H?>nc&=G}D3G{>6EAK7daYsw|HJ?F$;YhdYPN^HXP-(@ah z-6Lmi^USsI&B}zcRKqQ1^r)y7+2J+`_g-&KRnM3DSaaDtg$kKG$l^TLbi7JLn!?2( zbqgjhCUUZmtg>do=X64kAZin*)?hDpy>OWwal?%++EX)&Go@3iu(5wC*kUiMp)&yO zbhXE^cAfneo=jTqpV^mIqK8=$QtxG7`C=i^MXkqQR5aO=m(A)LyRZ^5D~)(1JNz>Q zKJjzXxpIisKDIa6ZjDB<1j>?n0e86_AF4}pnMnWfgZ139`HJx$#PW;lSLTM;+bS15 zzK*C1p_8z-ejSK~MdPsT7E;k<;2V_)hB0V_pD57E{%BjV8 z*87Jn;rpZ4nbTG5s_4bNA*>t63|{v3s{cfxm#Gt3G@h7IDjQL^d|Z|~xVGP-GUu-z z4~p-2rT-VP)A^i|osnXZEcYIUES9_c%{4?hLtbJU&z$IiQh>Ilc2?4-OI<3ELN=!) 
zy+kl`mjUk5GO!GoL!+|lVVuSsdoA`A`t<&}@FHNg0s*|T^w{cYi!ud_p#$(H{te^I zR0L?B->6lxt~tUKf-vdK{}p`9@CCrTy;6Eg`DIj);c4fRKq%ibY>q)go6Izb3A1vI zTxoxR?SKek+`A`WFN{~_MfBcBn~k%-YB)0Wm@Si0*HKdU6C^s7sx%x*4cki&?tx~$B|ICx>A8+8VvjWg-?LR=Na$T7zj) z#qoF%@-LbsFF)e2HzJ>V87BdeF$E6STj$AUBSZ`Cb9O(@dW;b2T@A zjX*PF*~r_MFaXt-ybg9pC@X9@5jArmbancObeE61RxLT>pS{AI z))_{Y>Ez1D?}!vTvh{AhY|vD1r@S>cR|+}yN2PU%^u!cAdl0p{_wN+@5YTs>;+O{m zKUSGNNEnek9#M5H)P6D7(6*uu-Mw;rdyQ^qK@_s;wo#<Znt1klm~)e7#z1;V9`yc6z!|ISa{S&S%Z19jkc#X06XC z#^{1EbUmv`SU()1oD{9Q%JO6_(Q<<&lEXtbeUR#d=j@w?*FQV`nwloGr{+`a(pg7v zZq^&O)A=7^lPB7Y6Cb!aaftMBw}-gk-TbD5JT|VP6L2=9lAl~<`Odv?VgR;SBrBIF zi)qK?qoc?P@hVRV$61DLiFp+V>AzJA>Et2@f6Sr(4L5Ru<9422kWK&WFig$&fGy%T#uOsRfr6GKljs#vHm}J0!Qx*usy!JBO$8=&G=4MYA z9KpRB(c9y)B{Zi4(8z7Y;ctyuDnc*|$fXjRMER_^g%3*tUAjf#$hux7R18?bo9R;; zQ!01@Xj7uS|K*(u&&qsFVUPg%(r@Ci!sS-vL8MDpCsMyljUuNH$OM0x+Tln{;=Xm< zDVP~f`)jmZFn>xNA;sexMisfeAk(H^G}X>%N*OZ1q>ywRy66G6X^0>R2%x;tX{f={RIt0AX6l~hg>(wi2V<}3YQDns) zz9E-eii)aP?hPcg2fo~dAl|9WCf`Z#AFGJmc>2|+Y&$z$#mYpig#4m4sq~WTw$OCR z26&NG*sZMsY;-QFTRFR2tuL2lKzPK^X|2w1!8v^be+Qz^b`NQ8z~&cH=p~a5KyH-& z!JZx64`^uNT7b&#R3B6y{{sJVu6?qc8Z%0E*v|~bf4X$Wg79@kZ`&sP6$R+k zd@w&BX}nyOEyQ-HHWpp zY_L(^;y_i`2L`&P)_Nye+h3$(1-T*DCOOAnev!HOX!l?5W6Y3Y9Wsxwa(&ma#t+#a zrKuiw))hNO45O0FGhgx{7CSQZyqV0$Y2 z6a)kFOR$^ha8XQ}oUX=Kv;Wb%{t1gOUAb}@)i0*=gOQyfkBNBo0fVV4`qO3dinGM%K33q}xUO%~|7iQx%=EEi5<`=k zwdS)`??DY-<7!yEpZk2tdm&ulqFE`tF1nbdlFi>8bPyE0)joQ@2QbyXVz~g>KcWMX z_#eegM_uxx*&Ga`70@ry4_aRI@f(?u^elBu#$Ad5e!X<}+dyXa+E*+|+#Ear`*EqT z+J7Q<3uqqC>lReLSa#dI^TroCPU4$${o2zZ!l8J3dCOv31anxntSBM9GUPgzaHyS? z{zkuKL;R5_OS?!;-B9_v!bK6M9#iKFmoMUT4GcvVq!q5KJ1Mc9DdT8pJ%>`fLVDwF z*GMNim85|5T!Z&W{fTCS^0jYoCpQpVM8uhL-5JyW(?H$mUVsG)^usI{`rSFdupl$GRZ*Sw#K%(kC&0ff zNUJU+JvUog0i*p-51P)AM1P|HhsyPBvxARfsx23ymxdc8=A1REY!U;eJWkUB#A& z&)hWbh=WSeG8zt2rsKHkNa7Nf`AI_^J@Mh5;h&d7`@T$ z5R2k#Ygg8^_9)Qr$DG`jG%DsePSjVLKAaNs`&zDJUr?Xmvn><`ieJXPyw7BPKgw)i zL*yNX2sbFQnjN4&?XsD|`8b755__fQk>|o4I>%QfJuTuG8!SQgl3Cw6WqNjLbSwsGvKEnL^1 zchB?S&ul|l7W4lO;%`z21O$-1&xmf!m`lV`r~(b;Xl}>~kiBnACmZBfwbYE&oYTA( z>!$PKL985ktJhsK#=JqbrqXwX`0Bh9g%3ln_Nwkm;_$DiM@0tQj1aNLQVcPI>-ZaR zF#5s^$kw+xgXvFI7IVq$ZXYuW=R{5^4LSl_@A!$U8&LG)17a455F#LBn(L&$x>eLq zR0t@pp0wF}LUKjUCZnP@DM?l;7>6chieEE&9T31jEo;P6ui1{|gFNS;Lnj||*4Y*0 zY)1w4njnM-D-=2ms-&np#QPbSg-IUZl)2hN7E0F+BSq3u4aO2N40p}lEJ&*G2BC4i z(e~F5T2_ckbUPVJ=7ApSo;||$i&3U2gtBMZGJ|PTYfJUI(_rHITIz5&X-wsiH^FGWE{00~1 zA1VbMn^1t=;xw&3->(S!#SV3fKOLD5M-;-vL!}i2JvEI&i? 
zyqC^~?oYWY-?;2micFwZHYY0But1+Yjcny!vpks-K!hY|ERr*xewA!K*`I~ZaXr9Y z^1~E<5(f#_jMlI&3yugL7LVX9O*x|%SK6WOg;FC~Uk+EYYvEbZl~^w_`ois|Au4ow z-Nf@;RMKYWgANh{b5q18WjQsF|E5`)3ye!$ac66rOu+k^GPksBUBc}(h0)becH$f} z3+^bC&Bd_RQO*iF;3@{pLH)FJV9(r0ZWZ)%%$@RD`B*ByS!Ml}rzoNL#H6y-Hn<#O z)ft-a+9}Hc=j1HK7~M_p&rq+S@$(zFR4vXw#@_Qj5(xa%PANaFY9;KHavyYED|v2cELXnH*rHjht#obudp${=OJ{xdg59C9?S2Yle-hxq ztQ)F|`cP2zEwZ|YmU?0tsqQY=KiPCF*LTw-%O8dN6-kMXodFAKPKh(juy z>YZbv&oy$<(Pk!pm;Sp_Q(p!GfR}%+S}SpoD?v|4{mP(TPG5WzPb0V=tRa-KRKp$T zBYiMV*(7=wumvav!~;+eLJIbl-;PnngBOp6b!OjXy!2$2nXI6;{bEmp0@k4n#I)z6 zl`K!bQCL$Tmd5n6`h-{+UbwcJU>sLmf8CICE8wUD6jPQ{W_9sb{jz(@_{JUNEe3!1 z$g*E{>=J-qkpBc|U!zW@$2Ln@xrI5}RKhOyhLm+Z%J>jHPiHYZ|Ggcj;vX$w)`^7# zWousvw5rOtrka4iu8e*#k-_CdP$b8*4{WFO>D5@~l6IvYkoyOE4cm6qWz~QE_+eYk zgJO7zm~y*XbSLuJE1j1;D_(v-KghP3P~+`&n{tA+FS!dgYHvA-?1w>J!|%LRAES2j zF-bpS9yfl`+K^EjbWzY%@(>$AHZ$!tnh)5 z>Y?Pp%9nG;FRzNT#lPswxoGyq2}$2>J=3`i5qId=bmGr&jx2pdh=rsHRMK|A{eI~x zvz3%krDm+tUxY&rmhjNv`p-z3Znbx^!qyTM;>-V_|CramcyeVhdQ=y^@lh^rO4G$cUZKKWH|Y&9(Jc!t!-aP6SbXv-)*cD(-Z zwV97hW(wMJc;$TmK+oT=tqX&B}>!b4-yKMJ9#&4~bikVS>l zkxq#D{rlab4GOpj&Z}KI?G&=vYxB)PP&enxM%IQ@-odP?@Lm)8eC@FdH4M3@zY0&E zYY^WgAk(-S##1pt<;k6%3Ypq!EON$^RXPyEQlm9JLpu&hgu*q$H3 z&N+Lh>%8p&2mFq%FM%5a#;uDYMWRvJJVl2M_yJsn{=fLP|FrG-f`|ECYiZe{tS=ZAmHF_E;2@Lxk4pr7T%8sAwq~0QwUa@gSOR+kM zR6`O|OgCRb54LTlYOpNkQUX;uB`=e4%Fik(X4|V}`*MP48I`t)VFHBA16yq!N(X6Nm9rgouUeSa5)oo7dcn7`^2X~qvcCwyYrma z3Qn>8;j|AnoW%}YkURe2>Ek7DtX<2uPW`K;-Ah0ZH2>^h1j5PA!eeZwZ9j_4M{Iq! zTg)W{=MzX(oS4OLJB5kYh@?s(i|&b7tmDUQ#Tt7<>!n^JlnTlA5P7(o8+Ll|>{tSL z&zIorCw{U`===f#!kz;AOuF$Wd0lNRdXDbLc%y?p)K}9L!eYnau!nQJrM+Uu{`E-PBiNGq+XsT;!_!4Aq@(ZpH z(G##&&%%nV9oHX)Y&?pMkDhj1*Eq(5$B3u_*L{SOxxpuKad|VGH#;4zgQ5ZS z)7jK-FR_=zdF>3kQX~wNZuPZ*+z;i1Rr{~$;KiAqf;#xPOjq=ko{{8hT-FDIi`V6k zo|T3@24pz%kKb!V7u&Iy$g{Bptw<@4#oU)7cij^j>#cA~rZ)7^4Vc@N+ZoT$$Vx|! z-K}X4a15Fz4TR0rWyo>%T%aHxT4E1pRh7bABH_tnmji-d+WStp5kEp=vRiGPu0F%` zEyr#_3Qa68$+H28U+VAMF7lr#PuU>@>kqmnWvrfZKIM|?vusquJ!qohbdFBW##wOT z`ciX4)lPK(FsW{6qo?9Vl=jT}?hjo!Ek>9N7Q}9$A%2bPJGOAA{K>&rUq_I-r{kTo zRCF8SsNCYHJkxj$KTJX#-`PFiq!hJ1U)A;(2lvBl4+zjR!nhsuXq-nd1l6B#x|DvKDAwEsMkF(NW@Yx88-;$(PS@f^?!%HG5jNk&(?4ivwR z+L}=7AnDGlHY{+X&yH89IJ7L^U-@-9dsIJLdlmE+#C7cP^3CmAT<1dXCUlaUhK0=# z7h4C_X%_`tp`~xK4SctHR;Gxv^YA4n!ij=urpVSz$#%N{d~{CQF&al0g0et%WmeDf z-w7pZ1`?^&Vh#nYdPPJCMr$=9Gh8iJk-HMzv6 zbzz{};qc3d$e21ey9XMa5}bH2{bMV(2}5+;_Dw#=XV#4qX{Yy7BS*TIL5}h#`5z*r ze)8f;XL)HzcWkMS)R%q@-=n5k04 znUf`4s(&V)C45%jnkcTP;`FrKQ}-vTm!9vU&!SKIwvPK=h&pw++t(ovx-B@}0{w@5 z$+h-B_={5&t1mVk1FZM#PYFe?o^oF|nXE<#M@qjJ0-DP8{=h*0u9Io306-srj54B5 z5zejIuhIC(QlnoAdPz|1*3Fw_LS&Uf8i@cNx?!;@buMd{y0Q${<}^(U=KIReba)q<87 z9G0&&wT)@HUE(@?GVo@o;6fR}Q#RU_Vb}70yOSk{dPDbNMsTF0`r!+kLbr9?5znQ%28*Ciiy(96T?Sgxz?0DNk zLmfSXav@H0Fo4A9d>j9VYQ$*8t^U4(b6UIPx>i%@X&N;8>C7nlSLwoAf!Dh7L8A4I z=W8<_eaG1vPyilo4ow!~dyz=!elNy{BCx6sS7u8CrIS|a0oYB?v$!5$2xFzW7MglY0OycoaBLzyC-4_RjHzF#=_{p4Vp*^ zkVcP0Zn6i;myaAE}K z-9pjg6qg3KA}Pg7u|k31ETA|=f_rg?BEhNPR!VV)0>O%Va488IC~~vTID7AP?>Tpj z_rv^@_Z>5H&d2`GZ-&aYsL;(ao_+XJN!I30B;*Ai!(o7Qt0|ZymGE{T(Q)KkemEGI! 
ze&r;Qk?*@cqHMlb7g@=&gM49#4wae21MT>j$*nwR>21!10YOupc#jhpZXfggf-5eGW~)H% z#AqFBT*D>m^Yx;78bFT3D7J8aue=rhPha4-jsEaYtI4}AdBV0Q1B|kWM?XYdJEpTd z-HMX>@FqM-#~pu~?{%Xna|`LL(+bv$$Gqge2n2-1`3%(Fug`k;P&mBaJUVDj{a;Jd z^HNx0PGSCGQCLY~>R38q{siWP=Y$ACpDMyBLMwt?sfXLV%xGqnu#6+yr)IZCe_}G{O6W*Gv#sRqsSS1elKp{S7Wty#0`;5c|fl{M#K)(K-!J6CnM3r=`rObK3@{ z&JB1F*xk6|9a{HId@Ai;RB%`$I`JH2*SeCyJkdC5vrG>?iY{RKHS~cSop=N_lY$_X zJrpq9Z#Xy@u5d^ys|MR-zg_n04?ODo zh)Y&mPBkC%O~rn(;fZ+v+cs6%LqQQnh!gZV5F({E2;UM>SB+h(jWiG`oOtV9>Ux=7 z%6%x*Ol%5tY%h-i1% z)U*^^w&7WJ8d$mj;XcUpo){jRcuujRIhb~c7LB7fQB#6@CUWbslz~5Ii8}_%;ZWrQ zTgi!u*mRb>GW+36h^zj3S`%1!PRI-`us@b0YEMo(_D5ZIzP^ipk3+tN75)>Cz-08((Jod->p+0kFbBh*&ioecf4%AP`Me*ii>h)}Xiw*jg zgh$B3)hvVN`m@tNMdP)e0`aWzNrh`;2Rnwyh`i|{wYp?|Qv4Q0pclOz3lV>qkZMAQ zqj3XGmxTGfWG)X*5U#j@`l1n^x1MC0ZM@HWm;)Mf_=}{0F8Y1f90#^%GiomX6j5t4 zU_o3mg44Y}KBKSzHI;8Bokm})&sN=X$k#a?qGFIDoJ9z+Yv>;GE2`j@AG%vbS4ng2 z4Ur6D?Hs#m>^}oW9__u+PZf(5p==l$657&v3I|~3poV>oQfuLDuq9eZH+wcmNB32{ z!jE?>i!W|Bi~?RHuMtVee}5Kh!wmUbJDA%fT~HZ2(WB!#o!F^ubZY~WZZ4=DKc(mD z&>?zTk2yau2a*7m$mP8^sC1ntWxgoVal*KC2*g=(4>h2?h$p&Aqv2YtwHIF?>i~-j z)QPxkl(P(;_m?+)Gi)@>Xo<-@J0sMbLHbSst+8gSlP;*FzYb6>PzT!Q_sVn3yveYfKCxAwI0qEd-x6=o&14p8!0ZvEcI1> z{#|^;V*={0pUo9=nmaHSsSfTHm1u4C6KSq+R}W>z|Lo)4OC(*01kxuR#Z@)sy~Wvq zJ+?PmefiI<`@cicLVV{?MXY*)mxRIuk^~O^Jw{y`$-`;$1dM`*+N_#|FKN;#`!qaa zx=IxfzNE4ihOjd?Qjt@uES2MzZ{W&>N(@Vd{V2$6qF}`*j>eJCx`-T{vAxRQdL~^e zgE@rL6{y_>RI)>NK9V(2%xemJFh5wg9u2;4eGU?)N{j-~v*3)vnwCe+LW7L=)!=6Cy`MiKx0b#=S^D*skx+QLjKVnxKW4GHaJT0B?*5ZDP zk-9J^^I_hK(AM$o4*fu9kp1oG20mitjxN;E%q8etHQ+*8c9y^Nyn7v4F!VuQ>iL?t z*PV{L00UFPKfKL@h z1Ao_bq!#XW*K-6Q+~;_21%j%he{t9%&rsp39|q?VYJf=vcqa9qvL$8AG_D|{Y-oGfMP^V5v zRp!&w?p|r;*TwqAt@34u^8O`!9Wo3E#9LtvsN1?&oclzTRRX&R`MXK?->}JNK|YI! zpOakN1wKXa^X*S8r^#!U`@{jUqTi{0mxdqU|GRcHibcOAPhicl0eXmB?i|8pA6vKneyQ##bXab zn18yZ!@6DKSKQ-&*tVtSjz0ZSYiOg4D4Mf9bnxJ@BH5(eu{`Av%yb83#gerc*?#@~ zOtAC3Oo+s@)aE5w`&4eJ!qp!1_k8A`z{KB~!@ry==t8%}CiXMF=KX(a(AdEZA#pj!lC>GfzxaNM|P#t2v(7gSm9x2IO@#OA+<^djcL?ZGX4mNmTfTzdV;KYdZMhH;5XwE0^Dsgcm^M2NRt!7WxeC zSP+IX3M95^noZrpHMQ8ySf;)=@ARi0;`RiO=kxqfJ&{DcTx( zv)$)+4K8~$JBgAlVL<{5P`h!W2tNvBuG zUDF$iBXFPoZS?wjMvG`Ism67Miy*RdAS+c|{Gz11aixO9VnjND>`>BtNqfopG=I%* z1@JIsA5?0L-I=C?j~!Gdf9+9yhG}r+rYOX7!^bX=nA z5YP57*BlPHaqnj}y=(YtS%KXRD56I^}qDh`If zS{qK!b;y(@(NX^o~QoHNLW{hGw2S=og5m4I?TqR|vIjD zBoQhes@W|bD&B~@M9UeQ0;UO??wBw@SG)xD`j9PgA*{qMzbB64ggPW~L~vPf!>K~3 zZ?)GY+F)=RIR=s=r7pb*%1no8CTSqWM5MsSGeV{px|HMz1(W8pSF+My$C*@vzdCF( zC8lN%aSvz6!7U1JS_muS`QT%ZaOLJ@yRtZVtfQ)Nu2y9FGw)pp=WT=qITF;5*n6Ad zE*~&EIH)$qcH&{q{W`wtvqY|NYarHqc56c5L{b8?*WUR1T>2egTdl2tkL_!Lyc=64C5aSPXE<`IY&$UoW*Od2Y&xT{c6+a0C4CzG`A#Um4* zqJ3B0Yim9%TcOZ;$yBiQb3kRWPUiCR??h&<_Ey|j%0e5sgk4%9tB{DUtY9aTJaw`Vbt@NQz5KZu~GMoCWMh~S$G_2QtbN6glZm$pY>RRKF zx`Paw=xQ%F0;=~xxJ9*Amgci7WGDZ|6|6=c+KbG)om=%bREukZEu&E>Ow-FJW#8zqinefVqVa z7ZEVGI>T?{Q)~6bB#>#kPAPS;^DF3&r(V3>2@0-b*mzDN^#L(|y%`DcV7YmE-+1${F&hYD+Gc*`}Kz2#4t{mH1%NAy)ARr?9Y&UbTt&0j&E*rKR_G zf6(e@+>IuRbRSJ&$YTUYuVh`{`~L5U?X%WMS+|I92{ce7*QbwT%<7Qyf{MwkNi`kT zw4f6ER!(ku+=ccs;-s+gFZYe?8s|s5PRl>-toil^3Nn$k2X7F{OZIo#?h)Iql7Afr z`Rj8^aitirc-2gOJNZo$!+qGYhG>WnzKY&mru&!(w;H8)rDShduwKTWWl%m6299+Y z$R31u%m*DoU)RcB#BsVCJ09UK4d)N(aCe9Lbc=Vy`F6KpoCJ zOBhkl{PgFk^3!NKcb-5K9`<4&Md3yaOm$j~qPyDeh?R?V6`;>wnx$lSk~J-B?bfnA z_DTO(7xQ>nNFx;RR`ecn>vJ#N8ObtKv#D_A!V%?L7{zkb;*E_GIY_d!bMAOhGiY_jB|5+< zI_3F}ri~*aVJN-w!-kD5=KESA_#i}``-8icU?h{NiGk?OJk*cI=lmrXlIVZIN;RA`7upuVA&w#t`AeGRmlP`0AwH{Vr3JS- z2u)q&W`CSA>)16%lq36m1=;YEFF+e5)6@9%ijbui4n+-f(M<_JGL6hl^=JU@hnizSON{4L;MURZ(V;{x#6z zwh{>Bx~;5wz_ZcluBaQf8^Z>2C-8a3Hls`Fg*C6|_M%C_VmsJ@mkE!q&6LAs7xsQV 
zavkUUZL9NLLiW=5A7#S8C*%)6d7`zb;Nqt+`9DYNWVS1PC);VrZzyLIg{oebhkPN& zm7fo<&bg5%JGrjr(S3ExR2Hz-o<;i3Ssq{(en_Idj*jiEerKQFC++8X&PP{%&%j@n}vA$U4SU!syF%-_< zkgOYhJ0Uhi-GZ13N`@K4SQj@F-_|oN=Nk((xG<$%a3%(Jar*K!D1Laaqu#9M^cgjOp3@_JGG*$s#;2`dH>_JkPm)x!0w zM?s~VS7)l_j}8b|v~}3Umf5#LM}5V}vJT248pWAT5^QzJ@#tvP^Wj7JjAbuP(zX|1sECNc;pP{& zT(4SKfn{fe?xmGr@xv&^;^%2f8RbE{%u+3DQRCFN^<38#xW_~9;y+gG_sd2`IjxR> zNsH)=%h$)s8G;s=19QmQwNB7lp5D8~(simN=VYb@XEueL`7`6@^D>2B&*0;Mz|w)M z;*kRRGF!Qk-A#Suar>`=wIeWH$H?E| z-5-?d;-{Y@^9oz<(FPAWh|;!JaG|dXG>6xqw-9tT?N+(%?2nwvP^UhP6UE_JCBeFA z^`EV?PmRbyyj_;*s%HQw6ehU;LFeVw?Dni1rPxOIdrt1P_bJs4h9$7NpupzL!ZlO} zX#3&QaV6D4&C{ZnPwY39kjN~g{afcMOsZzERt}2#XyPn{JRtN;{->8Ly~dx%Ie7{F$S?9y|wI6lUd6$VtA-EQ!BdjOJMP$H^l?zO{9aKf5JZ zUmz#fR90RK^9=Lxy~-3h#!=+F?0OBo=}G#7z!x)6SnPdc@%tCv0y<& zC~DF9g%^)|jI1E5hwHJvQ8F^Uo9eIcx|~>lxS!10k{(u4SED}9>Ix$*7ziD5U-E4IAR zuY&!uD1JM!J%#ArMrm3t=NsQZCi)n?v6~yfYbRK&=SIKBrk5e0KdtUlLNiv=?_uxr zi}2vKLufSzLA|oe&_>LAW9go$a^`E?*e3KT?eM2MA9?%&C|zgbb=)*ZI&1}$n#W(= z$`|RR4Bx=gQ9cvMk$O^A@5<7%qxFwBzS{Qkgv%4t#H*Wfdc|tr`4D& z<=5l2a5?738!#jvCm2a)a4)=;7ThvaBj;$cb3ta`npQXfYGN*82$Ekpoo=X3bk#C+ zQgz~~7n-7yHfhlIc79vvdg)E6`mpuFxpy%%Ar5mH&9TBpmoLOfE$tFCb`f=Iq4}vT zREst%_LF!)I~(nXNSm?e+J~(0?v~nF5w1~8w2G3H>hLe8Qv+XSTo$LNz^@-GUac#= zJ4BOAuJOs@oSz;0xZ9X+a(;mknnR)q(3LP0C&Ld)&mm*=Tkz9p?~ z214C5@FvmXZk=Y$fv|wUeBZR|WmvS&O-Ajex&a#Npnw&ER)V1u$L9h$>TJ~46CO!0 z%KRbL5H3k>6P4Om2rXUX?&t^-^(SEtSe=OHOLqjV8sd2Y?iI$bkzP52#rAP5cZe?V17fOR(0vlXPc0jKgESc;*jWsW)G*IDU z_CzpY$GS#zs@D)nnxoQm`xgwU#{|vw^to!N=rYS^w+qkY&=6E{T4Z7V{d{>U2e-@K z8cH!u0B(d%>hd@9OlSa`mDhaB#H6Qa6A3Oiw=b7mSvt^2F?pmi_g*r#npcr4DEILf zttYu^=dmn-8>XEIYKS2xAT^w1T4;6tsf-F0N!z5zTbZn|_tvC7&P>5uQ38SAjXD=~ zzMfSx*w?XmMo16oYf~kUtG+jS_Aoe*t2-AijIxLXV5W(wErE|Q_&l=5T4$iZhR#w=Xq+(nz7ifRsp#F;d zK0ixbC8;(tBWO5LNH*2f!Z%#xZr^@oUFx3WsOvF$192-2Av%8Rk<&))^@Ph0?c+0K ztm^M6VI|3f;Z??|@0tn9dlR+x@j8xX9%YW0?~6tr_ZnkljG~64h9!*&J^c+A6yyp^ z+B*=gR3w>u4u{fmOEy;FUYsojtRi~J-iUpsLkA64Y6jXB9aZYj@Y|EfWv+qVQ; zjgVmyrxT&b?x=~b@Ov6K8xVb&0^3stiFcFQPksb}Kd|K5`GWA|2||91lTt$Lrs=;~ z|FB`-!9MMIRDS9O^tvuHoPvt7;nRBos35s`BJoRa+w!aYf(Q&Z4o!a$5}nkD`=*uB)r$M&~q zB2C*tmfXZ=oRPoP7B6gU@%99tqQJQhX>*MBw|i^LPV7TmWnU(!Om1P_*!f!fgX8r@ z!-nxfXKB{X76Io2z|I(0fzUs2xb9E7=HN-E?kHvE!gVKPR;2*ST1W0;ri(N<>7>1~ za8O}WVo(AeZW`db9;IV{Cw%d_T>SdsDRktq^5oT*`wFjtwMjS>iCOh*C?h&#$gXO0 zgGXMPKJ&WXx;6SH$93#s3u;PqW>n>9nT*3SuLgto9nu~)&iBbKJkMfoxG(0GZw%U8 zf;r`SLT)ORQZ4(;5rt^VZ?gt7x~*a7r|GxNIOjuZNS)59Nzxm3l(P$|f)mGK5HP{w z{{CrzdBvLF6+p+!LDocKe^A;%w!mz#x4zz4*5siM+nq$@)8B+g0QT&W75A)&@C2-5 zEAN!lb$$w3@{AhYk}Dhq#-KY4oJX@!`t0U)5%sgr=(@71;2IZFhp(E+lwFK6CTun2 z`Fy#d-$@2`n4Ane4tl-s4zrl-$7zk~3~7m8Eqw9Ib@5-%Y@g2WsJFA|x2S;%{*8ug zJvVswzsf8b2|{tgaS|Z3pVH1b6uCZegP)TRdw0u#uJFGkd@VdIW-acjTo<_s(@RC) zbyJG3Nnlp=Wd1`nt4G+L`j@)y5IUtxbQ2Z))oHpSUB`KikdmO+vTJXbR3PY4w?wzV zTp%sRxjxiJ&BFVwu9h&%$!~hR%O+)=kfZxQCoDIU=H9E`!z`3aGaO8*XH-yBU)R^J zkHt_Njp4*K_sRR)>oaVYf@Hcvs+Kx!m8qeGLCY2^hdzT_E^A^pkYo_)g&{AFqX`|N0^NOHAR^5sfN^ zs-j-^DqC}mm4=E+KP6x0CyozR={V0K<{7MTGx`MQRaI5X2-%9K;$qlhKB2FLk^ch= z2nz7I%4?`t^*p&7KRa4kS;%>BCodN8dsOCRJehf$Bk-%eDM5ndLV7n;er|>VnL@-z z{beh(zNmCtxP-fBN|3H&t31-1E?xU{5e2Bl|9YB%$$eMd&rZz55Pt- zH>LWWB!INqQKB(`(OdJ+D*Cb^S!|{GjUo2vUo!5Fb3q_K2bT0Y&)somyhTR8_1ZS7 z#I{%Vvq34K6%^Bn#C`hUqI4Ro<`KBz>Q2u1Ju zvm!s%8jn$LrZ-X&VC}JU7h!5P7=e`r&Kp+v`o3p2=*Q*=ubN~FVhPi9T=Ugb<}lw8 zxU1XqAl`rLv3XlkBX_ngbCBm}1|lkn&aF^MO&}w>*UWrn@rR}i*IQCw_w^wv@6iW> zRi&?MpTvh*sA|v}rMxQ{rIobcO}7cIWE~U9vw}%d$FxS(DOSd9Qt4a2q!`bkC(*v0 zb#38|PYNd0q#T6WDm>U&vHttf)yCOVo?hGNdXeP*BY ze=BiSeTYR;+&SDhqfDR*8H|U6)6imzm-22yYHHPq@~qZsBN0kdXt>)6{Wq`useT3` 
zv2yFmkGv$Oy9yp-I(6_#=(go8rsqIayi=>;Plb7w?AweAp?cvudDG(fer(>`+2`aT;kj*_f2z$D`2B76ycl)SssPhg+egn$eLUGQpg;coD=F*aU?`&DilA4ExLx@74VuN) zZZMW4eZcd+3lq=RT6sXET;WB=qx(n*w}S;^ybdpm?e@0_5jJndA7x|fu$EeKQu!f! zV&_(@mRH~Mj_+74mi5gC0viJXr$2IrMIJp{FoOfuPIIxXSD6@3aIaa3_p|S#eekK| zMb${X(L4c8zuoroyXkk=Om-jwIMuVv3(9HeQw`P*yXFyEJ$gD!3iAK)C`v^>C)}nc z)TYko3DZ?8QV}i7;|-Q9X?sqg!$?}KWx}YMF3hu-oFqzJB59MWoDvf@l5R7k{JB5p zTrxR{^6|DHp@~@DX8dWqM`7)A%qVwUyb(eDX5_MsyPEu|Zikgh*sBR@xAYuYlOTSf zbZmvqGsYaA=@)q03fHU(vh4iG7tnC0)4|CiWxOP@eb#V~SAwLQ?kJ{5qUmuEHWq8% zEmo>vK5pU^JU*LZRNr1gebJE^h#p@VBGbZ+{(D-7DxBRobZSQpu1WPz6J?359fXF6 zXB*s01x(~TwPe5Mw%FoFv6Bf~_F#9k5nVDD;%p0-@qv$7*9PQy(gF^0e0}C+`sow5 zpkfz^{8Qy5dNsAGCK`(~Pu%(6ew7)YkwE6yG_)3Jy}IHlzx!Z$`Z)d&CgBp=m=eps z$mFuL_>H&F@13{h344-wB*#Uv z7}`0W?0UVe*~$p`Dq`ZOR>;#fjMphJFb}o+;xrOQJl+^cj5iEQR{DbKKaZ}UpF?WX z(*> zN$117`3%^$Ojcn5z?p6|&+$a!yCq{Yz7T+*{9j<8wV3<71c^Pe$qIPyQ{pYa?z9BA zeBjHQEG3Y|$j$n>@;(t|ro5Sa;(h7GNs5MadXj9i>qwgG;~{wMeff4)<35XrU{g2~ z+>Uq-AR$)XjgN0FbTe672z}W;1tZ;&%eLqt&+L$bi#oAacEog3SQAaLG2*$f?m0vH z0I}!$(V&F=FyyP1-po4T3k`b5-mOp{wSv8h-Ax`AB)vu|sAaO?T9G82r+}tGTdZ9t zo>$}p2U*T4H}S|i7Wcy+mA8oa3=;YWJ?KnMJC)u=7ST_$Og%)eRV4$5^HjDTNd-LUB}&cywqKz<57Nc?sF2?R(HI>>1rk#qh8+Po$^)mRuN z!khCjh*m^(+W#8<;qmleN9DiIO!R;}4P%6n`Z*(y--iud#wWSh{Xv4HW%azZx^Ift zpHoDgW`*za{E~bT?fJBy-ByivFm%+6Q8d}r?|EoU{b; zBWbWC5sbe7NpwbTJv*MQ?Q2Hr4IW%QeA=HW@`T!w`eck-(O#b3_suU>$7%_y4Pisy zZnpDu{PnIm$Yv9Z3$l7(i*|x1M6##0&vT0frLxzRLbZRWrHm@I&vRf9+N@$a_3A+! zk{HvujiRj_$8uU`3w(oh|0+SKfnh4N7RM#G<$3NI z^KEBak;9CO+}rGAJSabfr%QTCmrEP4ICl;h&pjAy+uW=pJCdf8pHkjL^ryWmNu5Aa zKUh=TEy11tM7A@xZ$mB~j{lS6u}iXKn@nDc)^|zIiAOwC#|tCbpA&EWxS2bbfRjqA zFXDUzxc@WRWgM1wK#jCNg;2Q$F>5#Z>_VaNIzm(MltQskObYYW`Dc{k;6NSi+;}|6 zUd`lHxrM4lT8FS&u$fp#f5?Xpz?I$|$4w|8Ef6g@TJ>aboj4g8oD>>FMH*C$)Z@4TPR!Z+~U!4n5x z(7nFY)})~W(RmJm21BM}()sQdHkAH&;w+(=~;=9CNWg>sj=U8zJ`{v)21r4!x3N)5EGudw3 zm1?m;UD8ZqQ*Jl|0TY7s8lto8#J4+wDxQ(*_CmwPvSdz zeFo}r@fkID>R$$c9%m{{rBeOdM2EhjDD2!(crh33diVX)?fl@gbBhY?`~!M?FhYeT zTb?A!7*I3eTI{OTDZRJnDsov~qoI)Mdp@>rWR3q^A)5@mw}fOgQ34qg6tJ@#S9Fr4|SvP69H0Sd;%SY0Nicz zOyuK%uo4)B6-F|O?G@*r)D1G5uRQv>+xg93 zOyS3F-mog#qk_qV6+KpKXwz{^Jd;P7_bjf>ekNM=JRr6PF>Z6XB(ll`-&Yaq1?7y; z?n-r%)m8+FT$UZD7@ZCS7?jl)g64_=(0HC`1*B9{#xWO{uxjR)A;+D?Hxp{Pjw2EC)z zjHu%jM^ehgTCA(Boq@}#7L*xJBi=$dnGK4tVjNL|ppuPla!S=7$Lk5pbC7Pc!vK@X zM)F%5g=;Q}t?b0G6!irJyG|Z5?vRID&3lUheOpILxU+9B4qUpKgH$x}W%#|6s4%LS zS)`*60XulN*%xb`+l!GT9rf%&;~HQ2yNF^K6EZmzlX@URFDh5h5C5i{x>tF?%uTuA zm&kEKbADV2>1T#T7HjnbDj$Fzn7Y3u?Fy^9_UxO)LHzrqa3{u5k*Upf=`>~LZPvhp z=&NDLQ+YHUOvinVd$nx+502;UC?d6OO4|_)i8*!LjuP0mGaLPo)g~jmTWIcQFVdw# zi&yU`R9-XudUf5?=||($tA|~*R>0)g=~p{TXKigDNHG#q8(w10mUYj?3y;v z?Hsy#ctoDHM!HIKil=dqvu1X>4a>dbnpPW@sFyrjQt|!05dzuj!fQr#hg=P!og;Hz z2FGNb)li;9L-~BZ=n&$Ir_n!N(fng||36^_f?SduTBp+?I3Eke=W}cI_{?^ zy1K%Gcn9dt^~a3y;cT52>BM69g`Gd9wPpMQ4L8BMkxImPcTP7A0JNcfRwAAj&2ndc z(*yVmjCkI9p0-cd@P;Nz1mKnvnn^#)xgSzgX9 zfdD>RdOc@O0YQk9jN6mi^nv5ENrNaNY(~=41lcKTNQgV>Bjq9S@aJOV)@_P6rRu)K zUcxO;NYzA!Dk=e+mK>l~pgb$dfuM19AvA+;ItMZjiUZhyQL4W83h$KR zZW2ELyWZdoJL4yz_={s?5;cN_B2QjnvMpP-9MEOa_ZSZj>}+_S z>!Tw{{QBA@f~XCpo4iJQI_#R++156lRn;)`FD*c+x-kl28WP{^vgE%%*^$g59#yhB z;bbYU?mO~69VjMnu0tJ^^%CppZ6yeOsmFPK>KT?4j&JFf=UU~}tmdR! 
zrebbOjJ!*q(VQw=2Z0l(@(H-PA|!Rsmm;{_b4u40{ajxL@wSTn3M+{FS1^QrODS7O z9ibY-dFXzYM|K@<6dj<3JZkq`f1k`cCKNM@UIbhjnv&8P9IqEm?*i@%(_^}G6zeNO z3x^2roZ|YFV=2ae>u`_^j zhFSKlj?{v{N)OC_dEn3#V)&!mkTGwEex8zV+t7RK_I<*z$YTriTG9a<(-OA!m8E?fuC3#zE;+7GU zjLTrF(IJKXR}V8a2@RC7rQd6PFvPxc8~XO@CLQNS+#M$~h+YV95S0+QAfAdCFl zn~RGWbBsfFZG%sPW+?D%BPAe}Rr+>Y>&dDsuLcLgAjnnnXh<5!b7*rta9ytPYWUR7 zku?dspmwLQ9cPVuYyOXoiQt2@`LT%Moo7qxJYDmH*^K^4as4d3Ywpidv zG^8s*`e%QNpoedb@N0I3q3|cSepkHgew1E1uEzfZ4c&2`(t%Z%8#iakwbA-xxxD6k*R-e_{oPA0t` zZ@LfLo(u#N%zQ_pFMr^#e4+jyFYkXItD#5uh0ZJz@A4Sr{2MDQ?Naf}L`Phy|L}a) z)5Od8{rMHxBV7H*a2j{O$a^lq?8YrcC722>Shq!tx4(~JB$)byr`g0YqyT{1Kt}C`t!VsY`nrFWX$kp@W)StGMk0n}xblUH4*)8@l%G!RW*MR z-m;uVW@9Zr*n$;KFG-HqTSGv_<@@z^15apEq+p#GZ_9Hm=X_Ig1Y5?YC&Hg?g_KF! zcPa3d8sLdoVmKyVTWmPwtG&~EyDnTKQa~DTd0smGSo5<}{D;j<>`dXJdxzb|DecKdE6a6duHl?&e^2@8T18LC2F1Pxe0gOU#JAv+ zmvNU2JIQJi;3KO{)-XFrlr@zpl-mGZ`wICPeO6wKF!Q6p&-ED?T|=W_jtcw_c!k6j zy2PJL{U_td!-?5ZTU-)$J8+vea0$^5D3rP4+k$hR^;!X9Z#qg7x%)y z(#679JcZYPF{TQMBaS87oLTsJC;^q>W{hD(I3V& zje5?^DPc{(<*XZ%%T{2|sL4@i2D;WUqkgz-f^-w|EB*c3y5%_H5BlXaCC=zi*iylO zGsL^Nfmi|?Wj15jJyP}eUI1nbd%M@eZowg8@_6CohVP3NMwEgx(%_gJitxu-@_VyW2Y+DGNmTWRb1_09Umz!vkeFqAX9iku zoSs4RYps{jbGOx`zKDDB6fR4;*g&tT7wxvg$^yKPdzp57R&B}HA7HDNHbZt{pIFCs z!ss0braq3Z2QT-@MAiXdeiom^4&rW8-j68W3qQ3nct&~}30<@NIn|SN*4j1HdANTsp#K%V?K`>@<;r19lu(D|RAzUD19@diq;WEy#$6$lclb zNJnVSDa{ss*M7P9GZ3{A{X3jY91N@k1@)T8a079=1OBArjuVf0oFP$50R=`?8j zsdszl7e$A6%)HPRd`pY`(fcDok97{vk|K`|J~q}1#6N>J&;BF}%K0vyFdvi84sE~B zxx}zvm0ILGhWa$G1^w|mp)sAEBE36x6uT|xi30uvc9vq&9pqg_K0HB_R|>BZt!9Q@ z-?U!!!_?-=IFlQSnAQBlACWwDd6)6=KUvuSc5f67(T-0RjUX=Ve_4u8Dp)*4Il_v_ zg!+#{vwE{)PlmE~d75%i?8G1Z(|3YwWeEehEV8}8qVchmJ;for@uIn?zWyg^!*rxv z+>%4sqB+i+qd2&8nmC2GAYQ@$%7OjL57`O+5N-c&D1> z7a8hQ%9LV+-S`n5B6ok?ECLGW4_*)T#}Jo@5ErzR3u!^bC^tTeTEA9%1L&e4^8@6r z^C)by$@?{nvc-vhB+gDD6@5X)HKfRVD&kQG(PiHBSo^Hcoh0ViO>a#<)nz;gi;^{c zoKB$z*y{@GHEd<;1itHnwpr$Z{FlXmPb_9A_T7v-UQj)>B+8~R!|CUHBTjqCsu!eB z9lXOgC|NjDEEefSO#&0V+}yb3h@&0C5lvoxQI##EC@Pm;E2k-U6!Xc4;40L|PgFY3Y!Z2I)o#36+*oB&4MU1*BU*Iz&K1L6Hy;=~lWy zI;24w{^heu&vr5@lOLx4}O6iWR-^hVqy~ zkZan?J?ClPt!)0}&11vVMsHfmB2qHDFQM5=4^|g8e3d_U9W(P=MQwFes(eIMTa9`N zZt&dl1uc@v2*xq=hQ{c#ls%-&NkViu?{s(^c&7Ld^Q~@*Mdy48JyfO2lsS6CO@T#n zQX9u7U%_BCblHmX2034!9LmgNO!LnZu{l<|t&+DqStp+A4qdc56H4C0g}Ts%a!>B* zhn={qH(#RaOkO6VmE7>lSXQ;(Xmhi4l)Kb}xBPe(gpeQf9%LVkulw4qyz;#29dbLcgvP>geL0erG%0*p2^m< zd*hSKh_#qATo&N$nc}R!7KK$Ob-R6#Jn*4bkYVpxEpo${^U*U|@AzdzFQHzxgLEyo!?JVZVIMpH$%M9K)3^x(lrdkiQEqGBkOj zi{qo+jVIC3N=}1hA5yg{9%S%#f~g<$a@8;iRt0Z!uG5X0)5=;I(b1 z2|5-t54K|BaeQKl9c=QfRlu9uZlaO&28m1rBM%1cxYfyT`&YQz) zvA%EKNo+_|FmjMoL@>y9M<@LdAEsto_>3wxgt72SF^scmx{~ff@U}9$?VA6&esZl> znGzz{U8#v`q2;wrkv!!D4)i@~N7wLDgy_O@Or)D*4DO@%ZwYfrnXGqc7v=e@fmSzD zGoY@g4Fn2!>hua*kDNiRC12sv@)34v zX(Uw=w!bq6B&mQzY>zH`597H$E`;o7$r(G4!yOx8Jo3$9T#BXp=c=wo9l5RY{+e+SAm+ZdL?DV?$)G9i<2B|-dyfy;_VD;fYTVxtgXum3RSZ=dg*0< ze4#9C&KmoOe5=CN?48d#I#oxUx4iB-cCTxP{e#m%kG8rOmW0A`=6Ssq0Swje2Qk# z9m>dHz&GO2QMKA9T3xZDhj!OvDkzj$2I(Tm=HomhvQi?wSwPei*UQKHV&$+@g( z=gKulrkpN4cm@q;Br|d;s(4Q!zq`9eY@g#ZqC>9x`rk<_Q#y{h9*?uT7*z{4X4q_YvfHFQ!WiVKgga?xy*1{`{p@!$rg~d+1h+Bx*qnZA+orcFFvdP z%^~ghiLlwRZAYez?%2(OnOy9%q>GwK{ryhcioR?TZ>^Yp z`hajJiRo3z1=9`VhDW~Bc&|{FM|)?uf8C0|)tgZ9T3T574v z7URO`KR@!{@^P2ZIPV*=YI91|DLT8;cMxwU3O>w2-bt~#`}y#4otu7>40@OZAY78; zK@)WD(pk1Y(yo68dcTt0zSQt>lB&^I|7BD8;B!g|^9-?Oubq(Y#j6>lnJOj!%QiX1 z5RcK2(eGtUQsa&O$>#m}`~T&&_*wKO)xyV|^osvNPygEl{FiTi0J{^@rMvyl&+||E z=Z~K-QNxBJ!p)NY7ijbwBmJ*0#KVES_mOtsI8Ai_+l>GBZ_m@gdLVs{X8J#l9}Pb| zO+WnE%758C|7G%|Qy(KZU-3A6`v2qjC6Ks#F!8@u{*U|Pzpp0I3a|d%TK=)@fB(uG2{CM~lrz<)o4!Zvm8)#sA 
zTNq{U02-`Nh+-&{3A$JYo#{J{KyX)JRO^Shiq;yx+ zwWWxSivlnRAa`CK>V-rryRCs9`)qr{93@mNNZZ~J_=fzUk>t%QgiV_o`&2@%*!adZYhTuPqmfB7QzVLh6 zPmVN~;#K7a1I0fN_sQ8$mXC0tV3AB5?{#`Hm=erv@%HORO*dXRYCb+XsOWvI?bhjs zL$$)795=DpUr2?)it`T};GHWs0csV8O4n@+(?qVz#xi9ajH)q?gfS8yR8p>CkII7# z*JPtDUi4_Id+cyAoJL9{qtZ}(LFr#jk}s`l#lv((+JN3o_Uc2s&Vd4(@ha=KI4;tV zt){`QmrA1)&LiJQt&dimtL+P`_ATyn=V=5=PE?;96+Y9g``PI7TZ)A*@JWOm?nOq7smW2y%!_FgHDV0cIAc{a zeV7UrE{E`gY5m?}PtFQdINvy(*HZ(KYC&N5es_W-;-F%soV9O6y_emix6?zjUoE;E z1r|eaC(P@R)+n<-RpOlUn9E$5*U7Mw-EB{Um!G$Vn7r5(%oFG(N&oM@)!pZ7}9bt|gnmgRR zRiG5h9vZVg+mV!T>nPLCE<@wg)IyryuxPTVm0gK2_ePS?3e*d!jA%DvUOT?um8p<^ zkLjJ*!S~|2O7_!-`(?{=8NAP9UM!8Em6J6~)=PMoDn<9AXgx1LDA0YP_))#!Tw)Y% zP3X-iMir;;Teb@?W6NI`Rt;6cT>_z+3`@PQ=X`5#g(WqmCvMTJX-xm&@An9@x&2&W zUS0_w?@mz+$6suDOMel?P3(BrI89pd1|IsyoiR7-ol$4!!pa|a@NP!jJKc8KWyq={ zMM#-~Rpn!j{d&tKcar%BO$huq2FZsf(g*rbMP;SUPyhTcE1LZR8PS0HopA-UlK5b8 zG0$(hoMDk^Bj=72b@5)u(EA6%CGV}^T**&E6UMoM>f`;VCl#wzWP8c-29e^Kn5S>Z z#N=ImK1Peq$Wm-(^(sn~(Z(B{57uLf4}_QBMLigLm(x$0#AWPfe7*!~1-cF_0(-Z7 zLVLG`oW9Rr+nqS~VIeC4g=(oP7=qR1mKfGnCb^ctzb*UpjmO6=o(IdZ6_3T++WCgr zCNgyJnnKSQXo^A8?g04K>)76ry`+WV z-_2C-i!*+T3nS0)Xh*&e7LP*9?F9C?WKAUR7kL>?MEoeuvqL3TOTf8PRa{6?5uAyx zlM8~!f1SW*_2NNy)^#RWJ{r1>Q%B|f5k0yxjndD7f8?Q*iNqkv&T}1WTX@(agq2k% zVW<}5{CqG)buwSIJBi6}`032fd`1|9?-U%SD%-q^8xWX3O>WLu#GiPOf-p0@5BlgZ zzE*i0)V=?;3sbq#*S0Mfd~Ho>t4UgI;DJ)M60bbW_~#DBV_M3$nd?khy&-nw&}hg! ze@i0lERlf+a>$mCQJKwX`FpqN2ss1~F$Z>p0Aph^h{Ls03TIl)6uqa=V0?R_yHO`3 zf{0WwRP@IHiw>o!?kTtolzB{)J=dNPi`!}83ddb@vWh`>YyNAk74+i|NZ*hVeEq%o zpI}f!6bZV5J!I;(bRP-umMyz@X$_@D2>bz^+w$@=FV_1N)RGN90wqt3}A|>zx=K z+LOUNhoLb<>K>XaIIkX@p0i)@XM^UGzzSS;sL+Bt>{>*0543NQhcFL1(Br+6es3Xb z15I8|&HcxBNyqN%{Pg&fqXULdF5vIffL;Ap53`5Iq$4Hz+-&xBd@3S)Q31BzXYBT< zGZYYPFAwt*nu7s2uQQogBh9SK+{G+?$v0dLBiXi#$?>*gl0G>K2JO$orRsqmy?Am% zQ{k1yUS8Wp-n(^Z{Emw~nQV!`R4DB1jcE{tGA22wtE-Vx8RjwmVfg+w@~Vz#hM6Dk z)(2){@o+p{8mn}R6ywi(r_HAQZ2cjPN!=9#=I+3jo6HV`Y6}-zIjEBN>aJ@i!|56H zSg$7zphny-kKdoe{*N^i^)4TjW4<0N@dTv|<a1!$vD~=zBV~6yi>y<8pyQsONe- zH@yNO>-H#Io|JaA3G>N&Ks^kfc~Tz$O!~kkq%(I7Z1T!xrkiavA31iqNl0ym=BY$+*-JfNbOjy@?(_23K~c z!f>wO*aQG%P+5U7*k#Cu*im^MzSVhmFG8otB%n4W|6cPFLxSbxl?;zB5~wakB|{dh zL6kzp(mSk$A1C*`AkmXH$5FWzX{^I-zdZcG*`M6*nRBb%b`PlmYP|8@3b2PG);!7J z{&)&Sl=XQYe*5UKpk9d8<5A(XOn%Ni?q-1id+wDa$G+PEyV3DZ$AHTo>3mt2p|VO5 z%DV8@gG1-qtEt&eRHdCL^J*#C*IN09`sIJ;XR?qLkGvQ$jMEM{Gu47UvGV3SL&ycY z+7ku4mu$Y`SoGz^51S=ejUey`^A@;5^TIfrs=cg?LPP6UNltc%DjR2|_}le^tGi-1 zQfSEJ#6rR}@~UNZ_hYvjF=i0A($lGJ-NzRG{)Vo4;<`o!i{rRJ+$+hE)i<@+1ycUl z)`z?6=(d(#ZiA+=W#x{oPV>=CuYcLsUoIZbXkyUNk3OvJFZnzTn>Nm^?hWtmcaN}F ze8(p<;M&5c)5oGv)JVe+jU|YYW%n$bXR=dZ>=#hm54Z0SWaMH*_cpof1ek7lHN22Ng9<8i~aspp|x?Y7-n)!2c;r;@>ipl9Hm4Y}y zdoy__hOoh6bL~Xi22!H;7QHQT*bBUXi0*L*+m@4t4DAIZ8{ww-v z_k-mRX{9iX>mn-7Z|{9g4}_>jQ`P49FsnRk1e&ee*9Q`@b_HxF%e;RHz^Cx;-o^~k zT%!GrysbV1ucN@~!x;|Jr8s1$86}T&>DO~MZF_};aCLB1!b6y52G6PIY`4`*ZqIs2 zuX)LLCW|UXj;o)HuM_9q@*wGAG7&QyEqBx(zrWeekJEee&DcDdu5CTh*opf-(eWJk z0uX6R3#b@w?uBN(P* z9C$PNVcb*cnGhHp0%Ha;F@2{&(||c!_Vli1M^E3az#OUd*jKmg1&^=VRd#5~MLcjV zs|)J#2Wyjc7h)fx`Tac6m%=<@@9kz~CEw zw$q=jmJxWn*sT7dX3RD{7!$>{%FPa8(tHypgK)NZz zUMELVudB{?U+EQno%LpD1Rfyzo$K}Tt=m>}br;EJ_m<9xi*T(kPx}= z?Tf?!fBsj@QGy zN_jd2Xt{R!-xx{R@V~bDFv?ewon8Jcw|D@FNc9vpSulAQUG0Gbx+B^96#}nLG?p!D z9Gnu@kRiU{Wz#7Etm5Y!29y|qBO2B6OJR@UftjiK4zcQtN{;W z19h27lc2lTqSu8>^7i)TQjUIkpHr-U7LSPs$vwC+9aW;g*u0aj5Z1Sk*J|i$bC^q9 zX^;S&%+{#M>sV@w90w*%BX&kAit|CtZ}jp`)-G1@BG2>)R8hQ=ym( zHtN2-C;j?v?H5QmKN(5CSSF{|>EGbGNB+TS`OeNOv7=@{wv2B{}ND~Fq%&LoXT z8?7AH070>zh-rPn6f#MH^$CC;gMvHoI6tba1GtMINC~L`MM@=y)h^QF%UFFoME*_3 
z>b<6XPo_fD2v}mtAo1_uievXPQ1qf>LnRV~3+Se=5&`%H7Ez8f5^Lwp&Yn*c6K%y6Y{hg{4rYKW0@j zXqNV7L8L=yDesGMSJ3_!TEZ`aD$PM}lgE>shVCo8KC_!*>elXmu?N}{;wb?ybEq$9 zKwM;#XwP65EN{@U8uvY5Kdy;Nw*S=2{^H(al+H}iVs#x_Cs>u(`ct}QkGJ+hxN-!i zPR2Uh?d(z+_*>OGnS!P*(R5$!XWIg>6;0+9 zcLW9>K#+Rp&6hp6C*?wY4utgez%FQew=scn9@iwdYobYe6mp3-_g3jcBc_&~oSRZc zA+`E8;h(;@%3+IPF*hqG08+d4{0h|aAh7LX-?t&K%cvyou zjlesEVekAekX-~BjMrk$D@ofXW0oD@`20Qt6UoahFk=bUp2>ZA=sm0T$MuFlKsJ)_u z>(A9Sm4AF9_@nwkb0ocKTn{>rw*IO>Pdd{2jh=*(S@p)5n;7q$|v_G zp5XXxIGr5rG~&k;=n-Yukh9)A!E~r6(tg_FYrwYxkBQw}y3xW~jHi}k`xR)HX;Z{$fwN z27u(r@X<1KFrE34VgpQd)0$@|&>$L#u8TQoyP|GCwoo?_#%`H0E+Uks4y3UZ*} z$<4eXeEu*poFue{Zt>#Do0i)k6*X=Eum?^yhke5bw34y$mGo zliXa6h&280EL!5f42w5DE>mm1O)T4O{98MnWg+}y0@K=PVStt7teMLVYkY1^4&uiw zYx9(W3gPS9kQd``>A%C#Inp2F)%k>?H`kFgN~Eoew}m|#)lhxBUHIjB9$S>C$NrY; zE)!0P$P&zTPX--yYr`yWWv^iSak1WV?vexBpdP7pQ#ehUNrl70q+vueujRlONFZ<$ z20OJ~EVCUAU`(D56_|YXuI6^h5ztB4z5&DfQg}0eB_>pj(FvyjVC)!Wx0N2nfwN)P z*x53MJzNo9J)U7bTD}6W?YC&Aexj)!RCn0Vx>+ZqTQm?u#YWcw#S-t3vU))$30V)g zKOgL+r#mD0?P2*B}*L?{qTQ}ausy#pkn34)*-|x&aJT=DjQQMmGy9{XRG!XU)L2w9NPHy%9bpd zRrFfxw&Z@KZZq>Aso+B@??*hrC&X2B+F*w#A5C>Pv^BfA+Gf|q4a2amJ#F^ss^v_&nB#qlEqrx{h<&q&oMAQFD zh03{w`BkYDwqPCC0#!e7CESEpb62FiF}`z|caZM@(q~l2*L?jPaa1D@yGFkQty}=I z_%vz=9GsdsV&q0jr0+-|=%CctUpZ3hE8cex$8)}4!#W?@9TqR@Rsz^X9iQqNM%iNS z>&Qd(QNy;rO1B-2u&Bd@Sfj&LXOam7yR%AE?)m*~W8~p3$+?VbNx_Xbfcoidb!;jf z#}tQ4mst|3Y}FZ*9u>p3pnoarB^2dts*XJRPues(_!ICWflkY*Gixbcgz7=O2ioS;`>I8gaYHR|( zjegTAaH-A!4X*EIEZwCH!%@Km#*vv&q4{^+WHz)R>Qcp_%}LcMvornj+HeO!^2uu@ z5*;E4YL9sQz{JnA7DUk>##~vMuNsuOgOgM61Xt=zA?&2Q(dtl(;VvUWDOj6@I{f(33BLsi2%&8dkKy?q8tqKIzFx;`JaDWof2nN0FS`BIiR0ljg z>~@j9zETZ{av_~A`7H|$L7%N|(8Vi~H7vECxT4Wr{+cZfQl$Fb>YLaeBrMc7#26Gw zI~q}J_r0{5iS-|)>6wY3iltJ2hL|{ldknH~k5 zXp+#JfHo_GwlsY;{M~rbF}V1)G#(1bgf6GVviL=0j2(2DiveaBLzxN${9eb0nT9p= ztx=S-E}K(fVt#Z8+MT94RqkQV!o=o;M~{|8jU9@%vjmFlEw4_+f@2kLi4uY(6JFu_(o~NnMre}u@_z27c0T7$2-EfdC01E2-12{Ys zy4`|W!!^`Ap$|q%ZJBOyjfbh;{W}}Mgyww?N90Sro(J1aA0Wb{vZxh0gCO=TI^ZPa zv=%di6sLknX>*_a6-8!J{C%-7Fi9P6eRver?AKm!n+9Tf`nGLOSh((*-Z}qix4=>7_)pR?UR}$a6naF$7L4?UQ>r@k=Efl=(8NoE~gUH!Ext zII|9A%5zzBrr#;Cyv<}nZu}k*BolHZq0V5GLuQ=bWsmXU5l)Bbj_!%ojec9K?Oj(! 
z3z^zd^iWFCAp{Gne-uHf(*n5Zs`Zv4I&r;7OE&k-*|^6xJ=qaDRqih8U+9&dn_Ts5 z%en$N7Y>Cg>%Z)NF2h+u+q{W8qy;1=&gsV~>LmamUGKGmDu} z#x`~@=`Mr$kMt?Z9TS7Fo=z-y6%g?A=|)8_#MQx(237oO7(Glf5BL^Lx(mX|UfCpG z-}l8PtEUarN$ZXg`Q@O0uZH(7vR2-$Z^a_68i!*Ef99-bc_@o2O#C16CpZxrhpWI@#Y3 z#zg(g=s%jYCdN0-jtx?Ggg6Y{N-ZJx#VtIJd)fRB^Qt?*sf#Yq20(1ldp;%soK~)m z*U|RPfR1LdI21`cqY0IdBp$|qg$(4@pG|5lx-*OZbLviLIo)^HzBF!%m_{Hc&nzO6 z){PkUSMI!fcI9$`etMAu(~M9o0dHUErKrY&XK`n5*FD)QbJvUhc1Jhf_fDMmC~eVT z6<=I+x#N@yu^eLNP`bbTsj>NV!l+t7E+Kt=1iPU6$1e&8C~a0nLGT zj)%rwW|s=@mI<&9TNO1k+@|jo&jNO>z-?#cHt9kQ%}V6(ZBr)yPkDX>4mnlnn0}aJ zg<$?A3U4VT1D*T4@NPES-?ku|5;^d%li#B};u&1MYRdZ}h(?$1HSPzS@i9aRedJ~b zliHwP-Kyf-U}BZ73{L(118!Hf`M{`A6l9cy8e0EcTbQhA*xJI{q&gC zLTS_?puSg0!w>sq;+$Q2!;qTshQoJz7cHi#B3%c?vT1(3M9A0+ZgH1U%2B$Q2(2r0 zrY|z&Vn<#67tBO0L4C}KAHY;!E%0N&STE(8M^*dw4LET*K$u5kMqztz!I$ zeVTX%kGGkOsN<0H7{lpsZ=-HJAneXJzd~D2&=n-aC3cor7<3>Z;b|A>E|fbhUuqQ( zE9`)`#py4x()j>jP>4h&fvAr`wrT9GL%h&V8SN~%4RQV z&*M-k#ArbA{L`!Tbx+ zh@=A$_tpb%jLyG~x1DMzL&$KZ-wMI?Qp?k91fOcqf9BJfnX6pU%V|ReKFj16;A3eo z9$>PUeuAi;C7Mmi5z(Y{1st;q$(&XBLRQ*CHP*wKB77zJhtDb149Wo3vMMoma1C6r ze%W}Y>V9|D?N)Wql{GO^0*tK4v2$C>aGV4rU*7w-% z&Ait}5DXb83gGho>H%2R?V)C9wR5XMg(xtv1hNE;ly#=Uq@=uO$;lr}$5?%Sik{}KRf2jh z4y?LJbVz>5P>blv;pSMbOPc@LFz_SD?uj4RoSTZ9DD!?0JCuyB^CMXx+J8`C zsPFhor#K^?n!9CWNEFj=d9q_pmh6zg`(SpPCk0@gbj(*OS0H*nhLLmjYzpt=nkU-& zjlVYRS3@yZ3+%JRXLA_e1?tfGI0l=H*Mg8$SWO2_#v?95;TB;SlXd*`eAz#V>~H9K zdKG+j-cQ7i7F1S1U+|z0eM5rMxN72JGNff}6tK1fd2uv|W`CXfm^%-o(dwy3Y&AgdDPK*Er^H2rbZo zMO!*wNxOPGf}>PY-{jMYOBL|wX^lT&ZS9wQ$mS||LUyCc)t7)l(WY$XVk65_Wt6CX zrrOfXr$wbkXQVuKDOhVDFT(IVf^wbVM&f+fik^8b522u382CbA!l;;w+WB{gHdN;D zc(W0S$1a(#JioRsA9AiD70QGf>*CuYy&j9|9F}>wI$U9!DPL@O9EE~PkP{F0yV4sW zF5RGSoR>0JXPwmowN5sb`+Zl)iZOh~-}-!m>p_n6Bl^{J`maJ)4{l4LP9{oqkyo@? z=BKXoC#iEk;(G8eEx=_}A*4J{VqLro#`&ef4Cn1zE@Lw$Kf3#6SVt|a_Xoy;VvbY} zhP|Ng7d4X#LnM1O$mXikynf%%6j_zopz$8#cbhC#IwaEv+5X-HukRs`?ygUbf*_6j zP8k~s`=}`E_BsTZMyViDSWR_qIRNPxsExGAKR~3mv+8vsOt|%w;D-E$F*RwTh*?Yt znRO`*vu+ky@D_xfLO(Tq5ImP5iU%DRy6A7d8|chH$x0G(w5vhy)j(qIMwn^WkH9I0 zr?^Oa;Ax`OR>2Nsys8$GHU+lLP{CZJxc&|VAyVH$fmIGq5&7F5rV8O_vv`mU*ubqk z)<8tb=c4Reob2~kBjP)RGI12Rwj9b=+pokf>Z--ZZ0)Afmg4I|dat{sWsU~pJA%?t zkS3=s%|!D~klYE;DX}D#uK{n0CGFc>ugcLfq2lM7#uD+%w*R)C8W`R+Pf-24L?<4f z8v$B?Y$I&RwnK34sSh`5M7yqpy!cZ&2+(7=VhPly-|52wAdAV;)i?xkfx>}t1yWwu z6F5YJ8K^xJt`hGB{#GDEJAPOPxwF<)duAUXzvkbv0d(rj$D%RFzm0gKQqBWc{3=`* zD}U(SeFE5xw!kXXb6={H4?{PsHnt<2DoH&!TJ@sZvV-6*g-qDVvPrDPc(yH`-6AGd z^2U#X+rQl2>?D4>nYQc0n7O|+hPhY0IG8&k&?4ukq*1okf}{*$*;zzE@Q zgl&|92b45e#S106H1linvv9L9JV+Li&Ohvt!`2yk-amLTXUW$<(|H+njB8i}HaRq|pbA@y?k39i&nix1bOQd#_j9V42iJ#<2qj zqNpaK!*r9qqifJBO2xW7X7tvz1K1*)a+?KEnl%zPA0T+p$=Yqtp<|o>u3dZQ#em0 z^uE+o!o%u29@+xp&@Y`Q_m2;Sk3$&<)#h|7@7||r+0dKeBgUOC2I#ruusjZQEP4(IV z1=T^*X;}PgyK7uKyEQ6NE9j&wv^O2K*+R}bDiboeNBtc#ckN0}oOX~@LmVNt2Z;yn zj)ZC`&N;JAz&kE-R?&6%66=$+P*QQffTVt+)HrSr-oWMyr{`6fS-qt#tiMEjs#r*l zK{W5ex#2h%G`5&Q=FLj}J0h!<=1;juVY%wsqb4U<(#Bg`1#qK+%eX-RD+adNEa%-I zyXy&j+YGuHC0ZdER9h8CKZYxb_z}q@zCn|yI^*D_`7gP(YN@RN zMWsp56Qn>Sx|s~gF_!>`?~{Nw9JF<9eG%b&8>{UKM1T_7XBNJsNlBLIecE}~eZ&4C zn)OTBI7k@Q)L49ae3n_822ux9*LCN~-}^KJO|f|WBe!kKC-Q=t{3lXAmqm8fLrb>kU@~IJ`hcb}mW|J4a51#2~mM#G2*UuUmFdowD9|xQJO^ioX@?EwTn_>ooBdl}(! 
z%)%y3xI6GPyHDmELcFP|65U~x%u~d6^wSyvYund6ZdBK*a#*~k@`?lRwn$c-Zdh#A zF6ps*Q$@5F;pFwJqr!9${&gNz4D@Faf%Xk(gXK(5>LIXY*Wvm*+50P*hB$z#$UjAS z@+`)D7ulNnxhTzh4<-_^_dOdjeC=L#rD-(i-e3=gLoH3Zcs-7jhMN^++m){#q_%&D zq<ziS}Kw*+jps_%rci@P?YzlceX#8)%06Cg&@?lw z*D@n%eT(IoXr(lWvh-89XYo3JBYki2bzwOuOWSA)t%gfNue5RED{t3h>%7d;q4({* z6Gvy(i(746Uj`)EIL`*{6$V!>c@v*V+7>Q6^7oTE8Uv-Cc@HJ`YrL_y-@qfg>Xa{o zv^cZ|lVI#=?fjK)Z8>Gn;=D)EUrnH#+33g=+B{?&faYfMA}#X>B1IY#5VLh?4V|s9 z%aoDr%|1T@0z*Gu-OA11b{1^j9C5}k384ICPOoG4)umCoP7#~%+wp#`Cel z+XKd77p!7yOE`DNy{eseYt73;7ppS^Yl-)Bu(szn9acsJHG6#ay0z2Bzn`d1f~J${ z*`efCNW7tu-86zZp9$a?7vwHuxc=hNYfWNW#>Lq*J5Q$uG2XN`sC0nPMF_r8HGsP%~jCuO5N7FpSTK z=iGed_gP(53;Ub=9Jtvj*gQ81CWe8BFBf)9mXo5+wqx-Ciiz+)+yz_k+0-z1QzxVaB_4CSy z?r8WLBHeSNcZ04JMP_{Z?cRRB@MXk(|ANvUwJa|ynZ-HpD%I5hl zxo7R{zC{&=`s|W)TR~3M8Zx2!7jLOPInOmPRssU9vBW+^U?y4tiG@lT{?Z!xFNLUv zJ7*ovv=P5mZ2%_}rxcN^M-KiK6j0*7KQ~4?RnY_Kb6*CegY6M|-L8wpgGI&%1HjF+ zAcV#)qzhjKSHYBhpC+Y#Q;x=(U1w`Kb@0uf%C#ZD_A_cNC#R_nrxrf5-!O<0D4sM~ z#vbb1RBnHaxjyD~;$h?X$#5+B(PTiX_T+IjqTT^=!;{w@)uf_7Dc7a2!je)d(e3D~ z@$Mz%I)1ktTO5ytpilQzK=^gur$Qo9V>!LgUFbgOmMOGH#P+!*D`x1{%LC(XlS8f9uQAYHB_eSLXp4 z{*RiooF9XD&bn`Xm2+z_%gc`_ zEVzy^4Is4#>R(kEc4M>why=e@bmO*tp_O-}V z$zMX(RG?(4{vOT~-r#i-&M5v8vIpZ;l}mt@nm9S3O2SeQb-VNN)BSI$E0_!6VirA2 zHmx&=(lwA9b@4pTw5mz?9j37Oe*%Lrhmgr|*lK+mGKsp*!bq#gOY3d7Qft!YF8vss zOFsp)>{n1>FPp}pnaj7TK!4hMa0a1_50FE&t;wi* z9cdh6PL{%@-`soeElbR8ms76eCo|7P?rrztgjqKyH#GC^lB%L;Y}AE4T?fJ@^R!S< z6g9yibX>9VI!G{f7lU*%Kd8}czMY%3y|qhQTKYuHrp^ct^;(ySb=Y~@0cNER7m?d< zGIJM|4qZjIY}r^Y{~@Q3k)d`ed)v|?le};@t|7uY=DkQ%6G;+L2c*f-R+~r!!p^w! z))tM-%yYI(HJJ_%=(ZU9xs=tlGhkaSWk%g-VF>*EJ9tG#i`JnsoBxt#y}j5w7Tf1; zb)AZV%)g6oKE0|B+i!imSTb?k=-wrBM}4nEN!?;!OqaKsNL+qXtoOBW+j@y!+XUmK zc|5DN==OCYDcy_dXp(E@_+zlYy~WGRs+JUrIdgVJ-=4n?MWVn7O@f}t#t zz>N|ht3#ByV*pZ&;EF&5JeV-dgms;E=suwQnMN{&efI?Xd zr=!kX1d=qg6Qxj1rVo=7+ z3a5X128yS1C{981W3(OtBnlx1w3s~*lRH4^+=aaZQQ0uc0b$c%!6e#Fw_sJkwT_`LgXbvOgX*h!X(sd{? z3xeK)6;S)imh((P#-U5gK%S@3+S3Y8q0i4 ztpGxl!oW?gtjvB+DEdWS>JG$?6`*_b^`Wq>!$TB6(wwNffu^H@M}6y6s}s+IzPxoe z&DIB4^hb`v#pWu`5GZT57Uin`p6k;WAUrDSRYQ=!Ir6psI4b4+*R7E=IhsQf?w&^d z;zCddtt2Pc`DZpG_sUtVn%XlQ+S-NsqKMk2ZauqZdS)3*P?IsItfxXMVg({~MY61D z9JNnQ-v_uNrlA7~t@asZ!0$2H7XDFac)EGKy9k*hogQOIU{$+p^-XKJK#5!tZ`Jw)zj)`WK)ACRmc)@W! 
zr~tS>^^&ItW-8tdDg*S&Iq&-tf1^>R1fSdJR5g3cXbp7CCBqhu1=-mTA>HVx9?zLL z0@XZ`hYo+5pVQNL2WfDg>COl1r06opfzgjHF$hCW!K6_s?bM<`8GvIkx^7f0cf=6L zXXXs9xdBa--z=};^v9;(K;|1TJl8p}0`b}ptR8)@BS-!ReQc|{5Y1O^q>&)1y3O01 z5gG(pcsT)4k|Ax8eGdt5xlbwZ&+y%F^3aS(`gw7w^y)fKd+2dFcu90 z5md)Z-22idm}bW3Jm$xu-dxRAo+f3M?7!3VrvvzD$5~>;87R)jPh~X0xo*2aDKY?5 zPNDjy4h4Q}Ri;o+ddulj79hGV6!2!apB@2dzQW=Ss7WYPvJA*LVnvOwGjGbt6;~El zI6vi~&*ZIaXK(!Z=YLsU4FklkFv--8a{|KHNI>ydy(|1$8ZR3Ppu|oW{&d&>PV@hl z7v7(7zXZDaeq&<(b21UHRTq$92hMncX;kd==cY!gr&d){`vxKBT!5M8g_6AI1m26X z;>#b)d$U`n7*l^Nze-zLlzn^+!z%GOfTA})m?i9=DWYRU7d|m5g?}HVcM&zGaRU}q zL)A&^w`DVbOT)y=PDhU0Jw_;|8nQ_i5F$n1dtXumpUVzj^zK0hvV*@cY4hN^unlA_ z7eBdPM$!yELq!0B5U)#x4Y69~A58?hBmFY$-QGCjBTyGY|$0coa0y@pM?R2GyXc}@z@2rLdtVRo7 z8w+Y?y^y2c9nK$Lf6Piq-?*IeU7Gq2Y!{+)dCVk%r$IW4Jad9zsd}Q=P z&NV&XGe~IEkN>wX>&WWKY6wm-qTJxoxAP0oufZhc0y3`nnX~Za6OK3ddV?{T=P7k4 z+<4BSnY(LyyTBXX7Fi1sP;!H?g)7!h5Ht(thP5vF{rPz{gcExzoLW8Eqjvd!z%q>(%$+51nYrU*joSpI=E}Zhw_T z%AQ)I2Gy^TAT7OgHeJY`yEw>fH|#}Pd;ep#qjAY29X7r4cN*EMH6=o-eom|}%?NUn z@W~iy{$+ohz1YM`<3g|0+Lu3UvecK~Aa3|rN2&WijO`spoVP0|{jBaRz2ruYB>FNz zctQE=W9AEe-5&Tzp6GF2&imB?`#Z0w@R90i43BNn^mR_Kw{5t!3A|m)_FKDcel0%Y ziJR_tcg?UW^}5P6`NRyH^nH{m@~myhTV|z@`eQ;6`w_lPM7oq%vk&xvQPDPcf?O-{~u#-9Tw&KeGe-oA&Q6rf}&EAk^+JZ3J8*llrV!z zmna<)BPJ!%C>gI`MR!UFC^-y>wDh|lWzG@L_xHKpKaLk3o$-0@+ror}k+oiaWaIlP4Ou%kr+@S7d^O^&Q@AfJz6b;mMPF~OQoAeP zRe294;N}0Gp>I@lFAk_b9Ne2Ui0l+pAIES{+?dUw3X)GS^5TLv--BbjuOIRh$xW@uw?;JvIhBX6CO`VtdQ9H zEal|yzrMQrUZDz8u76?n|9+U?Q;nTCz^Wmz;$`Ia@Xw<}g5xR$o>6!_XWb@nZQ0?# z|9GuCT&Q>qo!7qZyMKPcmgo8B4>ZiN!6w{_QNCIx!{hTal{GRZP^G_@6+%sXM(kxW z$~I;B0DI#9`1LPi<7w{7N2&;n$>6n9)% zdn$&&Z8Ce|Y+gF(TDq!o=Lg~@-vFgN^H#$yJ zeP}Z2gc|F8Rfg-~jyrPa7gG#>kCL%3N`ZX5K1%pJ0x~WU3^%wz3K2}bkNaR8RLiTN zsK|$y7Y!qIhr4G>aGI<65IcfEE~_mIeR`&tkoD$U+`%Av1lLLa(s5=dv-m4r{hyGg z7In|ZZLD%m+RP9nEo{a29ot-9H6)OC1wlNhi#S|K%v2o@aDN9$&jqjsNCCF89au`# zfS@K|AURav(!;%Lmi>Sgr5L{TY$>)kOVi1YeIW~YWC~PlT5^oeyMBAbX$*?_dt+3o zras>M!|(6#cl}Uq+40ZVArsE(LgE>yYVO?*JmKRYJAZ=++b<<4&>Ar(-(H$&0d27a zAtP_7L*F(Gs!`l78u?r&NlIq6)%H=0jK&}RJz7HRNE}t+ z(aB<1Q;8OD78h~(ag`CWN@ftOH6#|S!H%tZDr#lz%7lwJVe#?do?H;{>DY^Mscqz3 zN|c-acx`-Y*gC?#zkQ5BXthL*T6K_txY+Z&8xp0=n^;|ek{~~|J=y}aP`OYG8=QqI zF4H0w@rw|#?d#RcS6qNG%FxU6Uy)*ocH;2R8wTJ+k*ACn4k>rextpT7CvPe;*i5MK3|8*ePSG#sxbne#=!2;&VwgWPeD4X7yIWy$gET*aAz7QN|P3`89uJsl5zbf@%TZx{8yH zRopRWGmC)Uk&oPJLQg!3J{!eLf3lB$_XBo@8d%)U6?tQoDLh+_0O&68Q(8>$_RpJx zCUxerZkU~tt%eZu3`3cFQBnmHNIfLr?tar>ZilKO@q_RgGD!_vsOZ^&CP9fiuNUus zY`LPY>dA4xM+N2rh&^59PJ`RPUhpf}-WEvQdOum?7k`l@lIhK#=W!BBhamCLHRhX7 z_kdc)6Dcs8JHdxf7gpRD7Vvsv=(&_q#r#4%o}?t1hEPH_SDBzke1c!!h%!YUJ)yDq z1~5`18{a2VjVp&ibQa$4wIO|F&y2kW>cgzRcPaD4Y4qkPgB_rRC!e~1=*G$Ld`CMx zy(tWQ!7Avfb3NDGqNH4N5l&NZM8qgB_r}jk8Dl7eX056;HnD%Xo&waM%ZNxmowAa6 zQ(?<#q^3Xa6*8?msB;=nlcS_Nc7=i}um%u6W5CPpfH+oEW7A)ThMFKh0C@H+%IEJN znba@3nXo6roXkLvDRZ$5VKpFLWx1deZ`KQ|3^>FI9J<`ln*Lshz{Xkxm~<6uXtA$` z-m=HaY^#6vt#~@URe8%6jyES;J^T#vufWwwTQB?4SGtLERCwSbAqiFPwGR}75?@bG z;f~vM=ihQPcx(uY3VAV(p%PX`?EWgRC(QvtX!Qvsd;8^X$(WcCn8kg}c`A&>JBq#dmCR+c>l znU0+QW8nZ%y&^4W5K0w1*Ny%&Z~4k?5)pkun+7;Xqhme%qDSQbSy!lECnBn)FndL@$L!+BnI840xqIuYZF5y~-M zS0E|oi0YEgmaWbxr)g(CPom!3Y6 z8vU(KY@z)tTqmiB50;}rk4g>S_Q8PZ-f^K)`K`CBViIzm0Xsv8WGVyAWmtyuKO)oT zvdg^wRAFuCiQ_&yc85cyqo>aIHi7ywyl(u!Jq7K<0|F$Qfq}0&vMwM^bxM(|Ft%Z&$fij~1K5~N{%#7;N z^P$Jj-$h9uP*O8BH;)xe!aQUxjb0^G4S2d50FN7HOitNvKL%m7lXEgO7sn|=P!ZGPrnO%EFbZADc8_$ zH`?};9S!2{=MltF`+WlLq53HymUI0Rb+cA(MjBKx6P2epfwC7pNJFJ*Q?|DwqTjDsHI zpl;Cut`+bi0WWeIZhQvA5sAtCR)6+KA)*VGp@yB)C1S(~nvNOEC~R89WYjlXw9CcV zC2S2iQBoJ4hY#zlAZ1&8dwK-3$6WK16{+kp*ZE`JrFng}PwdWni 
ze8=o^mzTifd)AB_hX1{2Ce~-olxEuwwmos!P0_0*m)c)e)xe^EbG32Vt ztb29sgH%Ld-ag;lqzzUk5%VCRg$gUYjrj!ix)aM$AizT|mf&&Hn{TrQtz}j==bcVv zdp4>F^d>xCD>~9ND|OeMz^>(qeMc6WXE_aM3v2@Iv~Rne!4A{cQv^?6v~0Qrdf;%4 zy!qG;3HR8((;Y9hcoJq$gm#fgPGrcsxy7lz!kBJ^=O5mZH*l$Km#O54ZY;YZw>tnD$dqLt1@I zMG-3KhTKohXJYDgBl1*55PD)(BGi4IhMzq7LPCZ&S)5XyBHl^AUtEUV*UkEUad23f zb+zvOxVJZ*3~O&E2)=PwBK}3zF%m3o+pkib6LK?3K^Y|Lw(vPK#pd!{IN>ApDheCK zI}_z;^fCD{P>m}wo$=G|sOyYcgd*d!aCXyr$(Vkox&?%snKul9YQeDMWVe=ssk9Yr zt31K_LQ#Rfg0hQ=vfh|Zf5-2EET`ps4(wI+L~}MTXzU^$!G^31)YULx*0gDt(}LSF z^jgCY*j8GN6g!ZOlb3pogLy>u!914J0SMLHZ2w9Z_st))w@Ul>Q_;}8F(&L*eY@q& zu~(_tPKzb%RRjF0t%GrN!llGNQ*Hbfgm#P}H_Yh_!JKIb?rH{)`$b@+(X%wcI>2!C z_Qw_9y4OOb@v7iGddS0Zh8D3GA7(8sytOoj%tE7yQ% z-LzKObQBEpgbq85=e{YyrSsbpUi*KCTwn;S^pu~EC)Xfc_^cGO`LDB#X3^KuQ&pmb zT7cQDANw5#_AkgOB}o(4DO6DMaS95A$KY(wdQlCT&@c!Sf&oGO_>zBI@NL%K*V~yl zp&e{Fm32=+z3%<@-6ykG1PHgXClQiM?%pW&&yK;zk5^5uz$W)UAW=7Lm#NzMjsW3D z@6%6WTh5nXhfI(*_Hrc$km9+P?UF?iufEUdoa^@tzz5O}ob_rEVS`jsOZ%>o#yA1s z62|s|R>2O4Qod(SA(ns{TCW1Uj|RORvQd1W5cFz!n-|08D%OZ5N%5WiHk~EyHfMgE zYYOV&1^ZYa)*n0X)55Kj#R~-RLY7m`B8uu}ckRn=86AE+pEW_0#qE_L|0J0aXI!quMC#?W}`)3f}*1R2gYBdB*>sQ0@bcH1akMb3*(}VP;0g9BmGzgB_V#$F zp_j~jGl7h!QcJGCf!~_?(h9JF&2fTRq#eL*^s9>!uG}~DVy!*?6XZN94v&~F-JlxU zEjOwihv(Zm%HCcic&2B%P%BMuhAP}ClIgkPUPvqrDq|SIJLZ6g!t+Kt@J8W>nx6gs zYcdbW;ZSn8IgDjCjVBPtvW^5Bb5R}5t)0Oo8Pbac6(zk2vJApNC@#NoD`-69 zC==V*&KBY9Y~|W0#?D5A^mwp)*(|>TwGVIJL2(j6`{wd46JIk#g0}r`T)c01r6}EJ= zp3>BgG!M7NWe8Xg+6Ds})+4Mq1rcc+b@e3y7|%c5r2S@s35XnKhN3X z;SvJ^*hsP?615iFQXGU_E#MOndgFa5$q!sfA3!7$aPOiYZFpu%HQwDPFR1|*S%_4ajhnZ;$dk$yjX)96d(O0PT zmxZ{7$PZp54lKW(2#cZ&rIzE7VwwgOOEn0;crm9&AhZreIPAK8TDElu_zleK=x2a! z+a12r!pH^RROP8iwe~7Pd6+z4YPLLwN5B^Seanw& zX&?H9S+>7qN0&G}0so*XZIy^;Yb*U5H)`{~!(e=xcM7IBE_S2$^A+B4CrGD<0YwTZ zv3}*6XxJ&3U3qGA=NkulciM4C+S}%yV4yBmI{~#69)T`@*4NqRA0N)WHK;H+ykE3s zMFUN;2oLY?PS{=VDEe3gFfmoB+{Fg`jy zsArHL7PmZ4-~>(T(`mi8o*v0eEQdbGx@P7 zigCOOC)lspk>l}oHzZZ(f&4V~luvCtc<*w9xti!k)SwAlUZu zoVLV3(aoT;h4IBAVplfaY2pgd*hG!9>DWwR1F96LT~!FM=anH|OIQcQ@N12lh+?Bu z4xoM`rfs%=U1roSVzb#DQlnsRSM?d}D?ycDkF~_}n`dp%MH1}JHiR*P7%`DA0S~1Xu`sJSC5#Zq znh)KG*`sbopYt3F&}Xx7GZ&h#zSTX^6`zmXz%P|`;$!)z#o)@*S=-C0=6^aVHa6e3 z|4e7`(*#Wq!ZUe{1tcO_*4bxDsP%*(R=(Ob+c!s$?9<^S0@QbS2li2p?sGVhy&E%n zIqedApT(`mJ4!vex5wi^mWciHjDz4HeU&FGL{|h(!SKpme|+fEQM%KXnc`R2 zj!BZn3R?uBGgzhbZWr4P8^e0D>%CQ34Q!Zt>%r|Q6s33-kDNcq4a7nfJ!~8*}RAjoZ%}u0A^>ND6ia-_~@WTm%jPA|LrrTy0*nTv)sT~_$D+zp! 
zwkBc+rj#z^@1efThp%vt(Y!1??uzEY_>?v%onNUVjBi^6TrLHuj}+d1ZM-M6&mY?a1_QEjS@Q( z3`VHN{QL`U1=kTs_=w;d?;fr_`JNCCXnAg|IqT=mRdn;XG-_AnCe`+i^B!#wu^&+} z3%4@N4pXVAt=&tzEB`sv!u6ds`#N@{%RdqEZnjcFmv%J7oJL2qr92E#TK4X;_$Jk# zO%TtnB?g;hhUHez3n4W-j=z6;>cnf3?2?eo&*_1zuN%9M#K;%JaCPt+okk0z-f(jq z*C1Kq=^x)<-&%#GWYv%&c@(HB_w1s2M7}DADC|%z1GBc{;HORHMM}&m$go3+Ew{~c zI;|bfGvB&<8F*}I=>*uV`Nq}s@FE*v7m9?UNJv7)>Rv378BLv*mV3{MAYmR%Z4j$*#zC}YGQzCd>;kT%qQ3Ecv}7Us2cDjI?)sw!!pqQQw~J}ALCc$kb77QB zj}?YXqAqH<8|n(BJObMHb=-kKvY!xoH?S1+A6a(Gj~TKVFB*`uFm_h}q5ZSn{1Nsz zkA)~C1}?ZQV3mEQTg`65MVi5xVGrc5IDTUO8pKGlTrdac-0^_+13Qph7r1>7hd%ZE zC|QJ=L*rWMCBHkb%$d;BYV&G7_ebQM31Uz0;VAM!Ca*qwvR{w)DnB9k6FS*ybDD_N zr!!Ox^PlCwj3^lnfLyN)PyNRZ?{|7w2sIDAzHk|64fN76c6z+=vTg}%hGY%bca;#C z6rb4Wicr$o&nAt{S~Zokqo%npzo3h71*qy)tuBoVdCopD4fP(#A+z-bjI0*{q(lID z9Z#G;IPTXXYQv|)-{E#;(-1g(xN@_o9j{Av<+Y^=Oelz@d#95;fMoatEOF1m3Cr!)m`2)e7ULL>NI0kpgx*GW45|Qc!^t!!?jA|S zHV3ZIIqOh8McIfgIox)#O9-5q2th_gQGk6^H-P8z(4gh+UHz%ozNioPY4@IqyVJ!J zB=hX>AY0c8O{?(lXhY~6>rLf;r2hkdF*unOLj>;jV`N(W3>c5j#mRKxrXQeXateC}y&c%{Rwv<{9C-O5>T9QC zbf*ciAS-mP+)61F5yXB4vpZ@1f*W5fdHMnPZFYa@z$Vk3igjfQU4)EBp^rny2lScj z?e~xzy2ic!#ss&VLIusxZq%rN*R$!E1mX@QPJI~-6av1!rhg9^V959)DDKh2A$ke# z9EBM5j+P%((q8p<_>EvK*z}%>{1)vMW{U5_q zYwH3I3ce3NU}FX+rVw!2%pV+Ig8bc1R%~;^Sd+;3OU8~F74Nl5>UW=#?#eNGAj&ho z04i{UN(3Z9(7?wFz=<<-c!aiU**?R6#^)-L3joo!Uf%T^gu0qIul3?|f1UV*bUMbr zb^_F`T9NW50tXMTLA+HREA4KugkhAtF$dT~(JsUbHA_m!P8~IBAxN@dmra;ExEaEd z{Mrkzr=Ko;hq!VtEw3`0YK){7#&gL;=VJD$TL$;!j6uYT>B_=E3E+a$_g;(I&If7t zioY6yW(JYK7f3pukC5h38HY9k%wP?Z#Vx{h6DGP~6^NNV3y!H?8|zsq_UL@5 zJ;j)o+ANg!@(CftlJdKKFZA-BBmUzB6j_FEF|l6juKKZ^fn_dVaR+z-jx=y=*TQ_# zteH*s0C*}e-2)=e7ghx*odh37a13~cgdvUq6bJy|gAl3yVLJW(Nt}F@`I$HzC@Bg7 zOa&Zc;*ZZHU0_3^RwwBsOXySLfUuEUPlS+?DDl1wCuMTjLj+&Wu?At?pvdH;Hzs5V zTw$+X6AQ=c&sJH*75my!^Lp8tUMxa1T|wKi0{T22Fu!zst?9E01SUKzn{xz5iiR*g zeQjLdu?zWXe5tj#G(TZ{hWC-%Kd^i&khMG2Uo|TE4rzmSM@drN;WI$`>d1)ffhA;m zT%5?)!0!9ucJV7xWd+eQpXD>h9}AF6I+R*`0XBDMMt#Qxuw)W&J7T(Kj?M`_G6|KR zSp!ww0_&(=L}z*1?0&<;3DLX(NdL!{NFL1IUJ%x2<=xRwZ}xwc_8IcL#T-J^i;6wQRxH~a_Wc029S5xq__Yd( zBFA{NGj6i9vO73(yaTuCqColD9|XzfEcJil2>o~Xm9fh8!ax+Gg?4cyb(@quuaG^0783%axIv}o&e}| zA8ofiM@&L7_~Y7rpgkqN`_V2J8OScPZVTn=c(bE(7YkF&wk$>OBgi(D0>`@+NK5oP z=Uk(+T83WNc(1=f+~35sO>e4+HADFwF$c4D%n2C;>~i?$k!@doqc5A}BAot75N|gZ z8p3^Q!4sbsGd2nsx|nHI>%eK+UN9ofGTRmvy9&aFDkwgir*l4ufu~nP2L28J$NC6P zC27?QByDu6)k(l-NzqHy#y}=)ExJF;oZWRL+JwHV(<=yRKQRMl+C9DmYan#;hwkUQ zA~yp@<$D0Z_Xqe;BsldD+;d{4SCMKSQs-B=+6SC|`6HC6!LY@-Z3dY)vs^CBc8_-`(T;kL~y;w9@fzi)$Kq-{td@u$gz)ALc4K6msV8pm7UbZ3!=p6?Itj6u%K&uc}CuPo`FTf0W zR8Z&-#v0cyY&pd^{CRyOxw;uCLYAu?0a4t(wU5e@A7c2Qgj=>S&i6s=6`5)k6uX`z z>(facf-`MR2ebVIGkjh_8ghxCq@}a+#x1TUM%6?J*oL;AsE^5O8I!p`xBZnT?h?cx zCQxU`JTP$eL{~aq6C&Os$H^QNN`Y1&3zpFPb=(VYeBnkEw!_dLJKPj~IOPRw`_`zi zd2(uxiwM?GkY$eQ15UjE<4JF$^M*H?3u^&xzfifyDNiq+RafAdyHfvOA%*Xi@~Ipu z^t%gj_2BEN56i=h5F5AyPpy%234@89o9ge6#2wR?2pkUd7F3|8W)vVjBp^JaVDx?U z=N!Xt0K%K9s8g0!gA{7#4o1&Up|yFtE~jhrxdQck7(m9bODXh^&$kuc0VC9my8@}Q z7ErM&%OTT$&*{p5&@e}3c-6LW7bAv}okLlBpC-DR<;*e)y@FM09obKGi!f0Rz1_?epoYWY#uG!+b1)zy9LTZcP>-`|-eYibiugk7| zJldfEG^u|FTOk*DiP&=_l0w(X2uh!SHY8-VYHfa+G7=R}I^Tvg9GiizPfU4lUR$&( zI4qRtD)o;Se=i{zgEuw3ZM7~fM=N`6h7##2qCYfZL_A=x?CvUXd^rK+o@yYm@G`Ax ze-;5F)#gI)=o6SD(Zhq25hpYikN}Olz;)JisefaAv_|JK)ZrLy`P01elivkNt|Xix z>tG*)5cew2cyA%ge*<)|HueFF9Ycp8ZLG62R~^b1jY9sFYd1 zCiU3>m=}nSATnSW?03QtIs-`V>xyqQVRc%tG!9)_Sik!!q;>)f^_y8Je~UMgiLF z+wlDS9}~b@MZ&oEh1s$3o!p4|{IuY#@3=l`<(v)6TF?x+gLbK|Fd?P3I-8K3UfSlf zd0MVhjj?7l3>|74(phm>Fk3?cGuf1Q^me)*cy#=>xloS3;;Wy01B;(OB7BJDqep{Y 
z0#Pxo=GNiOFf3WQyzIMGiDk;+E&FL3_BsRQr~p2(wqz`ke_f>y?Y}nMvgNw`^*G*z z-hz^eMr=%J=ksGv(HC$2jRC)HShup1_ItK`X}K-d$ET4DzeL+O|JUgTv^Q^(k7MLN z&L=4S{*IwY_zH;ypH}pr-%-V~{ra5UXyNnJ|2zwwzW1%*&u5r7({F4d@M^Qi0*Xnt z1e^ousPzl{*Qg$xU8*hl>E_5!OgsPhKC<(u2plcGrLaO=>1MaQGVzpO1ZHe)X@B(L zKlziWU6gmvU8B199(eBMwyFpIvpQG{P@X{;#rN-VZaFF9zMHcse*#Fm7soFV5!c^y z1}-|1V;Q!7hkTko-SpspM7$vXol!FRasyW=Ia1wnlU>p>-|_q9iji--vHpFIX)`kZ z&+z`W>wo>WoD=wa@ePxTAMd8>%*8T^ z^$A{}3=7bBT`u{flCX`Wn2)aSjjGeyQc`}_+?#xv+`zUBjA)_knDmxuMsm?1+%}%w zL#uJyw*7ushBv6R>$clz|6-RbrlVY@_$6Ms7Dv0t(c+s5Qkd-&Q}R31O|}fnuc<&i zr@AYg`7WP`bmpo7%{OC32~YRuzdnRR{q(NFl3sda>X@dNr_f-(jhGw%`<2B!@xpk< zxFG&J{~pP{>fR}~e*Yop{`;Xnequ;L>CTYOzpqfZh_Bq^-q&+5GyTCHNAPbz8;V|DCP>vA#%nVG)7`>FNKmrT+Yf zk&)C8$F6GGI^K{bB44Z;w@FeimCgZLa6A=!XY+)hv24|uL&P_R;2a_hUxN|I%EE0j(FUWIaAE|9zwJ2N+K_!;MJO)x*wy;?tgXH-ToUL2MYcK0c)ENiRi}Y`ykN^x@jI~aC6=D zQl4o*==dNT@%(0GAz^R1NagtVTKMnY*fIv*g>Vt{@egg1vi~1L^7oep?Z<{jl&RLX zrx987#8Z5dguOO2n<#~f@g_4>i<~q ztis4e;NSm9d+Re%(S-6F%D+5i$0$EBI`QpDOm{O(f&wzpyY&aY-}-Z@Y`%M~$EoKtJtFJd^J)z51q zl_j_Tf9?990AyDWW?zW=^C4JM$uA6r1>tbP^x?tu^uj+HR@GuTCiXZ-L>x#j)2)=t zKa(vPV#{c@^`W-R&^D5&qpmiMPK;wTNGz{``&*WEn*h1=P=rg3GgJ8OtRQvbAawyQ zGV#d#Q%ZqG`j}D2Ufi8EUC)&|quYOmEt}g&o?WA)_vocuYgfJHN~S2ONiTWFsI#5= zukF~=bPB`UUh3sFIbI~nrN}hly1Tm0pHIEzHMyd__q>QWlA%-NJ2$baw~&$dZC{(- zNm9d%!M;jr%WE`S{`Rlymx~%}r)8-e@Zm~zH$AOMP2=fZm}AR+z6)Xu{_YGzoOW6B zINf2In)Ym5dl4Ddjl9MBchM-w23f^D0vEit&Z7v;N9tTZN0n!Lrd*yryWsfO1i^2; zRpZaTE@}Ds@vUN4$Cu$#Jx{JqPIOD@Fj;@5&roP)+FWdk>2Taw3?~b1U3>>gm{`p` z?=&ag(Wb-cTz&W7t!5nV-C3l7!Mv+;iE2M|#cy+jJ*nwljHZfp{t@=Wa=uMZsT@Q) zfBWpf*5K>chsxhEZj_QtTzXISCL-5yG+D#`3b)eNlIS~|Us(oHywO9V2ey9Y>YkHk zEj0H&1dj^3OsMlD8nS!!Mm3L)on`*#Rv-=;)`2Q=kPU-%!^{qRV#(o3%-NsReeC6F zbof76#=X4|J>T?<-m26g(`4l>NU<%Yh^l{Qo^6DQ5vnB!X4}^d<^(d%3yn>DQYPjV z|1MvKFqAlNf<~3r8;S61(gUJd*Q9U68u6F~Iz}oFJY5Ojn^`;PMo#NCtfyv1luPBJ zV4?B;+=KrscxO1-w|_UD)nWZrj?IssSHB)Ks%#kd_sGM2ckw^vH;Q^acx7qS?e)l2 z*)rn0{=4=dC>?uAx_~OxO*+`&_iUJYrfzyqV|hu{yE`)@+Awi04?BymHE#rMew6hY z9LTjJh7o0d)+GQ<^77Z$ws2g2ep1nt%rls-G4A_nGy&f+GOo6#u%BB3=rL-W%+P$7}=F$ru%(iQ}stHnCk9pKI z%@5wppI)R_1|Fj_Sl?WJywqx!MO@XN_=5If-PF%4B_q<4J+)xJtq6hD@s;WuW3+9E zb_d8EK_K?22C^3d!zqYDwK51{dj(34bK(0+-ynX93F81H^MPxG^?Hluj>aPBLNgQv zO_|45zwyoiE4UV#rHxv?2ZPw-i1`#xf3`oEfy#n3gt=!J%=51mgM*8WEB^EJ>=nGZ zz-F`(l=({vh5)tT@p~M|cX_<}1u6&%hkJ1mfDMFCKPKIgmKw{wO!A;VjjIQ7kgEbj zisnlS;@w^p$LAn_na+IsEZ?oQ!hq3jx2WGaUOXLr7Jc#kjwH;TzpH5P=(&8&EC5)- zP4pHEPqMU|-3K0`qI>D(NC}dWpASb5DtVVcC7S%9@AKcAEhvYC@yLrp1jPX_C2JY) zvYZgh7ViuMpaREjsr05D$;-{#%D{HLiao;M81xJZLX0KDPYF9ud_D?=PeJDZTiU+b zyCM)4>LGl;w2h@kuOV=-i2$v4QQ*>(9FyApI&W~@QGo?NT{9i>BP4%smMSqUVD8n> z!Egwm^8ITYwGH_;Jz{!bT{7(ojQ?TixE29|HcjyG|IT}bZN)Qx3YbIcwaWA`&ukjm zitE%Qy7+hOXsI2ptwJeZ-d$+9q_b_!{Ga^b_pR=oin=JJ-0Jma9on}4z+tWlJ4Qc5 zmA>aTU0;&C{Z&FCcnO`fV{_34$&!59do7HEHD>rbf-^$5vgDKsaDF_r+cAk8?}|1C zJwF0aHSb96id=_^HbP*MdIRtKw)!sBUa7}V{>i|Tl~GgkP|c8X9l-98@|nYBA-zbI zL4s}!?TaxFTAz*r977j<7nMu0Buv1g@&}WATEic|##njQ+2pAyOXt++XTKxz3pS5O zlqvNOUOr*3LIc~^d*YpMwr~HkYmT$ii+A~V@eLJDV~kXLlM`!W8=jpK;FD?g6DZ?C zh(OHZH(Bw^b6>5YA~Ou7FZJc#__!a4)=T8#seeXI<1z_kcv(f?sDZ?Op(Ub;F=!`pm$|t%{0FV#p)F; zS9#(3Z3o^@2h|^4tZ*ySI}>GJH(PICzZMiY%#a_!w|Q|^sbT$hwzUqcI%)`@JCXUS>7HA_OKxvLX2vXjwwQTZ{r9!Rc=m;9wUJAs$T*4K?^dpCXX!F7mcJ zmGlwNZXBHiQasSh;h~2}8VWyrX&v8(lf6MZLi6D4_1hV?ka<~o|34)&JTc*v^ z&o~aGn1o=8`uT}2O~e2!{5e52q*KLhANS=TIPc_u?8#iL2E~QphsOhhl=+M0pxS%G z6*$VrDjm*6)FDnCL*VrMYS?VN0S>}|0XeG`sLSyzKNb{Di&L}D+MTwEnz4F=mK`bP zlMPA`5KWK?^dhpSsoryr3NHe`V$IKDhE!6r_>)|3)k?xY!G`s?wEk(c`@F5GrUpZ^@>!4 zX+_iPhZ5+oBc&e_TP_bs%`nK#PyAah~65Dh#4b(=RKA@+c#tl*=n#-B&0C0v7K1*}*uH%dTNN 
[GIT binary patch data omitted]
zrDJ>2Y}RxYGstAO`3ht`vbk_QG zuQU(%i-keiqBoux(9_&()sL2BoT_7T{e13d{2n!yKLx16U7N^o!T}UP3M3512T7M1 z!-*SE!rqdjpqhXsGO)@j{6M)VGS+sfhL&yNjgP=me{`yu<##N+?mSna-dw=LG=(i= z<=#GHYo&b1T__v%y&u(7rrj01XRw8yDc!DThPKf|dStzFw+>JxG6(9CNg!$IXB4cO zAG7^R2Y@JWRK_=o5+LE^{jM399{Tx7sy!emQngL3X(s|4z`V!kLF{(@oo6OXdv7EW z(r-suSgdgu(jd&hL~n+VJ4nQ7*NcPo-KHrGx&XtKqHTQa=p7lL=aq82pL5JtZeQ3C zNKfn2^?@?zmfL8sMHi&T8hLg%(sqSKi?j92=LvuDU$Z22jbH(OLCe>i1rO;<)MzY@ zsAJQt$Xn?*u3!)EMckTGY_1jyvM~A%&|exFUMA*~QCizc+l<8foq3J!HWizp?4?@f z*jOF-4XWRU36Hd$u-l1bY7=!VwhoK62CRV)inr_R^dt~I(FR%{Af^p~v}hj>10rdz z3IP7i-=84NRK1jkmlUyYV7N^>lEdhQ{&LtDD{5DZ(_3{&FA_C1QmewNpJGq1Vu58(%Wibh@^C zTH_ntWN#3@c=6!L>yhEX5wLRkuZ{tRv9qnZ0lW(O;-)PNe?JwqzM8aFR}=J)Ju6D)XM z&^k1Kh)3W{G3MhAX#6xp_q%Kg-yy_2YEzWC0tMm5ov27f`vM*7P{uB9>)caEF&KTX zw?)Z|n*&<7XErSl``u;}MD?TZy+y6@?c#~<5S7ReZiN@ws>FxI0E**XdgRP1xU%}m-5>Jw=_GxZHKWf z9zqJoRgpDz@}m;zck`zJ@8{?>WyCQMC_b>ZdB2_fn$ZId;OyF^d#_^JJZJcS(DtE2 z8LFP9TXr${;6RBZE8QM~r~P%Q1^=Sl zV8Oxi%4#QN?C?hvhi2d(JV}J+HsZ4(5E&Ia%7P22k{Y0iotk>X2+j z3M7=YaE?(I1V4#fbl*@{0Abker3V<9`|z#hw-Q^Q-`SQ-f~-H;g^^?E7*L@G)CVy# zuGHJTB6Q7J59k{ou{R%yFv1qP%)TWNl_XlF@)yHc;hv-PIV-(aZRgGv~QJ zssgvjYlvL8i-C4!*LQ%@fPmo{Fk&okhK0VrAHo$9>X7W2dc4!>_uAF-01w57zonFsHf2&q0k5!0hou2yGACsyD3-_4*%syA*;LUe;&{ z(kDWwj03dV75$jYw1fi46QiaK-*2#p+^#KOr2xn>+cJ0I;+DYbWsow>HKzwWl`$ev zSsf76GY}Jnl^#@+wp`xna9<%;LF7L!ofXz@&T3^g{+^4Ow(PRnF0SOTrFgQ3Ygsb? zqktlgO4xq6nW~a7w`_|LU7tnH?1Th)z6O-UQ{t?Ck`$oQqP1ac&W$sP3kK^5{g`W)B()zpBbePi+cbIB;J+=PCYt3b+v^aXsTC2Y6oY$+b z+?>J^?62U%>Q4ISa`|PPqH$&oG15P4=WvUam-#R0S#e>N4G-3fLCvZYT;v9^$0AG#n8DN`@WLvQN z4k7RW$q1F3t(5BqtyMX=4B}o6@kRUU>v104^QEQFoCFEw`p6gRVR(LK&lbx#Zv)+= za-&Ysvq(p`15hya$F_Dwk8H7NTO@_gflM|55YxP?w*8(N7fM_};JE7jXxa8!vJZA|v7$oEd( zF2-fP+M?NfofNQsh?8~Kn}`$?$f}w)qnDeW@{0W;!22P{DLAuftC1$EcXW4?*0jfQ zS?sfg;SI(`rWo$^?=%Z*vmQ0X`HyPBabBD(en3oWgQCtcz&Nb2p}7Em%H3*;4V%*G zOREihuZgjB>%x`m7eqjiEPW|n62OE)D)n6e>gug)XQb6+lN~to-&NF0`=QR$w3F>p z!RZ_yTh!vJDiPrYqT`(p+$4xFUn6X52Nc25R)RGE&{`E!%42>`-=BUIvX0u7<+ABy zPw@nj+4jb=guiEx0bjk!yCs2$m#Lise}sajOgN6et=nopE1GT5Do?1A{ButKs!`Y; zDD`Dmi1q2sO4sJGU7b;RLH)ZNoJ>L8?ZJfpcnYCWk6L_9WrGD$A`ftr7}wAd@B|ep z45c^?123mC&?-}r;`wHA&Fle?o>)aQY7!sOg;E`l_3V8SjD8`uQ`8ffooZIOU~^y) z%0Frp$q-|wKr$Pyyg%jVoZPx=aYi5GvBeZEBPX=eYuH-(z<+?5=4E9ZYf&6|M{o{L zItzxve3A|+(Z!$IATbltEqqWb{j99$fnZniuAGos^t^?pso&Pe?IfNr_7V>M^sLi) z=hB_?qI+KL7irEmX0_5)8P9m`n9e9W=-<{hY}xOmdVF%(v<|(5Z~aMGmh4eDjcL#v z|A7puAHcnlLiw@{($}?*>hHw0Z543lm;d$AGiH4|h@1;{=HM)6>%G($2}hX8T-{GO z=t41gi&gKkwXH|=?(&BquO%bPxx?MjoSnXbrcq}MC?2Ur=stZzi4LE{l6_Ggz_y;L ziX^fHHL;l8#=hr>j!6grUH}*}%@j5RUC=ci3AI5|*@lxnI9JxHk^uH65}=4}XS(&< zPQF))``#DKs(4QX;BQ_C-)}p>BnmqOp&b4fWp4ph)z-y*E24rRAV`;zB5?%iR!SPB zTS2&s_pT#0`Q7!22};8(Pxtu7b6GZB+FV| zeSvJ}X}KrEe7dej980X9p;_7? 
PbX4F!X`;meyg%-_bo_6C9Bc zVeABFO2s?{NB#`w3*AC;73%<%)(9v+cFr3EALf@^ zGLrnB4hly^ES*X&fk_A63t>TxT!1$9L0eYmnkO%E39(>Us~!$K<@BZbOG|kPYnmb1 zBFRLG(;CG3P|F@Y&sk@o1iTWzdJLTUMU-NWrVkhl<4S|Wx~M*k{kidu_Qx-yeoXE< zeeB}R?8j4AJr$gJnV`Nbk6T+nUoHW;_Gb>?69sU%H%kU;nx8s8y6i17{8#V>%DFTjs^nh=X3V2{4$HaWRPj0J!Y*+{u_`%3`0i5@KplEms;92RQ zuY5QZ@5eIhin1onCqg2V;}wuDqj6to3*K0QgEKTG4Z=(8$4f_SPAH6qI(6Ro3Bm#A zpBU(}*aUXio=@Cv$FQ1@z9;ttCl~J^K(_N)vvLHPZ#SvNC!Q3F zytTHkfDkN9ZHe+>xXW3gBl@@xEN8QNi~ztP_YD<2>u1qXb1n5(;6!IrDLMXT9uX69 zlw{G$$JgGK#*3w@*Kd(jtlOvolob8Fwp(@p1rt`HE&|?UtuW1~!)>h*eP3gROlr5C zSCJ_-d4by-V~S_S8BJH|Ur0`BNc zu(B@#Ao9*VQhiNm7n16Xgoe+QhQ?I!rkiVp#QAe6Qtp0>m*G0i8r&A|LFd79XDqUb zc6KL4*@4$)IUqsqlgj?fF*}W#YAO`onlZYyS$E>c`s({Li``QYOXdF*e*%LzN+2W5 z3XsD@pkIT7#haC0^bsf4r5fYh?bqJ&9sYBg4pVHV>l?at1V^K~m|L0oH(wMVgN!Wv zIFl|;s?)@eZ1s1bxr#+FI1<8He{Ytef&4?cy4k@+oYdXW>$f}eEqtY`V~l0eZk0ULA18_*(`G+^`}n0x%ji&u$6~eLHxa$`GJYcgD(aF=mvRpkZEcKzP-Ct7(g4 z|A<7~%eVf0=v~#?+O~1Wed*zwW+(Zw1sfdp39TE;%7M>7z-x`VKP6mi{XQ5pKK~VP z{`KOKBj~ciGez7h#1bw>nBk$`Wf3jYCzIV8I?_pD#-rps*RQ&K2yfcz3=0A&CaJM`bgPkPU<%!=x^GaXa&j&FpT69rF>ET3KRcZ=fC_z~|TWpwH8$@(VDPMXG|vin?b$YdGLT z#)B(v9}Fdgek&#Z)=OYrwFuK{8QJ%~mGxzq2Yo2DJmE5n<~j^~+_v$CywNS!-O9}q zc&*MgVOA7$DnJt<{x;zE<;c{EeU;7lB)sF1C*ld%fe8q-Cggu(SZ0!=i;B$hrHLc|n^(;R+sg1{c2$8e9D zc{WT)(Dpz!=qt z5tXP98Uw)BH%h#L>2gA%TjtSg%$|H23dUCTWW+%zWLIOR1wQ4sA|G0o<{*j?HhgnY zit4mwsVc#@gK1v2M(#6{@>hBAR#PDM)y5s)BAlgOVv4QsC?_yD=URHtP8Nv-)5&Vr z?bxmx47DtTIy>jIP^?Olhsy$K{0f<2Xk^ptAw2EW8k-I_Ve%cR+7b#q%RiK=my*>_ z$c|qF&fbxJkDBTmIr8K)wvqKCycIR?5D;2-PaqD0Tm|F0jla8}0>0u1W(!^*qjDU- zJER9MaZ zJ?{!|(kX6veE>^=LFZ2?%qFHv9(m!k8BA_=#vP3*GHy=BOTM2sCYR?Dl+_c}?1~;A zIC-?{CH?oN@6t5ngJS-pFYgFX<^wiq?Z-A=?E4B-!H#Z}PCDuBCLY_ECiMicP1sM> zjB}UH@A4R%>L1yg+y5X@Js2z1sJs@}q~NACP;6tgz@QW5 z_TK*Zr(!XVc_M|j-^?RhpNM8@iU33m--{l;yd?Ko3;jFQ-=)=i?MG06e1Fh-1=r&{ zbUYvl0qQ!)LDTMk;zj6bBw-^>ckx|lTSmbgNH_6<52s!!d$L$JEx{-q2)S6@^p+Uu zWmR-lojTg(z3Z|{3Wr&&qe3Y=a~aI^mQhU@bSL!|Q8vOBSgi5eLJu-(R$G3HFaP>W z_axrbqby93i&u!v*ZUk1>*pC}4uywIbT*j@3|qY8`>xy3#f?>Om^l-!97nCNzIvJz zsehDP*9X}v9OOcGZiB|7^0(AYMa~IIvZ8MGIfIs4=LGey5Mn&%i2>uBCRirJA1tcR z-Ll~g>usNUf)3=i&Sb>(BI7!*jjhJ_-7~7fyV4uK*+l z`t81VzTt+G?@&uj*I3jZd5|$`RV3{eJpd3|9D>K@yn*mXp-PFIF~ia9KB7S+s^!Ro z|0cP3QZ-~Nl}o@2=VPT7iiW&LK^;f75InoCW7xz@qaHdYx&3`o*`mKvvS?#WJvW(! 
ztRjV03e+vGEsCwQu9wI-N-8fI~!8Om1$UdUzebI#r zMqd|~49EJZOI}W+s1+GXwHsD=3jV|*;In54gmcEw1d|KVOOyHfI-lFDC*A#Hbd$ay zANyY8eID#_Sv1;UNpa8Pr_Vq%vl5meH`sMd5QJ~}e+B2ig0XMUl4@UHEsbJG+bmak zKke4QM3_2ZB^=mSi=>JMr7T6p_46jnAtPN!t+_y$s#TfN1riWF zp%mN2R2roJaCh!sLo<74dwDg2^|@@^4~~zaHiTRyVS0;ct$x; zGm*seZIWUO+d}6YAzQHEH8pUZ0|-Ot?37CMD?0d9D=;XWB%LZ2oKdLTI;?=HCz!;I z#QWT0fA#vbuHX3PeWJ%VM(piMLMx+y!+byddg;IH!^>V21un((0I*r(A3t^_x8lb=wfoL6i5fuiI8ih(;n@-YKFZWdWi-uI$o|= z_UFf1y@@0wBm{7_yR3R#De02qtOngWYL6_;1)h=H6VV2m_buD34EWJY0p8CmhePI< zzrV`=zQFgK#5pJ%)j<>30-eZYfE2Czzr(bl{gZo#QpFKsP8)#+DRv)4F6&JV89bp% zZg+qh=*#wR?N_rH_fk5}KN<*6ZDCh}y@Q^b5ZVXBFCqB-8HyjQXZpUqz^zk74N%`l zmv-Ob8rN?N%Sqn;ecOg8S6wd#LU67U0xRf^!k||20jPVm`+D3DA44soYBki&N)?mi zQ2-J2@AcrBf^0oAoCumWYP%rz*Gj-Q{Cgh#Wi$b=TS?7`y1o;r^&z>$1@omDdqjfD zd$!%kYCg$HRQURX<8>Ayq9;$lsdHT&Lcf(CUYvtGf=ced!i-?8ByMTggYPwry_grAyx zC(f#O@|o45+Z52BF&NeT6j%MN8F}32kNZk~tkd`TY~+i%c36?a@Uz0^&^Lu>Wksh&WxZ_GM!UDS_^2MuE% zS&>M`3d91tExEVCm_h9ym+esTFpgK}YN;Bmf8TDTMYwj(`OeG9dvZ8nug^whZT~y2 z{Izqw5lldKEYk9)?I=+TEt*3W7ne_Nn%NEax^vHf?(B?Fdd5yHf+ z&)|1pXO^2oE}Kp`?*jMU(EN7?-;Zq=6{qhtj{6~{%Q5P&^`n$vkUS9?@6Mf{y}~JX zvate*6A5B@*PYN3Q=%^RkN}w`r;Q~UoCm$Uf&%uWEAhhUzYWK)$M+T7nv_578o*J% zZkM+mVoMyvH}!Buj%yBna{R- zKH7{waY9w!x^yr8r(0^C$FmNU#<=$1b6K$7V|SYghW=c%d`fL}j~IC4zt`&Ji&Yb( zaTkV@B9;7`H_T3Yu8ewWx+8snO0_)q!EA%^(K|wR_|N|YwXtT2{dx1n4@vvJG{q#P z-=>|;>W*NcSv0i&JX;iln9n|qNxL&c76(evsfwtzV4|-6tWZRD~DIZ@<=97>ig7sw4YOd+ue~h!5~! zeJ9$B3-jT01!f)r;foJ;+PrN4`#3&O&q@#atBTXJKj`jN&Gol17At>A4U5t&7b#xa zFe<8}a1GPh>nlnPGrs%EM7iBTA+3&{9Ug1=FbJ}nPW)HyO6ExTSb2_?wapOUxtmVX6SWtT zK~CgdBr4J#Gq9X4$$9njmwh8RW;rSMzca8jh9o%MZfkzU94ey=V~*5@7w%oiaC%e+ zTur?sK3@m(Q|qcD%JqMbHvq;6HrRO$>*=zKAF>=*g4C4 z#_mwQMC5>a$ze>hETszT54zD`j&<$5hAo?tBktj*l#gG}2x2}oSp|vfhbd*l$|`EL z>AUX_S^!A>-EmH7B{!wq5RA>{BH!4O?vode)aTHMRcmHPDw%^`Y85%Fxb~&%HCe zjJh3ueB0N~MAzZ-M!!D7PU4w6|E|f0e}krL9^Q|0GDO%R(tYx~B=2zNYr$h$?-1Kb zwlQhNDQiWyCGxoK^rh$&>7<&|l{z(3^u0*swx_{aLfXNN0eor*IL+^!Ft(KW7b*WC z1P#X(6p{GvjRK8JB10nw{ES$x7VjMb+-&39D;LY?{sRF|{ZSbqxw{KKr$F>(#4dr6T z=Msc*^wL`DgjJkkp@_Uumi@oxA4`d+CbM7O-a-+z6)YnQC!x%}dg@^+i8ZvY1S!A{ zBPophn?Lz$y02;@<*E1XK0+B1+xU2;Wv)oQtouvIPCRPIcR~84f5^>zaHcD4uO2;{ z!{6s@AjG8i73#a|w(eLW6A}DH@Vk=fir^?f8}~nxm!=iX2DJ`I$Gv*xrYHN<5fa)n z>3moZ&)$+7-n4<8*un`v@m7W z4t$O!8OQEl3l$epaN&0gQm85$y=pgwgPpZ}ERiRFhkWT_%J|#vBwYA($tUPz?RvWS zL(>3vn?XPqy02(QeM9rFMg4#O|05D;r?cejAfWvBO*U*(^qRW_+lVKA8c~hQ&h6?J zkBwL;$}W3J`lvL~*(Wv!_KNqR*N=Oe5d0Pc55qpeR^n-ThwMU8M2>7Np7qdHma;BR zcZ7gt@!V|g@FN{f`=Llh?YH%6jm@;;)n49r9!lrP-KY8kt#vuNhS*o#v)>)jOLJ59 zA|Z8AyxAgU-*6SYje^bsCA1fd{r}q_1zjD<^;*@C`9KJZk(jv@rk%k%IwtONDzW!& z-zp}k&3B(k7e^Z|)IoHk9QhXAAR)7A5>0mfz9oaIN`N@hu8&vOa5)@`I~ZT6=V{5W zPCys?@6+nNh6Kd!Kca|Sv*l=1A|wUW+=pp^q@wPA{myQa^W^P68b7U#`Mm zFXE9MUno=$r6^_N9>~+Ui6me<`&4B+r^kn;POTVAVibQ2mGx)VOfYLRTeLYbDb+x* zdMLYh(2Q5uaIQUWQM)BT32)}GO&YQWkn56n)~~rg?x~0JoAG`IszZyAH5MZP!ZzEI z4Z@31qbB)R_5m`Mwn%48&1XoeYu7^{PJ2FTT20`&!MAOGkU&NGC>^CNm+ zMEi+a0dhf6!Ia)Y#NN_rFr8RL2$LFoPeM*b{jGF~QKMh@#zWj!;c_!0q?CgDMveXZ zzU*uXkxhiN%MZr~)nYjvs?Ju}MBl6q_~f?X+|8;Vrn|~~aBh>`ief?LAY!lp6M5l{5DdamD7T62G(05w|`(`SA{b_OC{v_sN#jq(8%QS zR6pki!G8u?@E+;a6Bga<^qMH<7vGx5ha_F{b-NkDctP)-G+EagU*`m4W5!0=*p$HF zA%b)t0b;qZX#SL*?;D8QzKnU1i;oXeVCC%H)M^0;cerY{iLwcyd;eWCE!4G-wSymZ zr{fONXZCTb$3_l_73?3H=$+XW*(56SlM^27go_VawX?msG*lmjK?+F%$(Qwi<}fRg ztAIo`E$(oPqb@y3Lo4By#%Hz{%W}GG(tqH#<*N+zA>cM0T2>*8r5T(hpD=6qS*X}N zr}C{MnYO_oQioE}b`s+pF3a2mmhyL5;1VIDAAb4h+%BUrgE`tr*AgT`s6e=re4*hKjl<{7m)7&<8_G_FbM= z07;??kI-D+GagnRbvlqf_hDy+Rt4a3fffKX(u@Diuy~4M8CH|KZS+QO&_q$f^&djm 
z;tY|E4s}Y4_>B8`mmF#<=db4ww)pnO2OcKy3#Rd)Y<8N+Y2oXEW~-6OcHm}7U7qtF z486ZL3;?Y>c9Q^y&h8Xk9kJ1rI39ig_@40%aNvUh7+w-0z2mphI(63k#>orq;sL%q z73~&4!}ID-Q=D}2>KQHYhLua1>8;_IiSpZ}+?%*Gfr4$wH*`|S>I-6d9P-}#M1?2t zI$5_xf53xjeXn&S<}lY-s|*E;{xV~jOan>5z-is7HTUmAtn2%pcBQwFG9c6!ANG`)S+OeoVu z8)8opl%v&6DRE9l+kB269x_<4o7tJ%65H0e`O%45YIm|< z-a9q1jOrmJUB=+vC!%g)@Zw`XFqc#Wn5d|&qrtq(yqtGARx!S%_I@?`(Q1@>7{0k- z37>JzpgH#7>C1aG>j}S5)GOPdD(f#4HK&P}cG{HFO2U4&_PHOVadTddF@REI2mLt! z^iZv^hZ9{@bm!a?1lClInSj@u4M8z<`$&jhaPk_WoeZ`cA^B$O)k*k%6-P-4v0|RG z4nP~`Gc4)zwXJ2DAga%(l>2H`gKemX?*62zPpRV6GLESXTbGaugU39!P&ba*$*TgN z=;>630&m{SA7rg@$Kpz#smxh*no!mxYCV8FX;pm&fd^{(-1Y(dy0N49bsM&YX9(M{iWkJ@ z&d2j$f7`?ULf0%ncSTFwo@@Oy@w<@@rQae9o0$H=>j%)1d*gxEhk_DDg+~9Zo1wa6dD96lKb47?%#ttHaTw`z^*8vWe$;gO zIdLpcJ=1*3UVRn;`i7?)4P^8IP&g@ojnoOdJ{k7iJXwLucE%C%1Fhy0W`nU&pyGd} z6R+jr4!}D?t{3}we8VhLb)a|;s*X$r5~5fPzI{2yr@tfxRZo-Fw?Y(YHIL4+$Y5u7 zL4r2eRcs>Z6-V1vxBOL@!VElp}v z-;)O2xlCsMy+L>eVXrok2xb0AU%LL~NZ0X|_i8QUo=a8h$=mB_t1ALOxuDvFWR7V^ z$_y@tACt3nm@}tbfM06}!0bF7B9ANi|F1h?TA&5nQR}|?<#!Ym16&;0&$vf-MDD^79V6yzDeHuxj^Ma zX)6a;uH*rW(+J3hB7{%Qw>-#QD`g(`7r*U{=X$Ktwi#=ZSJ8` zfT*mF9i)!Zz|O%_rg?xLzz+^MX$(yg{}`T77Nl1*9}4{ z6~^mqPEF#<)Z2ZHTNe3Mdv@i;KkKtP1b@mUDghS|{YdeiQAA$b&QDE_n#9Ue=noIa zGY!}Pz(%L|`#Dlo7dEQ8C~ZsC(~R z)Ccv+x0TqXUM$101kNlG0Ob18muL8>;TmRdBc_0q zYO5)=HIVkJPF*Vt*2WK@sJ-ff2;qbV3;ovR>?M+z+RJvX*&cC1I;*YhTg|K!O(pDf z^wy8|j`U8*dd1f}I1;wO#jvw5VDQ+C@>NY4w$w*T7j0hB4q~B`LH7R5(u2j7jzBj_ zh8^;K@l52Eqm8n0pARq0npIB1^*Qz)>pEF4kVq4Ic|Y1-yLs_?)_ALQ)5<+eJ*3)d zw^ZXcouR&7vfHs~!1k~~@XvXbtiFBAKRc53av>!|>wX%jWp#Jy0T@>$qFGG$?Q8Si z_$kCQ6I}bfLTj2^&iH}E96?aug(`qiSp!&6JB5|5II+{sa`K%G@0$a)PYgo57VoJX z#pOE;vIW~s%~vB0BKT_`hODW~!x%G0)7E_JSzLV6-Hzw|>SV=E9}YySjK7EF4Iix; zdFYLECm!!K?#Sb3{&eg{&hx=5FPcRWm+`Qt3>JZJ1(s(-{m_LAPgfOoE8;WmtCsi%-N{=Fu@eY^IjEzTfvTQ-3R7wQu_kGXH? 
ztuj%sa1yntZA_o8?D#~%ck$JTf*P5G1UdsrOBR#eR|m}ZDsVw&P;+D&}v+vgk=Xn0Os7{no{I z29Vv>(!ynKomTDu4)*{!jU!_~%%NDy@gP096J&Ke73X%ooo!}cca}jk-j8;o$JttQ z!r17=OBPIpGL`Pt#GG&+9 zP3@T&_zl0I#Hu?iJ1F6lWhE3~5zjq}OdQHI^DASluj<*&UaBk8iz|oad%)`=m%1p$ zwNC@e7~?+4^cU)>r@`6;j?^z)5o-El%bNoiIb};d{ly4|i5j;^?X_Z8Uj{RaF}$_C z=2spn@)ix}*`DuyXC7``^_1ym9$eEJhw3s%-D)W^w*dY5zB^yRG!WmLQ~_ROJOy9q z(ZRuWZ#OYgyN|Zsr@+(Ycf?T`^%ONjnrN__Ax2w&%c~6F&7xKR5KT_Z=$m4%&pHOpcF-g5{RG7^E6DZ0 zM_gMBm$gS^@y{v}tn|RitK7D~-1O@C4slWs^~})AMyB;iIrjgax=9j93{TB+e`x^{ z&M;I(6772O{g}UsZK1N#zIAijE>Wlie69qKy=PKitQezjE`e^Aw6Jzz zsBT*79&zI9L8{!^f8JMChD$tzzl)aEkEA`m^f>t&LW45)aEAGpC52>pUfb-ZOP@sJSswr{H z2ssE5d?e7nnkg!zne#1dA&XwBuab)8ZUEEn2^wj(kF56V9P<`z0iWL0hwc=VwipO7Xx_`AXAt2VQePuwUW9wZ>9msYgC=*jH$n`=h#$Nvc zf#$TzctQy^2Y(P^uT7cm&p+Yv7}-8b-+N0o<5yE>IbE5PgcXxKxcUYzM&mlfJv6lR>rAz)GgOMQ-*=IB`+lvO$xBsL3~K0ZbmZh|Qvcpa6nb82p5%EQ=A#2FI8- zk{`(1Z>xhfqVyxTE(cz#kDMF$=MkG=CW@tEHf9N z&N>~C6#AZznHF~$f}S4sh@5L$tZ(u0ea%IF+!S$}ttbZF^yu;D-wEV?t=%3CW&5Cg zGI?f%cWf1^eGE=~8LhcwCQkQ7T3kA0%dI5?W_0Gd0#E4ljq1V{_0hFmaR*8^`<OC9={|zmk5}^tL$W3AIh7P}6yKql^B<+D@2-dAv!{ zZOO27D{F|7{C98$@+&e)#5U-5l~F*m;}&=UYU;QPjt@#9u;nUfJ7I|!zyNfN!)S@b zp=}1ux}NvBb;yoxck3*MXAEs&;rlT!qf&4q(sDYD{ zFM(+ZfeYrM9fbKbikQ?RhVtJ&WoH_suTu3E(zqLCFaZELd!Jfu$pHwJdoe`l?MSJp zx5kxk0$}63NeViZeUY#_tD3q)d1KMYA0uD4^$a!{0>;N!Cb;Ztrt=3nLq=Rz^NsHCoW3?sL#4qsczfW0M+?49C|C z8M27gimiz{v3tltCa*NZ&j-(WP@ZMou|R8)r`np35k=Dz7qx0|uraXLMio>GGdDUF z0z%H;CrpCwNQz+poYNwzH)cm5rh;7;DVvV&R>;A&^l{gB&J%Jb00>u@*sLUUkG&vD z%h~t%J^oySk~WfNtyY^#Q~C_K@(T3OJ?7WrwG(WvRUgh|2hz!LFoh2c)mm4Gvhf8vs|H`N4P`z^3H_bUOdg zeMW#TQZhZOc{t^~k&C~PU*)A-H{k939S&#bIULn}>hC`g0d+tcGrNXd^g2&F)CCf>H{BoSXZUV zH z2=LfHR{=Uc69Q#7ZC2W>^h0&(IShxOLei^j+?ZZMt(H+T5%giu#$yv(0jrzFoo^LN zEfSj+cv3b#34hZZp?#m%YQ8CmavYE?)@N`Qr~~ zJUpsEYB3GX<=0B}yo&@}hZt<`@QNQNdPK+{Z(w*1Z&-M}TZ*9JKCKD%i`+>z%dJpj z;!<;_C_dD4HB_h1i?lJ7Y;6}m5^-DHp)6C<2whW&OpH|DWN~jA_M>MZ9q1p}HLMQ6 z1YDgVYhRN!-_G$UaG6xAQmJ`SyR?~d*n*g8ZUx0&vk&^8Ic8mCZobyu4HqK~g@ZO( z_b@HM9|^f19HTrE2h!7{QJH;)tUCGa36EYO?Wl+En!3)=RaqrZy8zGFkR!D@tzeMyCCXOacW1yz3o7J@S=0T6{7kkn4Fb z2C^i_{yPB2@QgmE;OQ>PEeSHvH)S5S;2b{U&kr_tsA1uKM{0xpVCMRE;H$xIcXA52 zUx#T^DHLvkn)4E1ny+7fQ*6*fqa||&7zOE2gF&kvDG0d)wNe}izLpZD?H=Q}^ksE8 zHv;?QYlt@76=XMk=(fEHJF|tm?&Z1G)DX|jFlI{{y?q4L{AM3RRim|=iwZS$^ z2*xNxt5De)Y!!@6RvX0<|9b11Phwws{HKf;o!0IV&Wpn3uJ#|aBPZXY$0HAM_Wg!9 zgDv1wEadMwyTLYrN1Fk15#3)7Bl2Cn9hVwXh+*w_YGt$l>c_fT9T-_%c2E=lGzXLP zfFk6is6~orqcljihqgF>yHlZ4rhY3!JXWTNS5=1wXRXtC6WG1>g1u8SOe$%ZY9fP? zi>$d`>EYVR>zB01S+Gx1{MjK;dch2-lxc85e!!Q4@37;#W}g{Az+?JtLzztHg+Pv! z`OzKFerXlZC+=zjywyHfgLMG6USm8CNf;3TPyVCH5m70@c`Q z1F`}MAf!Tu5*)h8$4DC>UG0D1kmaQ;Lh@K2-Ab!yhfBugd=ud5Y5Vc5fE-`uOyh$# zRFCt|HgV7Q3jQh^(Yf$hqsg0t=FC~wW>z)r{hV?QKL$F?wUM&Oz#v-o$i#0Yb*pOx zDYPtHGUF!d9ipwef;eReV@hLEf`tUjFT2$?*As?yGeu)Mmr*>2gfh{>`(xzqkHT96-Vq2EzNv*$aQz3TsTMTq`(O8tBxquEV@L z<@n@==>b*4$m1_w0mI(XeDb17VBDu@UUnwf^IUI&1UN*u5KB_Q+OjTyz|aqB!#AQkM+ygIc+&B70|}Vl0wPI;O7b-P=a~US z_A@H(V&C&#{oGmYEcR5tMlH#1Cv!;KWs?m$NqY;2qPyc1Mx#>*_Y8t2jj)Q{3s6AZ z50;_(%1~1cFR#G)l1r({erfsl)Ho+)3?z&{hs{~y-h+te`E^}9_?sUON-=g3y;u58 z2NuWi%Wq1#KkTc~9n?J>L%be;Z~orMJMg==#dvUu8`tq`o&2MR>D$692Xn_ZB*x;! 
z2_xkTd3GTq@4LSmY=j$;SCKB`vsby86E_y5B=VF0!E0#{r0LCeF#`Rceu_c#8dBRD zM-g0$m?EV?q2xp_=U%dHz7C2b#+!^-=)bqLBwFa9YN_2PsLu#Tt2rJj?%M>mgwbt* zcqZvO8XhOb52(qSjJic_>A)7xEK&`_g#Cn>7#ILF2oe&+oxiBO@|2HuEZ}A^?=H%Y zEG`rniryr1e2JT9eKI>T(;d$>P&hX-LxZ~x$XRiDbqHfIkEHs;VM6pja$HxCt7Mbd zu4=YXl{`FNyAWg%&arW~>kjOrs7EJ%6fBW&a`2#B;$qn`L{1QF=G)nSv@XYmB<#c=o04-(yMg+Cagr8RIg9A?SrdC z%a}ePO7x4(Z1w88@zgg^87<67s$RNBs=6u&hpA(=KiXY3SmnV`ziNlx_JsvEX4>D3 z`g>E%t(Uej-y+0E2g@q=imw>H&wU`O`~&hyF(1dmj~?R_yvt0QdL&4w`8AImldP;S z8DtuIpCbvC$;sC1uu3=r0uK7E`C(vQKRw{Q_=;aA2*Ihh_W&U09rxMNk7D>a!xYFX zu$qn*6bO7ST#8ih2gnsTRnURQ@?*F!?)o`6J@Q>kP8EL%oUTkzo}EO?$$$I>v%kV> zAzpcGcf$gLFUOo%KA!>2ZiNW({faPuoy+`{^N{m-BPEQ#lf06fVxuc^rfBw9Za15f zhadlIe$a`yc|7fI$Im#*Gw0?!WvjjLx7Yu4;fXxWXCz%ffy5dP{PzNuAd=ii z!lQ=7#F}RJ%U|Het`iPv%sfBx!AZ^DY^3kZ9cGui`~|eYys{AW&$nU}G~nysthPv4 zBLecuN|mG4bIp>}UVSL&!z0op^sXElzmc`n~ zY}h7Gt55GnHq95Qu!y$aqR)A`Z(n8t9ziBV4gXTv_Ttf)uj%pEP_)eoPFZ#>>PKq_ z<$WX5P&b7q$?(xXn`~YhHo2oVx<%z5IL{5BY5tkA-WtxQ@*5)oc?cv(Q*}`-N0U%H zu$x%K!7dZ9CKby{H~c;bfi+7}E1IgAJ}@$3E*V+3 z!IukjUZ&8tDxaSo^Upo*e@ZHeVB`zk(K0PbY{KGW4Cgn=-((>6O*aGY`qvHjF$>Ff zr1y;b=cdC7OA^v)t-%H6wQU_?EK>%hrMY^k3%<4Kv@Cri+}u8^MwR=jE`_E0)RASy z`@(q#BC==x7+WG4r#WS1f#a(xI>bYv?qgu98331E4CL3cdb|KDAke+se6ORRk3Fpg z_B6HvgLC!@#Lj!!o{~`Wb?S3wq69F{`IJz{3>HmRs5MBC=J*%DXdshc1ET=7L<6qO zGX`@~Zopd>0>+tSCtOB@laBuG53s=w8l2NlETx?slvl|GH||;OAC$uY-_8v z$8g1nkesJ{*{e*98G4OJIR!)Ef(2n(9`(g3TGJhPE6!Pvna@}#K4>e!oWj^cBW{Lu z^@DKDPHPfF{uXhw4cZxGj%K7zLE_eY@9U^2^*i=oc#Zdw zX9*zpfm`t9H@tOK?WPzQ{<_|jV=2_-38wVv2w=DS0kkz{-A6gQqd2#uUxL+m@WBf1 zX8`Xg75y@NkfHfTPZ*&>M}q<{Hjx=_RIP zMFkeXcg+O$#%=Del9+$8s_(AIzHGysxEbCrLb6m4TG_kb_Ly$Xgok|n1hsm((%8ng z+_@v~$I0!o9xVer1^-kUDM$W6DN^c2KDKYqTPp9HGY$j%{;P$DWTzX_08%CldTBO zv;bV@UDHI5i<3FIK_H>a zJ#2CkM3=1Ffc?#dZ)DSRm`h+$>1^PN{gm$e%7FQtSCG0lxFLjbP*SbS927lOqU05K zb2GcS@6vW;C_nE=>A|P&L&K$a$J6dz!?@~xo(qDOPiF{qy1@nidVste6JMqzKlxyz z(flMaj%RPBC!d1K^`C4HD9w8(7`gRUgyuX)xnBQhARf`41y_`%%(rn@Tx`+pYNyUl z+}_Z4cd}U^$^Hg|j+wrK`JnhEs8!{izy80!lcC?2;Jn_@N4V)O%s*rQ93%${P)d9~0$nW>!AiN3&oBC|@1R~{SgrZV1 zslz1mU~X2j@++ym6Q~K&XMl14XQ&?uqHH6}!xf(oR2}uskh5C{e~T|K;&!_aioE3# z_RwrhXkz{0Y^FA-EOx=)xC8m))RtEQT93YLFLZXmU_7pe6+}gD7=GI(>ZyO8Ydi`; z1-Ujg+T)#x(hJ}fzRx=$Ylr#;f=rXzXwwuC?}iY~=fQK^;f$Dk`yq``)36eN>E?2)yT(-I zXRZpb0LRf6*psh|Ze^$NpB#gX+6Oybky?VVaXZVMDbT=SbI}e|nDzrwVS$aRg?42c#Gf-a)o6^pP`(#Hf5xncJ_x1?L@!jvAp47i<)(@ z#8bij3jUD+huejtiUBYitboX*8aFo#Z@Fa_6`ARP*;FOWocp zHc>05V^imMx9gM*=IPA$St%1IhSwghFkXoJAC-7GoNNUhIGLXcq5UyTFYn!1ZK*8* zT?gaAHdFajc^R7H19)jHLhhZVvCy4V|SUz%^%fU8C*8Y zoLOKo+2tv(aJO)-@_G)NSe!UhcVsXr7Js_Q-M9AfJO8+{imF-By7+G2WQ9O=u)e$M zLGij`jC$hEK1F>FbMgAYTg~Z7&Q{N(%7w9j!>v@yzmOAIG>_8I4xs!Z_g2jwNd~@# zk zxuG_xP?MEf#CrBC^C(SwRk=_2_E>!o%hUC~vVAdsHK-d_pec8Kd;~SAp_!_YPU74G%`&BG-1RhjHeB@5yY?hs%5LeF6Pe}O(+`So8b2g3)^}Cz;k%`q z<&!-}cHMY7udHC|MfYufvs!yeb>WG;NEe0tv)$#EF%C2I4!I1c3g;i8!vki0#;j@S z>3;;2zy(!6nyQGh2LUzq&%0yO=RX_#(S{_wQdqLO@DNQbJHVq)U-dln`kF zX^@hX8dAEH?x8^e>24)OxP+` z#=+K+z#d`71XsKss|xcN>JEEU0PnJVMq3QySY{;iK|fUWYEuN z0`SfBzD2_~i?x#|lo4P>kXH{nO)EET0~)!24=iehG{b&?Q2ck{$#WOgt0QZ8V0M%F zPEeg-x%&J7Y$C5}L;$MvtGCfP8=ALBkJmp5<|;RJ3V5N{Zk-GRnl|gN%}n|U3}Qgz z#&W8W9n5;xg!bq9;fMjv^Fut@AUFa**^;?KkP%A-opQ%kO0FMkcLE8>rhnz~7PMm< z@bh-Y$M~dl>?wutKoUhc#%IXLa~F$c3P8nfrEx-=>hg-gmj6jXGdEFj<2HUfG3 zBZt$2CQvcFA*`N$RtXjVrE6;2K+f1{W(mTFrco7~LKx@f`4{s`F0@QW9-+zj{4*XX z%b-`vqaTR5zqA0hvwM>8sgGbrJ4@g)yp~_P`*j~I6mK`37OJ-ZcG(W-Q*X1?bu+KP zK##-g79TBx$@$n8cbm|ugAY>w0xh8tc;Gwp0Fs~|saJh) ztZrs;6qu^OqRwsh@(ZWl_oaCFs;sFTk{RT1~MLg(6 zFoa^AIbv=CwUy8Lq#8(aqrkGczcf|UAFy6($pQJ^R@?8Ztfg(D$*o&8lb~GeMTEp% 
z>Gv21V;y{%&CK5+&9D(~f_X9Kj=VBCNs5%JZz8Cq;gItNJ2J>z*|K6B6jkn{z2=@Gv}M)5Yx?PiP8;|m~OO8atMC~vL^ z{|ag?>fkHqLm^hz;hj5o+;1tq&~dRMQTs5WXyP-u;aGb*530dbu9H&lO2@3_i4e0z zi1pQE9arJcZ0*7nFNoI7iFGk3`OV*OH#oAmTOK;nA8^4%+>)L~JkzMoF&si$?~pq6$(r)W*wSqaLirl1#d3KU=uE=lR==oBh=4Fq_P z?-Tn9zj3Ia998G>9S-}drZw-&^2XAf=7L3^ z*>{<|n1j#kMfA^7(E!60vYn(N(p?TQ2_v8F!xI~Q31;6>(qfJZdQB@XUj;ofS}RM; zoiB*s$$Y#iQ62Da%_ZKQgV8BRyi4SS8?r>bf)(g!)?9Fg0K4j$8&f;i45on?1kxKj z(E#YdkO|;u*a79=08kiPg^SJwb0?;WD&0xl1HJ7Tpw>|PO}K6xOv|%Tt^C8F2BAk> z8#o0D;-qWR(lE?zfw|e9Y0wGYC|o%|i?LhmsH*4cRi{3^XWEKez-o{VHF_)ke2p0u zZdhNrZs3_bZ$LD&(?xwD5@7*?&NL7eOI-h>trw`pczGyu4vfT?av3V4DvN#=C@*Y! z!xewc$n{>T(buT0yYadjx^$XuT3sRN<6q@RuWH>j*1feWxX4(=u*l|}V?sTPNj}zX z0hF3^Rfa~V2o<|)KhXXNNm8g#fwCw6@y*)8CeRCn+!=7)AGyja@YY*3;l88-K|twT zs8B!9@~!~DacUd2`-|s^BJ8b2D_IGGF#a4yr20r3>c@AJvtL{spMNmxQ(-EalW4Fw zne5l6hbsU5ZOX}7r^iifkJUvdP~xlKQfJx4zHYA&# z$ZdRG3m~PdnTHp%4IK}Pp+d{;&H+v!0;j=+*_}37M#r9SD{4GJ{A6;umx6| zJT^T1%l49@pg}DnR!5ujnb<92Wq|HaT7fv1Y%{qw3A`3JA)58P%l8esFyUv0GROZo zf;?}@)8s6120CZl=#KYTwN1uj3+oN-+h2<)@@0)#?_7$epc84(UQ%H>+AJ#B9q4oE z>It0|-cs3|RE?R&5qk5qLL@YeRY+A6pFt%)te1jVOfUj)%_qs_K3w zJM7M4^cJo*(0vrSeQyflNCd4BsuQWh(PB|on!VxA*Q?@Z8^=A6 ztkG~!-@LqZ!%uLzLUFM|w)l~E12RHAA)Au-=u zdABiY0Vg>N2o&!{AA=@=gtfc(ZZnNFF$+WOrmAXPAz6DIw{btfUD%EpDlcAQfHOjZ zl+!y~8CTM%|G3E-r9E^+0;o9}jjzBxr(Dro`Se}gt7!go3+M0w#J=nr&wHSP4B7|% z9zsD@A%Xb@d-Q8iB0>k{0z-@iBpHvO@JZ^b;Zk(EYU>Yx`yc;Quk7f8)Ix;0epI7a z4z(Qfnoq$Bk-eM4o2OjziUdC<@KouH?}+SL}5tp@%&M=BL{ub!B zb}hPOk*LFU0_x}j1SPmCloP<>0{iG0Y&&aMdeC}R7L>v)wr^e*-+~ek!P2sfrE`o7 zWv5f?4#zv+D?8yCcGTzBds3AIB3Ri};Gr!`aI7C`n$aUu8uj_28!n(%`96^Bax&2` zsD3{!uc+Iwumh;kUPw}3TQ9IGQ_ zB59`?Lgi7j-0B1{+3G z(9RSaiv_i8(`ya=nSv92~`^3r*Kk6*Ne&?<9S2K)KHi1AlfKQ*HP5L|v9f7Tl z46=Y}pl5gh*t-chgFf#Nr024)F;I2L0p_v#2>J!k4&eU;Ash4kK5CM-UHPa6RW0fp z$Y2GsOS&ST1~~=pPeN7FYG}lsy;iilCY95^^s5N$!Lp6V<_3LKQlK!pMs|OFh+rf+I4>4<`Q-K5^(ho9mB`& zwGqbb1(`c)JdwYoh)svN5j7#P<_whqMp4%Dq_Zwc?OmbQ24*%rIQ2gJ0CV zN7H*Q{_@9XpQcv}^Gy83+u>JU?pL z?ikgL7v?mPx*zWjpJePL6;oAS!6kp!MylS>SP=H;#^X9nkCjkm!105PA|Xm>8ED^96CTX9^OCJg&#vFD*UR z9{9epX!lQjb6OI1Ou6NVz9Y}ad%>}*B!lqAK^463!A`YihjhosDT$QX^&f0QqXHw| z4U#f#*jjF2Nzl8upx<8GP*`V6|9DM#v{Y@66=B%cYTLpN#s<|$`FHr>?Jj-Me)^&f>zwawL>T33~amTOJq42m93G+#cZ%p6-A zKjkS|ZJVpmZ#Fc=pC~tt52qG0gg+(uk0dDyT%2z!)83nyef<5xQ;qNKS2;h@HMWTE z&h5P-x_Yv|_q88!27E75!r}WcPFJqOnA`G!ab;p%4f2~2h9U5*rJx@XMbkV^8tXhx zmls_&C_#BMNbz~}z4GC`mVW2UU9q@2GSeWlEZ)DFjWh}p$mh)wy*O!7beux!am==g zPvIZy6}ud+l$6{V#!7Y?9IxLkpq1A8CO>LjUv&~U$0szw|5hh3o)kM0VN-(T(aCf$ zg(5Y|F#bxct9(0GAJ{)QD0&K$6g#$jKUg`GclkgE@AGlp4Mh5{x9^oeHXVUkI3YCJ zD(=QvExZw#EO-AtVR<(HiA6xH=E@_s!||fLX!0*Z+Z}2ptXpoDmacqOs?mv>O6+U} zvy*k#4L;8Drr7lc)S$Jd{OldYfdt*h5#` zreyT_qlZP(US|?HA!{RzFus5Vx|W+j8?_B{y9i$c+hUARg%|b%UxBt($uno1S#22{?B$BRNJ18#k`Y+THHU@In z(0+V@y<$lP3hf(ukTvIM#6&HK{M>waMJ!0>&oBUNpou()yml)VZ+Ueka@#HsDHp2i{c|hE#%*Y!aWHH^a zDDoa-dMWSA#polQ$Ah|6wa@)4^?aB_6lRSDdek1Smig~Ygh$HuT3tFAWD zqx{s~U30Wd^qpTK^XgWynX7+t;?!d{BU$riB3v^to;Mu9wb1Gt4i|7MDWR0xl~pA4 z{#;dr^7{-7Z2r?@k;fNX{KfjuFRH^G`z5YMwa^#KnN)jIY zOR9qtBGw_Wad%F-*VpWmIN;>3KF2uz zFDTLr)XEgUpw|li5m@Ar_g~83H%|NCvajY;!&|-s#_z!mF|R-C%Fs?*`~dZoq$@kK zpgS#pq9N zh~#F7R3S2Q!7wi<_~8F4DgSGV3P3$Q(cr2~HphFfoEBjcul|xK%#hdej>?ZOf$Bcf zcU|r7@*Qd}ft2@rI*EhY!{Q_5d83zwouLd7SzVvtr>Fj>$L^z-lb|=MLe9wj%Gduz z$NSe?{`0U`2sz}WppX3LxsXL|tBHpD7aZpp3|ibr$e`~;M$sH&|J#Mz171$HD_vE# zrXArv65*0b=ykKr?@KRM2TYE)f7r_rRzb#8WN_n?qcY-4{Cmp$JDvDn|L`YV*4e(v zs=j->&ccV%bn8t%r7HM(?B0os9rP9A)0uv5URdsON;h2z58%nuUmLZ5P2)XzP|2xp zG8Cnl!`Cj)?Te0FMgt}ejYKs1{{m+I^Jo5D=KZ}W-1EH1#0rRJ%UCh1^Koz<;Z?5j 
z6T{mkP%C_i)CG#iY9B1diZLNjVmybg*9h8!%dDtplkCgU8Oobdf2K44b?X21VI3p* zs8ethga7rY|M6pU_+zFhSmD86VE=2{`)f}4Jq!L%4^xhy`{T^k1N;AR;Qw#}G)_-^ zZ%1>||3dU)D-c~S)R(aU!4FmdV-O0oC1+4$n3*|Z+)Eq=>t!6Lz~*VTI{k=&TW zsc-r@qXWPRZ5d3?HS?eob&D$dy0S zO;f-;!V!zH{Uzjbdiix%q{N_JBUwG*GHw3%5{HB*bDP6v&{4zfAjAZ8py?&2Q}`D7 z@(>N&+M$mFVE$4G=~7vis>ASX` zou;v%CMs(GQey=&3N`q|oR{-!IjYgO*!!2;BVo8d$;)dhy07w^Gi2|jnQ@;O2&=Ik z;b)_{MZ2iO7`RGzc7r7^bEk|d;YZFIgeYan2LDnGBGa~ALw#?|)} zh+rw%KNcK3ynPB@l-1D!FC)y)Sze8=9-deZB;VS!f8{UMjDbzf>D3wvmx;)Jl38VIG>~9BosuaVjm>$D)Up5r z8>G@(gA>FFV-g;Ky=9gYIAYcZ%Q;F;dRFcxUtm^25v@gOlcvxtwcP!bSI9nC2U5W% z339O@=gp4nS&qyJMP(ndzS4K{y~;kQNi00o#UJX0UP1Wk>a2HtbIed>$Q!_Dob z8;p?pJmtEPW*l`}C+{tuQEKpV)=91{q|V^X4pFUa7kxU-xMlBFKZ#5{%q-^DYs?-M z=86#vE9kL!eGy!nkvnpdSxjjF7x@wTS7P1ZmdBZxI@5h1=}Fl{t-MS%HA~)|V=Q=* zYk5kgQM!9**JI{5TC=%iS-8Mwe*7AYGr*h`izjA_6~|gS>g&%0i=g>Z)u*`?a6zPc zsHNwKgHsJMJDA65?lX_UWh}gwo7;#`?J$U;CQpBgiOP7Tn3FA(F-To1{_sQs|tQoafFS~?oOJSk-b z$%)ULjZHs#7kG7%n+t^=h!PEWmB1B~El#$V&m+@3{Nb-(n9xhoeanzYEB6_1a18>b&MhYcTfz_@mz zN=Y;+JNt=5SPB1Q3P>w}Jv4MpyiGz5H17nR{Y82vFktPG+>9`V<>p(o=)@xptL|q|WO8SEWV)B-8t;42nMDu(GV2~hd zGPaLqx>=$(R#{x7PhQ&m=%uwcEGRbtipZIx6S^e1@p_H26FW|jX6o` zzm~JhMGcz>Ik{l>L&@#MHy`n(iR1P&y{6-pTs7Zbg(=Kp=Fa2(`n_`I4l9hs)ApNQ1zg(cOP`w_AB+oU`OOlw-po^> zY%KtbTZ*TG2C2wot7RBYzDA;usiv; z(i0TvldE>Ne-c5nzIAcJAAGCII-BWvV|NcL#%)udGmhHsN8_{x7V>UCDjyV7_ZiLh z6y!4w%Q<#DI?SWLmOQD(D(iMJ>Gd*Pr4skcT4Dz4XBy95dUJn%SxfWzo&xT5MAf~L z$36M^ZLDL(Av&c+=$@z04h`0QmFrLA?B&k;E58IK)9sO8zSfVHw9iCG62>}i8{6znf80PI6a-%Y(XSafYZdDvL=ipq-~ z5(yjGK=$&BNo2F6x7izuj297eBi>2(E;fs8t2o&UgSd;t9IZvb|5t7ctsQlQ=!iDB zEm_s9QA*P&NP|67?wtC4x!(cf-`fHOB+tA73Zl1OK=${q(D(8oDFLdh52K`^vL`d) zw6Uh7o9PA22A&rSLcJoR7Ic(m^U9qKK2~N3;&rqVp6#_(FmPpS^yaQvvcji&syMGM=Uo@mRf2y15 zElj_D5etsn7Wltj#ml?HnE>23B0X5e{}dm8U2)IH7{}4)TV|3R_IYxgFNNTf?+)&0 zN|=W#KY0SnRl=@*r=yWm>&n!Ux?SRH(IjgfbB3_LD`qRKVm$8g&~o%zO-7hc82m?f z58E9XpOb*x3yIo}PJkECIyPpvCDtsu7`I`Qob{T{W&dxo3Co-kmdDA#J3&NImRw&-wy|uulyW}+vPYbzu^P_ zVAZ^A1X8@Sj=PJD>&SjP_*e~<50R9*ETK|Iv4YbhrCsi}h!{#CA~#=)f?cy2w2VZ{ zf7!xl@LYddbwyqIb?-FJ(Dt%?<~4Rk8xiM>zb^sZ9E;EOX#PC}dT%!p-T>0cRqjCP z41boReV;=^qHGfC}e`aP_n?#_bflMkGW-ST7c zK8hNjD(@AM_w5Vd!4|$0;<36Emezn>zq8oq`?qjDmGgaEVh$+PJECJ z^Pf(uZ`MA%eCC#@lTIuAaA3E~{p>$?QCBdUF*8Ke=5;)UHp7}dBo&uLuH0pGa2QAY?0S$R18R`D||&e%;B#_$BwnWei`3~?HmkNKI2 z%n(03SWv8^-+9a%$En0?K5*694A%@dSbh4K|(O*l6tP&p!{oc^CUp6Jf!O!nEgPgsd8NaOZFQn87#J3MXs7X z-HMY_-(de%dFzYF$~Q-3=db9vrKH5N@*SuwjQ4Z9z>x@l%CprQV%$=`ll+VWD!7xc zjQ3fyjDmwjo6D8fc~BUop6*8M*e2_`3E0}2FJ#bfaGw(IRWC6*{mb*>ophxqWZOG9 zM9OjDoqPN6frpYO!Olj6wH<2@{ORQ97#&4*;m1YETsyGxmSXGt%c4yjdFJtZXnMsT zm~Sb0rIGc&&bfOnOm~A?nVij=mm0RD-t=J8w*YodL$bZ+HRBeRSp(2Dt|=wf6dt z(;|<)>o6F{RU77p3rzHu_?-6<;SmewDXjo|8X&DOMXrX4WZL zNH}eX2*BN_xIDvOgUvsusr-BFrWA*joHoJCwh2ZD?4YGpW(o-YZIiaRrn%#Bnm%ZSGRk zS@~ZDhQBSVt~Z0-%!%Q??s%ZS8N9E6H;Po(@_t(PdzCliS^4Ss0_}0|V`Fw*DKeP9 zg$chcm)< zE?6ubcR+;#B&kuLMH6kiLt^#cv+@ibjePK#-EXZ3=?hdY`A@i?boKvpZJ%Liw7*4W zCP5*Vbm6Ae>Q1I<{4UuWx)RRMu1P`&L>xjH(;Fd*gKP|x(l2gHk!dnw_T29ymD~{i z(EQL)FS00vLC-F7N2hgIre*yuxJ~loum$g@_@SZ!V--{#9?VhX(R*m@*Th!=Ea6W< zn)U&%Jkf(cH*e4Hr)UB%e>;(=cnXu};=JwNK?N6<#Z#l@Y=ZQn(t3IJSE~DJZG_bI zl^WC8jP+kP?S&7ch?VvS_P;K+91_T$Jq}_9Jh*Y=zVhb&!^@+x&!bG&9B11|RM*b- zn^oiLLK)op^#Y&9Cm%DH0Gyn(#*3my0|TN#<-l!r)G~A`DzZ`VM6-=h?DFL!i1d`Q z7NI{1qa6GgP-YOIf}hN_hO8}nR1;;VdwcBN_Dqo}xF;VT@hH1NJqBC|US?nD0vdG` z1pnkJSGUnmf(DCWSMC!_LyM9{waEL5HF-`^lH||b+1nC__0J3x+7|HvAv`+pD$ixB-Zrtz-+o$8e!m|vVSJ%f4AL!V2 z-P;E$xe&-QK4T2jc+GrvP^)SH%RtI+xHxTc14!0uhw6F|*rgQoMOFk7gdwq=0Qp{d z_tSyx$`Kr+PK00?BuX#csy`Fl1XdG%z>C@qgexRt)`?~8e1=KyIKur>F1V0_jUIwz9 
zqC0C+j6bhK#=4B3KhDNS>ZKrqam*dWXCEx@y%FO&Z_x;Jrr)8hI4|<|)9Z23T^)>Y z3fJr>1D(t|Kn7t7gjqdiXy>=Y8}AIuk``EbNhBBy4e%wC^)B~VOXWbFut2b6$ zOqQsar4J!IPhw2jGBpfYJljkf@{_;38|o8F^Prhh&@aP*?5Cl5#BRVL2{gKHPQSSv zT{?0aDm+q(LX<-JT41zIl-8+U?KH{B;+IZ*kmy$A}eH=RCt| z7$E*$Z5V*qO_acisqwcD>$wy({7dM+MR>gEoeHo3_REK<+Ki3Y3EAaEd=!<^^Rg7HdrA(5L%Pfxp+4VHo$ocxQgL2dN6-CL)R zY9%d?*PK9D{9rR&r>6(Ry|O>%&hjv7XzVX^?6^g7McozmW-3)r;yP0P&gQ)=#9LmZ zkl@o-Uovj;y1Ea%yoo@74etfM5`3jNp*NPV#eBCq5`l@g*_Iv>NbPuCb1JuN>4+e- zbCGSVZT-lfc{u}+NkQK+gwlm3%C2yoUk_D$wUTU^xj*Gier+aQUH=*kCV^yIEG@ic zUe9ro)pOt^$oP5m^8#M-Dh8K8hX?5Aye(^Yq##bNceb(XDec*+YQxPNM=L~I52H($ zvh5uuOQ3ezasXEVLAb)7_f1P87u4=v1AgP_G+@mVQuDM1EYl6WeIureb!&W$u@JFN zZ__#DyA8Wh1$YG-9ppc+>;4Y%07`Rt_)WXq{B<~u!d0Z?<{7hesdV`1&HOt!=gG}7 zNP$YSdJK4dxX({P1tB>UIo>>F-~Vf0@XWinJ*Fdl9_56 zbtY9_v@u}%coF2TBG9(0hL})Y3i(`y;ebiz_Xb%NM?mo!AT% zH9vZQZ7WQWGjeh=Y8@I5Z~9DD9xT0uLmuG+z2)@r%9@%R;%NF>wx8c0;nQGJBd{`? zKqht|I=f^0@h1SAm;|wwV{8)cYE~ATe-DgcAX&Bxz2YvhxYcob^ZRKxOAMaMZLqc& zL+r+)X5Rl_=%fYTwDnAXty$UUL>6mmvvmJ?-dC``FT(y=euqFKzV@IFGL2OQ?(@9X zuCoNbh%7MAztMqEOj z*aa=EBJ-l><*q{VO~82hp7|MLU3x?>76CT?UNzV42!DA zd%9zQFZubco^VmwVRVh$Ld;@CAR=$aq@kX|m1{$yBHvL|EV}-9Z>&fz{sp3aDllsC zOEKp8$y=>E(TQ6fkQQ|O9<(ZVf1-%1xeKoKeTwDJ6I!I zIBclAz!ITR5JN6w9drR62XFQtO#?WAP%G||?;&D2j`n2j3%MO=VK z+nc}jPSRuhs82YT&p1SJ@l0x%w3pli)$v~DYsosPBeWS!^^Y#<^wJ+~-NMO@buz>m zod$p+l8?+gV9|O2e2M*|%-oX61Kx(O6%rS!^Vz5PyDew2X`SRuO=@avu8Zm?3 zj79s9FB#>B%iAsIks1@A32qXpg5n6rn(*NT7l+!$Rta2@hoJN7AZ_;de116viv73Z zuN$iF$rIehV{5r__!j3RzK3#=&n$Ofj@>SgTom@6fu80>-82?6NNhlyap+Z$+XKYm z>VmaqM;A}oY<5eapF(Z~`R5z%;qa)(TB8fyU2iQ4f3W?~zq-@@R#Pu0@#5P$eLYb# z%qsmQ<(Zu2o|NSt%qI>q@q8fGuUnDkUCg9p6{By zPM6x_;zV1%;)&roPP}TpGdWphn`9TGX(><*#H-%yGa7}9@n!_JoG!lYY<5!xK+9^>Iu_|SK;hh-?8MpqA6HP* zP=f_Ra~|8bkug3L-eVtH&~pNJf`IMZkD-&vVUdPU#aJ&5f<_~zL+&4rbxMt&q9_)! zGd6r$Qhlw)@pqaahuzb%o70Xv8gJ?9&0{wmyE#54Euh)wG5i{-P}*@jp%?OwoG$V; zJOWkPXQMqKz#)+cEnugBT{5%ZdB0*-MtI#t-lL(~&x&%!PFSb=Eb_^{bMN-NQJXnX znW)sxVUSleTS`#pl(5X3-C?ZC%pvMw?J8pA%Z z6H3cp7dCf=3-HDuT-oTK6!zrh~9XQ(aYbhBpz*roUi=c&;gf@CwIg<|ehI78>{FGmC=`7IqXmg9fc5okIHyvP zj8nq)`-)x^+ijcarRVY*6?tF(y^+Q?gyLZJ$Q|%as|@m}RS9ThK;HD(>{iAnu?nN? 
z*yv(NZhFFwuWEZ0G7V(NH3z6P*q?ra?aW7JZmrT&uT-WPo(wpf+UyKWzTskJ5)yF`t@}a!wKuNkAM}OKiiL5yB2VZP1Z+?fUDl;W2Lu)D&jJGKEBgHwS`xUGx-(|)J`&pezMn@Lu5^B$Okl1X{1=WW} z>8*9Ko3RA^7+6X)5LN*h%pa3>EFG-N+yw7r*(7d!GJ(I;=C;EfL6V{1h(weWeWv0|O~J8u$QbeVk{XZVlWP?w=o59ecn%qPoY#>J~qFpni@T z#$GSKxX2(a!Q8+jYHE`@JozSuvJD`SPHfCVZ#RUvpruJeRRbK6>91FZi#k_1_Fn{ON zoCSut77-w42M0cS7$v)v8+2X)Tpd^VlaSO?C(#9pr+8r2)}`E5A2%%R4W--!VN@If z?4hc8O#A7jk07^`_GiKBb6~d(bLcY`I~xNI{x(49Asz=DUy2<7Zdd^wq2HVaQapFz z&X++$8SsS%&ZX40eY=B+>ufNqDsE6K>6g=i`{=XoN(U8`rzstG$@jQ5w zw+8$WsgUf6^Cw_>KhN@$)$?5@)kF1CUw-+?dx>ZFCy(k}M8Nvu)NcAg|GY-bM6}y* zeNF+ryGGA${z6}QCcKP&H7JZ#6O7M?jMWw9#fOrxogzbKXVJLjhey#*qw901Ma$Pq zwc}r`-99OCRf8$&Jh~{=RuR+FUoqYwq;|F%3!G#mOj@2Q?ieRIXlaYiUp_HNOFshd zB)TCz9-ZIr2){j2{pd6rH(A4OVx93W(Zis9-;aULvb~Uro8L;bSLe{n3vfT#o{w82An_2)N6o8c` zdzxUDL3++PL)oM;9jn+EO=2huw%P8vNMT% zg;hbowzFN%h1E@4Aog(Zu>b-VtzWh!AQXiaPs0K7>%|ITRx-P|RxdJ+fb;0;RWo%* z2sBbOvfwrQc0*tyU0|q-Cdn1o5TC8c-m^(_APsx(#nwBiwx3lAI|3|z&#VhQ`n=A%9=Zy-G~@_G|!$_bYAKL(FQGW`#;)RChC`+WEMS& zz$#m;AD@g*x2Ic9cdfctw@}fC|J@h(8T{ADSgd~+u>?5A?1{^><-{UA4ac?UEj-cNFp7tSB&zSr2 z9@GZ&X2ci20SX+E7~9jZwUwjP<}3PtPnCF=7ndAdTo*tfV3X)5HHK;Bv`M)ps^p9W z@W&DOK2ybvOrY%$Y?uYe*WKtJ5BpAFd}RZ6;R_w%CRD@LDrWFxc@^KJX)6~VQ|FFd zgGUoCHq5zY03C}WB<5-0&=EdgG8S!6CO04(%_uvrPG3>1akd|B2%Zlf53jVZYS^&o zG4T|yz`$NpNwIAHF#@}oK^OD)IR%S4=~`~uoEH{b3ShjqShwVUH-t%UfPHIBm5jU> zq|$9JKwnTr)-S>27+G#Dz)R-0)+?>qovS;V5-R_U({RDN)|%V7lWS%;=hY9jJo>|v z4c1;ucfW63sBHZ%Kv+vT^g$`RAKDjX0~GAL$0^D=BsbQ}l56Ha1aC7kEt-++Kho8) zopvRcESV_)^w9q6Fw#?v+o!c7PLDJ@GbzU_;I38^H+P!lwj`*1>!SsDrG#0R%FClT z`Ok7fXFtIN+_1^t2Za!(M`gF~Yp;9NaZ*W71?FyKTCAwUQtR<$4VYOL zpG)gGn?yJ#!n?J`yGd(ebu1k)Ym*SNZijC>#Dy!qDJ`*Yyps>GO*johx$$@4>q`Iy zzL5bfR-d7iaH0~o?q}553bKWbo60UsBz@O2@z(Y$*ou0uk&tv$?b&Hk*eK z^AK}vg_%B{p>D9ea`;|50<`wV1~WU zV2QEUoKS~Qv}(?@Bd2NE`u@r7^Zcd-mnU@vPYys&mhPYW?G*Si`E&kLEs}T-h09-C zL7z!WbID@>9ArK+{fGNJI+D+?E6(%kzWCy!Q}Pf8QP^50x8l=Yhv<-K*%z|%2E+BixJ6}pflt~>_e+Jx+2$lBG_1>B%>E{n`{xrW83<-& zW-s(PkWr8G-c;?RH+n983OoCYPd@RRPyXv9CHtk5uSgEkH}U}H+!|aVG8eal{GFJ= z-iISUd#0QkI}|nDU%cVnyy}e;tF1RsbIY!LvPFn`HU3#h&G`KpC^NEn zSNlDbr)dFjV4qV@*}D{62L5>}=l&a3L*gQ&4^`?)#p}QbX>C=eYcE~BasvfIJ{R6L zdC-_l^+X7GTMKo_TJK`My8`_@_YudRlBj-S4suz@4Vx;RcXRV-uM2OBmj&1p$T0L0 z7&N@X&(F6QQOf;+W|!0Qn)wU86&{vyhBB=_9b@ME+b*ec(oWuLIkfo%WaI*OV%_WO zn_z96Xhl5cy6*hEd2hA6%ruR3KT5unlzW>*8ELwEKjKG~B_4N`o;g;VZ&Y!ty#?uGdKIT=h(F5YK5Uhe`?#vJEsEUd%t9}Y{OgAjWz8>ESE$x|HC#Y0zSbs5QoipY# z^e-DX`f`tkVfNX4MpvM>dzP*gHPL;mpOHnWmgv(oW;buHDll7kP1VN*Kkuk$jGTM> zpr|nIN%`Xor;^-Mt555+Y?5;0$M)h5Q)@RHRX*{lcSR%>^G;pd(a$ua-6YqZR+mXh z#uzkAnF1IPKK*EXA_TxAv>3){-Hrw1>f=ZY_lrhJT1AykI!XOj+zPwf*52FeIH;hz*UOqy7u2KT(QPM9!t+Xlr zNwTnj(KYIMP|zAxb1A#|j;Khp!V=l7!|R^t`ToGdhaE>Si(NbTqBrtMi?9GI9ho=$ z^6Lk8fEs4$k&8^Otwj>G+i_|osBY?|wa*7rzbSm@LE@>xnx1_2y*r#tY4J8Yx~q?f zRnB0Ld-A|;rb>9v%ZIz29D~!+@^t-tq`MTWTbgKe_YT-muB_+7Wf>-uiN5|TrZ)8@`Tozj&W1eh-`5~hnUVnX`6C&gM9t?X2-JT%dY@D zLaZ_#>`dY6=UlRPFXcGdFek_zzwSL@OQpono17pk4)RqT=Ih-iwDl5oups`AWPq$u z?QV(o_~`zm0i}^7K0%J4XEpZVK!w*mMUWA9duMGKXBgdaAo~=x3#wKWVY?j{a=0NJ zBbbNFQziVe(Ddy$C)H`Z;Z&FG&}$h4I#zWDdA#E#+y^)XO#%j34_B=<;{@$L9s}Nw zR~sNCq86_(L(2=fJ8KbJh3)b?l4qQ=M$=Nnk0mA zmmZ0L2K;z>;hXdvz5B%U&QQH4s{%BB3j&eJwDQ#j85%KU$%cyiKksU;(e5!9TcybT zblgyuqB%1eG3!xgt6`*g^n2vcE1;vkatgbJtC{uW%@D?5Zd8(B5Ab*_zS1VK4 zwD|^0iR}$$T!mLZMhl;9^6*3nDasNt1qcO7+OMtRJjQ-%NI5KLRkV90lQ zsfw`dITGXh9JvLSG*O#p1YA5((wahbEniV0HB4ypB870Ch9*tvEg(H+)f2OsExa9H zo5uUh{zg#ndq(gkv@XvO-0d;a(Pp8S$54=~d`W929 zc{MUOI%;7vcSqw+x}-Gc{G5X3e!n=cD9)A!K5Mx3iCPGt-^S zSCBs=!13k>^l1)>9#c~V3VD%`cUt7C@|L8M4wSxH+vCYU2&`TEp9Q1ZFe=1&I)gyR 
zPh6@D1XoX4s+ZGdLAXm;P=ET-5kTzpJV4TmiqrM@^$zH%5};k@+;XZ#Uif_jbeA;y zHtEYQUt%CY29}#q5I$>Z;h|HzzGhLtwKe7>90kpKTJkHVUTx;ttN}3vt@Hje{@7RI ziH#)dRx+thjMW4_EA3G~Ga^B?UsnSBcbBf0YU>uP7HXWPFNQ(J|2l zadEWe4!WXWtCSrDvf~B`cK;?nR(IU*najfyS^(w^J4+c@>rwVuPvx$X7F=8@R&j7W z&vLcLY(Hvm;>1;wypEA-HKrQ7-p(fExlZTwUHKbZCAxNk%*1-V4Lv3kv~WXkm#;Uz zDjOt7a3mzNE`X#r5yS@L2Po{XpQ*|~ed7Uz{h(rzMSIHZTS7TJu!jrvD=%K2Y4Yx5 zyj})GECRQcH^fEEDHCtJAhvDo<#^e6MZ(NJ0qP{S12C4)C>%z0)2gbfs#^(4kJiVy zF-j#~C$qE@#Fn1t*Ke?L-KXsh^Z9*CKx4Vg4R>Wuz>*da_sfcIfi}^39OneWx7*PO zIPl(C-&{}&q5Xbf%H?EkVpfnvd!}kIU8ucZP@Jo>)V7o_B%een6%H2?d;Iq-|9=?! z3aBXCXlun0>5`C6=@2BOK}5QwyQM)=8V969kW`eAkS<~9k_MHKM!HLK==h(pzPjuF zcP$s|T{EvU?>u$R*=L`9xY2%`wOo#jC04ok&feQOP0JlG(BpRV{aMZQU5b>V?ulwWC=FHDW!<^aKChqkpg#p>fVbaG3y^-A7y4&Q9d(J_tWdHzYc3M|vmY=t~ zFYrwV05Pp=Ww#!T7FA|Hv5fdNaA*&xwhp7Uvx6uTym#7&V$zSAuW|`|h!!(|$n%lr z!(sg4g{-D!WnCt6l&WlQNwN0mcD!B2n}MPf1ES4XelZaC4a;P%M5|O?RutUC*LwjJ<*P26SSHiGt@;o8Vs)Ns88fn2KcvjL2qUAQ(X-R3QhPoAli0cHJr6fTFW60?Urc@+@R*^7Q5S|tEMC1>pWL& zKBxqwq8cr*tXp_}CMG~5U3?J~PL~vdA7=O}o+TAhd_(Z3FeFMEkcDjQ2-*U?lu};a z=x3r(r!)h)n!L>hNgMZTw98_*)%7h|+PsLh-hERA{*d|f9S+xzOnWY?32QX0tk-S} zhCHu}ZtCYvX zWla0ae~%WExS6vk98_nQtG%^A;{zbtvNi9s{jW-s zMpDPzITcFC_KVl#l11BgFtG^68fi`+y4fXu*fBc!M&ffJ3BuG~)O)sv9BaBV2KChH zubo^(9%8&ASs&q(uawb5MUJQlRuQx!$f^QR z+@}xc+X8gCr_qHR!>`iCKfhbzBGbMA(&U7=hya~-S%``7>d!4LXzRyv4Zc0Q1=eHe zy-4{e52CNhiFBkuYDsp@MDW*70O%dRUpxn&DJd`olGCg_e9b40}#P{xMZy5Ee#umK=tIZ>N<0CL*`U_av7 zsFLE`;c;4!-T^L8&S!BS0Dll+3vEBs0^*q_KH8*!IY(NYp6b0TBYz9$A8nI7YOjq;Jd$3F^fb?(O?DI?H&Rj`3DXYp&Wvi zFIKGXzMqhN89SBLO*AHY{TYX-dCr6WCnE;>#f46@Cq;HfkPg z1dlC3>u(M;BGRl9IkLYdYOe=!cP+5;ZXobwN(0EE;+Y@}qxjy3FIXn*O4DgQLq+*|DHqk_F7Z8I`83@c8pItk44)Rs z>n$$SXtWoZHl&yt7wkTMi|AhP1nOz%dz>}u(0JYoXn=G$Nk9M{sdUn9A{3bGPvMyR zD_KgZBA%-57eI~-`PF{5QYr%n!hgeCj;$ijwtMjitQz-KO}VTC^6O|c%`UCi zNbfQ0sTGuZChJk3t1c<4UgFns8dwMXdh>`;PfZW;ro#E&ej)eUqqK)!lQyW|l{=P+ zb?@^2^v%FuZZd>(krPx1niSu~%L%^j3H-=6Td^}8$({S}6MWB4LYo6_l zyres~uMqa}_yL&$viS;UZpE>M9!c!W5V`aWWbBv8^44|@;<2%Q{6~bNT-EkYj&zqMoqC^&@P0jS_Wlkjwjty5!e$i8Ko6Z3AqCK`W(UHSAGQS>IunEna}|GB|5)Oo zaCn)9N5{jMn8s&3m%Dn^rKVM;gM-<WNT5M&0ywi7t?_TTT@J*I0lcQKiDO4y( zIsfnh9i`1k+IOjxb7v5lRP+O8HEigxcS+Z1(%n?`M6+=u<(1dkqjrex*(YAQ+O=Z4 zE`RIdg@#Sn>WZwPvqd@EC0j%w`s1G^9a2SIhccTk0;jT-*(Oh4$0h+8K6OsJ@$!%; z(BAM~!b}>N^Qi4ieN@QSy6%n$|HW~~N>ctC$0xcO8Mz^7Bb=MFhA}?ZT7USkdSi=@ zpR30orQ%CI)|8v>RIFzX}K1JFVFT&_CE8C z)S4`J;yHr`e(mGLIYbhVs)7LFRD%_u%yS?Sis>UE>ke7VH3v5k9yMuMt&UgN$hQFz zM-#hj|oK-@$#KG$Oc9*MV*O z=r4bH8!!o57^m841~dx}&w~x7WV{!CKv&mAajM6Q&HCX^S<0j0afV>tt_UA-iL4mNTo0@ ztVh9~iJjjKzs0)Cja9xZw#v41@6fAt7EpMY%W(z;8rVw!z}+kQq_&TWz3 zlx<){9bTNU0e1Vdy6@JV?7s0Qq6`AawfAz}<;*u^1<{|;qoWSqfG*M0R0Bw)KOLs2Vu$bcg4$MV7rph-?h%H_79wF5%7wsD|2ti=iF-+9+pWde?#%sv8)GBQp8 z2Zk^rk%0BGE={TwCS?>ONr(%SIil_b6*`n=TaVck*AIUE1izi2MZ-%-`C37jVclO- z-!=XT=Gxbe?6*(XhuP@4ybdNJtV=duikXJj=XHDhcRaD-af_$fRzX85$wK?9Q zZ^!A^A2oW$`N&AAYYb=Et|Ze6@>59VN(kqjmlYkXyE0`^sK4qH?J5ddM-x?RRIWUE ztflE1kta7qyKdcB&?E9yr8`H8iO9)WPctIcGZPEWcsiIdUOb}CudQi*|8k+`rM|nJ zjKNwdo*Rtve&sqP1qlNbf)v=3b3?Qga}yX#GDnZO*3h{S+9({pX>aFDebH*rWFT;r z=q?~#;vFRV+o09j0@1s)+6#VQxfe@)Bi4Zx zEqIIBMvD;YN7GBTZVr~KSSa0O+cyrkwB0K}&Zlvn1j^2gh`woBVNX}Dajl*MnqSQO zYD$F(sm9}_U`4Hit77}g@EQ1#Iu8hTk zWna58Q~3mH_ibq2s=Rz|rrF&>>}Nn9X)1_X&Ii4ZytWC-UQB?eLWFSipZ9_)89ZB+ z)Ayq1OoT3npv)u`;MlCZ$Y|018zArxU-^jsQQP7zfmJNSsK7p`_DOg@)o~45sLx7J zH}`0y@xnEF2>T*uBHg5C8l?O9mjv6XO+SI4l}>$I>j{!XFcv|lOYr)j+OUod8&t|* zk*M(~@D`uF-UD(Z)#T*lHo!ZTD*fiv6X+AmphSphJb`m3%%(`EEiQpVyaL5+6y#6* z?Ji5sx6;G|@)nWcKk*Undr+Oa;6dCuWMapv>CVSJ3xCRjwq3n6=}Ti5ko)yh+CZZy#dc5U5HshK|vfTKsv(- 
zAmCv#8t7tF6KpRRdZiAo#4R|A;SG2p|KJ4W;E_`k0Fl*M&mJ)zjzCGQb;qihn3!aVZ?%pA1_9I4Sr=E zbS@_X?#=5XP@6@w5nTa!)mUO{2hxhXSYNizh`%P%77-$THInYIR= z&y38{TDlNjwFz==!~?E=XceGIwgLbU58~3ycSd6cT99#ucfQlikMQH1x9xmz#fJCs zr(3`pYu@Tuj#p4(76pWifM-J9?&|1VYd}y--vs<2xQ2{@lj?U--4r}(jl3drntes6 zPFoO2v1f*iT4qGHB9bH9Z+?g4+qacB52G6{GP~v-N)7Zr?5xhaD>yvX6mm{{XVl8V7xsaJ2@Wy6PHO`6%9!cTo=BhQS z@SiPO*PO5^E*<7gBM+oBrR3fg{pI-p|K|=4BJ8#vzyA>`dU@Py0tg+rqioWIb-k_F zw0@xPY$!lOb$+-K2sn|_dmD-$N)|(B3Ub>~<+%(MWf|jU^#-VMqE9=CZ%x5^*wvr5o1=D`sa)fPC#P$*)sHX*NeK+`on zgvVqfd>@c?&`V(l_DLc>o+UiFpRpm{h-SLgj}1SJnHu^+qD_RElpVmz=23C$ET^e>>p;E1fbwbO2V_tRUa>Hf%-VOLyUG?or#(lQfpXX|vwS&yQ(sQj4)= zc6m>4nXXsDZ;$QSDSa>oSI5AlI)JNyhVl!#WRXo?QteakjR`Zm1I{nQ4(lJr_E$fQ zmB6!(zXQ`cycAH#!FQ=M0TZd@?-Kbvd(a>nWw0|VN&+e2rcVoj3_KT1@oToiO>mSM z7UrthmnOYu_j!Y0HA9U{*C~+HI7uKhQqlv0&XC>p6C8{-0dn?gpGDge_M5BR?67W~GSd)H7F6MzT9g+zaUOIis14$sjA+DsOB=Lo z6dny!q?xp{s^Oi+^HHLYcPqZ_LFXl%2Jpkm7utOJh?qB;4SS;QB*6|g_K>P)io@?% zZoUtJQMDiF@Y2lpfaavjwoZUmhhCOWUC$?Xjyvkq!`246VUs(bAqpCs5{f+%flH*Z z0sLnJYd=sd2APnel};nMK6Vz}#gR*;rLLruO4sA|YdEE8ls_Lv%kX|E_X7*0R4f=I zI(bL)M8ZX)sIR^idlEo{GR!!c*nIwCmzJbe#XeY`zmp@?^Q=KxyLc_OAaS#YpP*lK zl{Cuyvb!O1SxZJ&yRw{UNca1QPxNlvZt~jNHez*HaXn$ zSb#LrE>kbZkEpD%>4q1{R5yOgF{>52AK%r(k^4SV(3)jj^8mf9ta#KWdAZ_!nep2{ zzSPbSU#ifS+_4qc1XK*Z^E&))fF44J%=hhyL@JXk| z4W^rrk=cv~SlGF$!r2oXlYJ^3ot_jTxdNF6aK>}h2{ZkJS;mRMjJ(=`nmoPJRS6PHYER$|2-AkngROl2QKu1T*$oyk*H}!Nrh6n3t8_nDW z1Ox_G^>0(ayG@F+pR8$XaFJ{p0AM_ct*K4Vn}PRAl#ZbMCmb}8RN4U2=bV!sJV|G~ z6^rKBp7JLOb{^xOBoe!%RFkdItp;&X-V-I-IVjX9fbKG8adp5&@?XY+)l|fTVnL~I z?(>o2gmQWgHTLIZHS&~E3Ql?Yw&Dof(3y^)4vx+G?^S%GY9WB~7M zfUJ=hn{rKoqC&W!auw4xZd1zsN(DUM&0KiDaOHxDHKHL{9^*{j2kmZ`*jgLs!u!tL z^D_31pxthTni7t_^rPG7F>VVU3rTgbq6^85%CVQeHv~RbYkRj;g&02K9TSZY+!qJ) zP{whOa7?sX@w(1L=gClCW00<{&<&@m1OJf=p^GHt8^2Av*yNAXUuRrkAzH#;-xH86 zv7uW+w&tKP4+r#~@GfyyPh{7(eOKD>(+B^I(s?*0lpyLHeO-H-WQM1YaYSr&%HZ=w zW*-i!ntWO=L2m=bqJ!UQa+5|1W6r#S6I(aPKv9F3g>47;v*EzO(scR$jek_}rmpo| zf9=yPovN5xOz#r$C)7(k9=i)Oaud*NH^Kl~7z$!oUidZX?^r6vEx1QuLfXD6iLwE~ zJDP^`1Ks{&Ls4jfFaR(cv_PKPT4Y)#mL658WRcSU0w?L2RBwu~&3AN6;28~aT>bW- z7P!{|sPdrf*4-|DdaQ5H8qad@FnY-B;UrvUCOg?44-}B!r1a*OVudZSo0L|#5qy5j zI`(qW9;00;Fz8{m@YU^GqO!7q$iCKz;uXfvahxJlruwwx0JePG{Y_sOnWtQExx*x8-x_0V zPPbliQ&BYK_w53xgD?;_0v?(#}B9Tm97Zx3W2#Z0U-s_p~q5hDo1qPHPPW#Q^2t7Y75mF>uS?xF2zf_ zYoQn&k7(3~Z^*vFa8g{Kyi&Oa3*y|BKmto2-^{O0=^WSojO8i6A`kJ1%0<(bgQ#7> zKF4=b+vAhO;;rKo4b>qf;|5G`Zf@U zq!w^p!~aGqrY9{GiMgux>9*_v+NigLNV~ zOPu<|Nr}PiYro(eC^u5%2?dwk)W+5*L`U*iGd+}U?(jLjiiAZblheBhxg5{18#)E1 z7;j^;J7xS;Wk0;H+L@|<_r={=v>aE1hPHsALE_OOCX%tLR%s3X>OQ$iJfF+M*E9^c zd=U&Ob5N}HTR%R^#D!^Kh3SipQu=ZamPFyt4-B1?zCWZY)Mr9&x)Q-1ujoJGclzKa z+2gyvcia_J{KPAGu7Dy_vxYU?tvJMR{~9M!NF`Uh9@Ey_#Vwb2PP+<~LhWidFqfJr zKd|cbz&1P=gKsgtC^f$vEVUN4=F4}ut=r|9<lL&d%}lh#S0U|n zP-nWlVOM#V6p+e({Fx zotgzP_w$Ok$j&9JY(1q9ktC^=pF%w)uS$vj$&3HGNXvnAi+i$(YrHr+ldoNk8BR@Q zc&uP`@b%=g&Yn%hSS5UM8LA)q`G#()zKuvd;*Gr>8g04P=j9h>pO5!PHzrPYOH+#4 z5MGeJ^CBMN4wl2FB5HflDk};HSiUjGU*n6waqnFH#vslutMg2ToAJR7Jil1-_!o!k zB!hb^S`{67N{KHfI9`MvzAB8<>6y2E)Gua_%&k$z`SNj&jQo9ao*_ zT(~!Wwst8{7--|`Y~4(>Ydy6k{Odahun98++`-tZw5$6pca=(y0{{%&3fofTNhWRjK^M~njEbOri!Ffa5p|D zzw}9vvlmV{8MIu@t~-)9A~LIapt33HUGJm#3-w99BdYTc^5U-QCqcXnmvogz&+=r3 z*@ChhLQS)xc2!Q67f6F-E+g!3krJ5N4S|oN`rx2!RBmj``fAUN!3SMld$7A2((%w@ zxidX`LuiCxo$lP(N3pz|E?Yj5Ra8kjjS+GyM>8%yjq*?M|LY=}_UgAzMssziI*Hf} zyF8koaI7ejmG=Gd-n%;Ve;*C#0WH;2$`X$m5`4V1oP~t&Wi5y4nauZ*vP+Abh8ojc zpYl5!u7BMzpJkzBFSz7UF>{-0Ft2h`b7|;haJUSsD~d1axx-=T7Ts$^UsfkX4b{Oj zN;6=y`bzkZ#RJw4$Ox{V@tYhBhQ90d4>c*3JRNDGQHU0EMV(13*RMUqpR75=XJbQe 
zdoy;ot&ZP0&n9kMCXF1$Wp$XX>I))(x7AzzU{P)Et?aFD+)v19)KBqF<^rO>Y8wCB zar?od5ukxmg|ry2PegSFIU{~8w1#1ZbWHQ`<`QJ>?UifrHJ|INqW^w8ONh?00+Dm0 zyK2FNgDI!k`C^4Z%1bd}JWDhrxF_5|jaBQ8;iotK%!Hq8iaJ93EbvP3R=#B=uZ-ot zSQ)JTki@AF;^AB(6*0Z_k!b6y{+Z=5R6}s%7dPwYm_wr;iXpGNmU}oiAm*y?2y*s4 znOg+=OUVjiGWQAI`Tct`XJpb`r&vp$Tv(WVA{1 zSGybr3t7rGf3RgB9KOsbOanzjl~3hVEiZ>9l+9LP;naH`;C=p#GVII&(*1b)H7<^n zVluzuouA9&2~r_CQOI@Y?9lA%q9KwsE_T~vPM(sqecuM@UJJSs0tNE$gg|u3o{!hx zWo2bq+7F~6-+m^DR&}u*(3Fs%(LY`$#;R9UlKyz2-Z^}wM%v;#YoBud_-L$gnt^FC zhlSK{4EiTWvrh%J92Np{BOwhwr*3LUZ=PPmZNhv2S($z4W@BT6^Ssrj7D||;ii$dO zk^;yM=Q2>U3YtAqO;!jD z4)Tv3Fx>Gvum$uUm;PK;hSCy;L!%oJfX`LYXu6*!fz!lJ02+1bmq-MD%QESgB-+Gr zUZ#@)_U2Ze{$PB%;eWs8|MCi^_%o;{W@*G5$iH{qpG@KZb|I#MdPli9hee|Ce;o1u z?~5e|GJY`Onk*mm&LK9{8VcS)w4l{qSWDl*C~Kr+^-Xh?sVkpXIObQAS4s zscJXN#~{aS0#YETKUqG|DCrp>#zYiLuYmUc0@PXrx@!D;sG9<>G+sKUKOhDN?Yo;= zRZr5BggnZQc6&G^G!fLVk}qg~&3O3@=qIQrKvU4qedZ(iAQXRbw(0r|ASu2ls|Q>P zwTK5(zK8U-_5fesi=BDu{0Mz#Ml_YA3;y0xd_(3I z0M#t*?TZ4n#}ShQ{fL(YFm@0dM6C>D6t6UP2bnZKgSVllvN9GlxMl{(@bK_Q1*6>W z(_u^5kCE31?9Eqj5JS5l@of5Xp(REK{HpQ9D^pTEr*yl*Jia(B_kaHsF{Diys=eA6 zMA~)#6anZWYFga?J~e~>P1M0;G(|x4s%a3ABqSSs&#jFUco9RDRsylFS*7O$%_nx+ z_USpFz&}ei!u^=DU6x+tq176>e0p)kWUnz^A_3=Je!-qJ66OP=bey>x{Hblv8CT?v zz7acwEjFgIlP|{nFx6?zDBGv0$hyjk!4Lt)t7p-UU_s6yLMX&aW3&Zi74kUb_X@us zUl||i)irn+*VU8XoQA)at`Yq+I(da9_F|3W9%ok^t~Uf(_z2ok5In;#UcXIp36N+2 z+1YJ!=vrg9h4f#!lD|IUmq~I5MtH>CrknWRd;FPLGzF5Ye6zrmJH`Y%p*3y$kA(CS z>ORq}_Y!7eF~;cfuUdgWhuyCWv3Xo@ zX3iHfSv>eXkH0NdXD*Sgmo8SevNP}409CHj?P#t8+Oo(Z46K;9I6OJ=lQE_3@EA8t zS(@EPMc*vT``IsUpAyc*h;-4CNG$j4#qf*~5VHu>b1y{ z>E+Ct*4*RU`y6jHEv1U3YXLptVRw9o+$9ZfreE33Ipm=-K`vN z8d>zFfh~mYvqi^6&h`s%$C>=$ApK|3rNzMcEgX-${+~5>1%gB7Ple{es=&mC^FUEZ z`wMO&zpZ zXXzT+9;@pD=xR$uFqJzy7jt`kQ98#cfQ$CTEK2OGK1VZOSCS-xF6R=l?7 zFp=EGfsD^Vh1hN&5{9#V4*J|s$bsFM6Y!7l;ve>p>J`|{FULxP|9#6^!WDl`9UWas z)sv}kS3xO(pB#)n5y&=Qb_H98*{Qo&{=bL!_h`~7`yt{1Kbwayei48EtSGAZEB6`w z#n6z~vdZ559FS*}&_|COc6n=jGnoD}F@L^-JQh5uz*3F#@Sm4{USP^2$n>Vyl;Tx? z2FjK^(}K7|W9p8<0Z~oe0lw5@XFB^_ON$vJ8FLLk%iG3!W<4_9*G-dxZNiwiDTzm>~} z`29l(am9gt+O}PsMSC$DPxi@K7JaljIV<*ihZ6QbTZp_gOel4dN2EZGOQKEnmTAXk zR|GY>M?N;{jCPwy{CIjteHNyz7TrURe73W<#(yZ?|9*(e+d#?jMmX!o{hf zezuLu6GDThn0|e!8rdo*78`C=qxLGZrhFvR%)e|kc30*^ev{XS-lIZcJU_uN-Qmt! zm(od4`amPM&)O~56d5?TkIZ~u(RkRgMN&Sq!`J+SzzgF2T&L`sat5D(2!+%*zExT4 z536z{v=dpvtA;t~NI}Q49{eJkN!h);1{LaN2%wmYKb>qeYc)3eWzw}L-f8uqq5DG8RCc{AeBT7e|_J! 
zQ}G)$+pjqSRZA02;+5DX;QQjo@lW9eaf-0nltD1iu5g49TLsY$1T{-6vj{$Uk8Gt@Zl@h9AtWGoE!0s?F5!6bI~ zR$}JUdc^C@2DD4)?8|3!TJCAQt;+qLql?8e{$!B4LmK?;fl`z`%Ks z<%aC}J?!8h0w!}2l8)DsJ%v#5JkCQe?79##pB zlx?@Rr%QQU5#y@&jy5O?am(;bSZOXEX?z5afT?9c>%-fom_@Cxnfnd9Mw4h!G2=&b z%xwJyOP6GAAy7}w$iGOQ|5*!oCAc~3V)fb|9*2UGlaMcO^93=#dB5Rz&R!YHPmDBg zIrIV4$~pLmArXQfeFR7(tsr~F;Cc9+EkVm9bfy$hwg?J)3TOGB_yYrVjE9IOt3aMf zA`2Am;Q_fR1NZMb+K4cd&mm=Hs=js|r`rkw%JlC#0*2f3!rHBdS}j0OZ*l$=s;-4U z!x)Hd#^-W=DG+`~x}`kWIF^g1&`|w#X^|E>4+s|e_}p#gP1n&abhO*Li&ieOc4XFz zG4$_@cmMDg0xtkce=<%u*;rm7w_g0jj`Leh!w$n3<%fi*n5zO= zREahqaY&Tx(BScpKnt8QgLk28533gCmjDxt5K$EF4wMgC!IQIyT%I}ifBVLopkvKc z`VoyK1ef+<2trUXN7Lj4sIK8Ei5zDl%Etpbf+>9&aqvfr4I8O&?11DcW)$F|B|969 zgWMo4B&EE({CUwiXzE6UP*rd@Xa%jPRa?Qxb0Fjvb^z@hZOAXS5oNeV03=RyY%3?- z=wn&|)udMb4S)zj&#lN&MmIn?(Dn+5ZEu0{yHOwx)k>&qOO4QE?$7k?Ps;w3PDzQ6 zEwGZ7w18m3MTuSE&?{ zl?U5=v!K?N5U^&%adIj@arJ}%Hkq)yEf8!lN%c8Ez0jyydHx9~ZhW0_c*g7p7F@@z z+;b~9P?6RE{4chLrOv&=nq*I!t{SIy`HsF)ZeAxfxFj_6-kB){f=1!wZlmHDB_uACbgcuT+jx@;)veO!mX2i;3k$e)*!1L$Ih*nnh50+`9xWSzsnw zdXo59v{1BGceaz})B=uqw`8Xj9{8cS;L*tcv>tEXFHeFnmv_znbmG9@3hE~VG7O=h zM4R0NW4cs$@6+9Ppw2tSZ8|oRcU&3>2QbXuIm^)z#uj9a-LcDRthryVF+@Qc!@u| zgP>I304l>9Dhj_d$BbKCoz2J;j3ut$96PRel#8L5ElAJ{R8YAW=a_&0R*i`BF^~Y! zD`|FP3Gk&`2{iR^g>bI8b=^*cG;^_|HC&w7K&5V9Z%(#Gl(hSQHnzDU#|C z)Woz7XAe|3Ejob%LXf2VHZ#Lbr99fNO#TGs%{D5abiNHVhi6A< zIJ_??5e0&$tzf${HDN3Qsuy14<@pv1+Pj2{ZE2)Pu`8lWIo!7>$v|SZRjmGD71f#< zmUNYwYz^IHq9cWMU7+n<yRy_IbrZ=@y_(K)+NqFrnzEdDQETaEX;jV zX4*ejl8R1AHcZB?S&NBXaIxA~etl$Ny{(behSb1&{Ef#SbXnOO>8MDy@iNr;n{y#| zHKc)dz;#~AOe7Sa0!V42JUF8ot5n-NB{ei;RQg98#thwvd61XDJRCE2qp^we`s%As zbC{Q4&r+BvdjTS&0!=#%p)qa(M8#092}a!4q2BhKn;cBlg1q}m&> zW89)WiL+zD1PjWIExaGCG6F<3S@*GDK5RxK z+3Q@ow&z~NeZR}ghZHI!{OKy6Elw*vI+ErWz`pRX7NSX}^H;-lw?P}Yi+R7~5FACO zJL`s*Skm4O9z`MH4pp9FBG8xa^k#gZG^h4_`;yPw-s?%Aqa*m zosei~Tp(mE^oj^+?h8V*(XSN7e!9{4Cdwaq?YtK^v(m&N&y!qM8)k*#Skiz8ysdij z{^X69{;q0O`B+IAOO4-4^5S}`99}1Vt6t6OFSx{A9Ea-)^xBLIY&q^GEEs%Oe+N+Aipry5mCxHvxz$K?B_oFYIzae7M1 zz`*o(@TJcfK^58pYJ9k$t)lGqV#F5OERi@ctZ1+z1=8>hFO(DMiD4d@-uJ1#%z&Lq z3s|TbbTKtn9k5`aA81oDWK5w*o3s-xfrYEpLxXR2)RHma1dv>vfalTbC9khfsA%Y1 z07fV}gn;)^myHi;s7%bVKNZZO;h5e9x0s;Q_mx_<;#+1=N6!&+dxF+r8W*C~(`5V! z+fP^ix^o+B4n?s*vf1q|g0X$%6|Yh9Sorl!WO;YY9~7EfBVtVY`4(Z#?I)@|W4{rp7ThM`!E0Fe#6G16 z*G?`4O=1lCzfJaj%3-~=<)C({&G0t|iwe-wj4K=3iXQ`tO`}#c#egZLOP}};3sbiA z`8AJR9jG&&1A3;B=94WNfxvouMWZsVMhd}d6~;f>?yvMW^VR_M)9j{VH<7b#i5=kV zgo&QbV&9+kzxisp#D|*ea)^2RzTqdKFQsj>jTdl^j684_a-!6ba$81gJ$_~0`cjUa z36zGhiySY+&JZagDuoEbD#$%si91ah?$q)u45~gwp{VGd%>UxSzUfi1t zp~@jXQ8oZndn3X9gHBn?HyEUdbnnb@&|#!UoTqPbQpZ_t1`J|$P(zpRv@XRnh%{0w z7h_6W=9DTG5EU{EyO7p>ptAa4TzN5UE_EL>MfTEZhfSH0Cn5bg8`?xkAuHkVTAQdV zR`s^(k@T%R%xWT+Oa(M%sV=2>mM{>zSFxP;v{j0w4{8PB!WeB^U}aibpaK+Elw`*B zI?ioTSNrOM;xb>S()*;chNJX>aT)uV3R!yZWEuWKZZO4XK`>+c@&iZTR^G$+`I83N7Ca+q$I0~(t|y0)c( zoi*th^<*90B^Ml5-`sW$fzMhDu1iHX;Wj*ZZ=PE1= zDemu@G|CzlpEjQA-=R!&>`V#%1ns2Cxg?jQ|ICo;&4r=?Th%d(k!%Hi*aX!bgYM=V zkSgBBL!l!s57+$bY)G4#ucnb zoMo@2kwOJw)4pCDL}9zfkf39We(jwVuQ5>D3M{DEryzoxpVHHxZcWVr?MDL8(&Nnv zXm27v4_JWofbTb`ifbr)zm-mhTFnSYFl_rM<_Z=#xH3>-tMG|b(y~2I$7rZxI=z&`w8rL zt+FfMr@@!5D_;O!V->+3>taumoW&+1I<9bHyK9|lRp5}E*Q>_2dbjJ(c-n~_*zfgIR%=T5 z1LXxgpOM*8-Tek=m?Xp}pgamiug8Nxcm`*T#Io~%c(2NVvB=-3oW-?hJE?(_Wtz7+$EO4zJl;$e*r;n^HP z@vJ9j>7e-&EEv#+(8@%AbsrvKf_pV3TI|}*JM|7?O?S{y@!wv#IDnb?qVx~siOQeg z11>L>v&Ufez;w1!JI2FB`5HSr$NI-UVR|tDyqmpM^lGlQ9Sft!iC0m^$8qwpkDU__ zy!R_#N%TeF#+|Jy3Ine-cprt0jX|PNvG50W*D09i*=x>q)rGb14WORia? 
zq?0Ry2$DEdL)G-P+Nx05wtV{tNQ2l!rKTl#=dc_@VK36+~Fh>EYIG0;e3kU#B=LG)P0ObCW2RYHM|>hb{T zWF5!1ytEma)a3#G8|Sw;P47|bI%Wbee)>;MHfbaySISn0F21P0J;TjT{*y59jf3>QFvK9iOq__@E*W@*oL=ABajUNF4AD+rEQxWREQ?{^gAmk zV7p~z-9lL80l91=8zPZ!j>x{kXQwr{x;@)s>Vx1e6cmnQc0LHcqF7I*)mE7_klkYg z!u&&wV$ED4ZBm5^mLA_!+7zNH_Q998m9Zye^CvE7`kL?|Ipa)D!o_eOpn=wkkf^t{ zsz-nTfDaS8wDcH?5#yc`fD3bYnD%JcfB9A!>jmgYg9oJ9WJ(GdVFN*Li}Y*55Ylob zfi|eRHF*;)C*!R)Q=r(L_TdW(wi3xJO92>Gql5&yY7&RQf+nH&R7h3LC$-T-o$ zkWOB0E&qIfV7*?1!g)k4#aU{&27$rw;o1iKfZAf@b@KHBm1{-j^;8oWTyJXdBSx;p zl$^66`2x5h1Gn$)8YT%hy-LP6zeIlw# zPF30Tmd7SxTrd6vm|VPL_AFpfkOYH{;ttL(+B=lG{|E^IKikC?sXr&xzvLdM+`3M| zxVPMkV2|K|D7!B9X(!tI8*ktMR)H1Lh!`OXbxn2`x=a8v)qD+#juN-S1j@QRfW{8)_NeP*4i5C6 z6mb!CrrVP5MGk?3YcR6|)-Cb`#e5A^$p6CtOh3Nn&IM2x#Y1@_i?-iF2B;PJGgoy{ zJ_>!Wns5C9k^wCQv~NjIn2uiFfZ@1=j9}N#@L*#@m;sJ8hVTzZB=g)ZK&!D<7@5?S zJoFBrgv5krgvL;+&*zWzwHu+Ra#I@=%$eFXU>Zfrd!l#_qWix=h!O07I$yo5yj6r4 zgz3Sy&Z4KV@tv}@Tw)a51#bYf3ED|f-u;4%m!pc*`6c2o+n`rZWc$g-pa}9JYBpu?8m#kljPnMCV#R| zmSjjroNxdao76&lwwrIvU0!Tm`g=_by(v7Yzp(*8xxv+j(*4}}#>N<50Hyrhz*->= zoA&~dSFluhxz!&QPJ_8d)@ZS(@z!9HV$6SyXaU2)l?bB17T@ zG^rrsUMSIGu=+amvg6h3#MMEVq=aUtUrz_VAhfHEbzYP^dK}JQBk_&QA~LhxX98uJ zKJsMZ4?CaJ_aOXm^ubdg#NjBB+yjJPPNd&Ab1&2-Ck1|b}r~gw3v%&)-jo|BE(x^gzVdTCLIFpk&mnM zJmMF-q{VISTd$)F3!0QydGIE;&W%#BCL^WxS^R;YL1-bvt*d)#rMe;af-%@J& z7RjC7InpI+m8913Wc~j0*o_$r+~bN`KYGS6)iP_myGZfZOq7};FzH*uFtvF?D=LBT zV@=(Gyx0&bKRd%k`>l~?N1?Vq7Jeo z@u4gCp9R@m83ko*OJtc0kyLwAfDlC~bCvy+S=gl3SXKPgz-#lif&1)r)u*!)zMx!_ z_-$jC#`v*SX6#Qj4V`qsR|Hb2gk6~dfNzFF_qcnBVKKVj9@GSN;%73r&V+n&)YNkf z9LLX}m&1dQzb^WkR5EbKc}X5hHKP*;j$&4mcGn4A46f@p5ts}DM2%QcQ;JTJfp z*URC(p?wEe`c1_%9xmqBT&uIkq045t{`)u~p|%B{{2=~^)35XUSC#sd{lI6!foNEc zLP0E0h)M!|Ds6o`a}EKJ0O89mZR6$g`(FN4QJmgnaEBz=(2q80s@&(PJT<9wnk>ZZ zjlrXWlyQebhU@p!>)8nJ?2PgfmQv5Q+-ZMKrSe{+f&6R_{u!S%fL|A$^T|v<)yv-m zG$9Ngw9$EnbZp_o;TqNbZ4Ex&!|hPr(0x`9XEe?K%sz|9{>ix|nfz!ZI_MF*j#2uV=_yv&BiQ2A#Je?JZAqQ%mv zk>DkT)JWM2Zq;jgG5whX!_ECW94*T-_V+iVlc09KAn>5mf^V8`_TDEkVqsJ6ae z!~sDXq@@)_Qjl%|5djep5Rj6RMhWQ@kS^(z2I&~O1ZkvmC}|jw4k_>2o>$K~zTbDB zd!NTghRvQmJJziAk6)DZ7BEqEgN4MTJn3_m0JQtOPBYTTWt(72P+9at2TG=(%TE}b0|xhKVAay(5`Mk7Z%=!qVuMi?SAtBQe0!4*we zJ zCh`LD8s@*=Ltq+MOg^r!djkE&HaoDJEE7rpO({q#J-PDeQ4O}L}c zNcLomz@p-&O@ZG9aTwJy{>5LF)Y8k}W5%9KxS<)eo3T5|*ES?SthTnWakI?Oa(3u6 z{(DgoLOuS1awN3xk}*})bi zz%K=R9LxFGfnAPJOq+cG0k&owF0;TPg~_umB2u8Gar- zQ!ksM$4y7?!@khrW0MazX}h!0>ysDyWJYV;;naeCkAu5C;3)%LqV~chNWikA?(GFs zx}Nq$T#u=4_)oeonwj77K=w6zv2k)1rBK&pK}jPpFJApb{K- z6*RhDg*&!s&~8hyVq9;#j2z8YNO_i}%oI4rVsKp!!KT)Qls-#pw$r^j=*{A`I~ z-!!&00*@YHynk@Gvb^kjTo8l=D|HBLm=>#wpx6J&>z?hqUF)x?1tPE)KM3Mc6<&NV z^`0^P9uqTDx4(+L*Pq~rJI(Hx?DtK`_M!W*zP8ieVET!#6ZBIV=SSX>ruJFhl1}`L z(rD9hJ)bKMkP-OLoC5$`7eG`5eYW7McDe>oI4IS{n#w;?Q_(U#2W=r$1@F`RaNqZl z^sera^LJ|h1xOMDV$jrH+yHoMirfVjvr2Ok-b`50cBv8|Fr6Vcs2!3bn1gf?>iG|@ z?~w~Tl`C3;E)|{^VTn{W^{N+(4#wloGM2I)PR}o(TTFIY_^BoauJ|@Nb;liF_M{S4 zc2wSCv|H;@BnG=fTrc8m>BZ29^13O3xH>%(A z)A4jK%E*m~MM7;xFcrz(&?CwStKC75<{()$RBC1d={0Jjq)a zd@8a^PpD9T><5KbJqW93Vv?XKJ*fv`{>?Vcx5gz8+OrF-v^+{W9M+W~m7;BA2T1IA zF)T+bwWWL|T?Jp~(rmo)mch|@@56A=D-yA9Lzy>z*Ay>dcCcE% zN>Q$F4;FBPg(hiQ7x5GUU%XJsr(aJw#+1spte&4|UH`Q|Kr@N8-+U6YCKNWpTF{ds zDvCkd5|ic#9&$@`+`1pp>K4$A4!hYsHroF|LfJbrY^}Rog#lGnj$GTtwSS4gz*o+; z1A9fen3&1ya1M!1X--=l#fGYOhY6!WqF8D2a14WU~xe?%v07fm#Lku z_8L4)?hd0pNM1GB5NJo=rUj!busl_>Hazp(PvDguXCswXIw~2<2Zzse%;`Ro!YrwQ z6PlCF#&z-uwwhIXP=E{t8um#e-h6V+Nl;%EmfXi?2=tlW`LC&?WO^`pVDZXDc|C82Xv(>prr}%I-F@TcqK1 zz&-zW5&14CA~O%+DuIG`INweyk&O+-ky5e5JC=tWD2T$$m$S1~C-VHiq96!y2SkL} zE+6dMBC?hw^2`l#C42?B_hMy6D{yq`UUWZ<6fw^XK;fNzakYtSO^i02NtOuywBUg_ 
[... GIT binary patch data for images/model_quan_fig.png omitted ...]

literal 0
HcmV?d00001

diff --git a/images/model_quan_table1.png b/images/model_quan_table1.png
new file mode 100644
index 0000000000000000000000000000000000000000..53bc672246c341dd46dbb7a269ff2b3d1c35a05d
GIT binary patch
literal 128197

[... GIT binary patch data for images/model_quan_table1.png omitted ...]
z5DdF8xDSI}w*CCryT0tvp4a|3!z3HVSCp5&@hC`MCgF0p80Cs(&E61Ui|cvQpo6Qe zSoWte|Iq>voW0D_BX(fpb(=yb=^C2)691Tnv0ygA-qlQPYhhjj%1d79&PKI!#Kc5K z*c^i@aihxb?t7k*V7zjxX-(H6)RHA}(TNLbExz+UBd7aq=eFpPqJ_`D5I%sIf^vBocKbU6SiksaG7jj%$a3yN-xK%RPl{Y65rxel&b3; zyf2vkGj&TFFS94+7Q^eBVz_&n40s-exMRlRI1m8Hf7c)!gZhETCnLi+GuacFl`7I( zn8;S0aQ(YgY}3q1>U`KjYt9jO>x_EapULsyx~no+jgMVr7skARUIL$8zb--h`Beb7 zx|w1CD;o(2HN_^!6B;CK&&3R&62X31{BdMys<`^>bPvJ(D#9gtvxlONxUmU*Xipu^ z#x;B>HUbo6G5U=;gBTCVx|&HHAB~q^2r(06fAgLAd#KEOO=(wxCjIel`$_a~8$ddG z>U!k&+a!wp*x(V9SMWSWyoHs zJXxK3RJ^#A)O+lvmEVAe+OKXO2hE)NXlc3&f;8k>LU?(?+)>`g$DUJ{|-+6p5t>r_xIUOI{{*+Cy;7te6yR@ z#~C1lR)JuC<8jPPiFsz#9RBHtdPK#|R}Xib+*RF*Uca8%`pv(u>UEa#=9Iolzn`1n zc-BfezH{g$6LvLe^4~Kpc;da6eC8#xuFG~tHHahZ(bweabb0G$wd(31uJC+r>`N$F z?N{UOErdu!0+l0aJ}x`^0Cnm6ot{y+fnxE@f@8)NS|ps---MY`B!7M^a3o8Uy*fYU zQcA+IPPaDaoy&$@8Y1_oT)p6XMW%Vbp|&xp6v5tCBv^d@W$^;VhiznrqSIB$?>{oq z%7U1B%C%oTF~rLWmec%&KJ9otpEH^N?CB$i)MYLz|5{QjW)UN&wr`Z0*w|wW#ROQX znL~hDR|@iXZ;r~MpohF%h73)nO=@%T zc!&HfNN%l>x!u>GDf(l~%w~OO^#vWMaeK&VP&iKAEmsMsjkQgU3#lwT+pC##0+Mxp zmr$Ans$TYETOZEKu<17KK3>rKBIv%~8o{aO{TlaCnv>(-V=R0*t%$(?=}qRy@)m}u zx(mDPRu4kgDjcPPxA2SXNFOspkfjAl*qMRe0NBV;0yDqoyhm2F-+hy3#`iB6EOD`E zIWUK&CiVe?N?F5~;8wEaFRSORS@G>TO&!&4;Xhb@!fbIrcYUC(-Bn2kVnb zEjJX3dvo;JXK*@dB8^)E-=(=Jml4f`yPliSF}~p4hEg0;(4(AkCZl;7NG5ol#&BA6 zy2c2#Q@oY7EZRXMejnTE=G*cX9Qt+>E8I=)T!!V@ZeMuGt>eSM0qTY`K45c zUui!@AcySCYO=C0Ma23!RS1rL*ZDdk1@dU_#@DnN(L7WC+#I`i0NiL`jk`G^HxoTb z(c+x=n_@l^Sf^2+V?m?up@V4^sSOnTa&0C_2Lr#VNZ-0+m&$c&TNbNj^PyO?t*8rz zZ3T|18QrIbBq9YCBZ{Uw_0}BHk&aFsZ!{qJL;aRAPw$Mk-;0PD$sgczuiFXoqvGfe zL8&b-M;o6O4HN2Bh>F>zp>meDAeGWbwkjYbNf6d5l!^nw={c>{S@^m^ZkEo+ z^CIH060>-91K~&YTX2ux$Q*srn-VG!{Bi1zr0uZ1K&$#LBiL|d+5MIh1bWU9xk&(7>uo<>i zd&4Wjkp2A`Zm9pcc*tEYCt?U^@4vv2ZY2HzmD?t=WL!?ALT_Jfd>R+ED;{OrNk}B4 z;@Ka8#qPnv*EUnOE^9s`=Kl#Vq1(hsYRV_R1v$0ymtd(}6DJWzEDTogvEe+~ zoz#vNyAjR7&lRwqb6iPz7=6SdrlCz|^llS!O~%^jTM5zSR~h;uFVWu*3g4HOUMn9a zj5McY1uA5hudG*S8;=}DjDo|@0x)+%SFPEqKV7V`B9LHLZAb0DNTWiYw#0chaJqeG zN1R{Ju`a|FCz&ovJyGrPx>R%*un0wLdI|42-Z`f(d*pwQruHuw!dVfw;q`hIEjIC% z%^!!KsH9-dbLs&D9KaS0JEJOkv^krNM9Vr2m+#qJb+hLiBQPbxIH6LuuA+|A?~h8) zOBYgny}JZ3Ex?Yj_^;p;AAQ#E=RId%7l*kBmRNhm7zXR@PW5)eT?8?t!~2p7h#^;b zx+Zdu&aK^NRL3(@en?yBa8bJII*}ON8(JAO-gkY&o%W`zq_D9%Htn-8}+cq_;^n0DexL@b9e47J9p#R=>DdYkR<8?hA$QFV8z|Y zT~ck5HP=wP;`+B|#^~Dm+vP3ak24F^-^yq?y!(#&Wc(z-(Hw_0Jvxcz`}XBeG>BwZ zEN&8^&I{j6d0p(i!$7rl`Peba;YHq*GpXL?0@i!BuVX13g#wzw)UsTU8tM6wv1x#Vw%GX#OmV?t*ILj-z#~F-~aBPZXs>wZ@2I~Cz|sr|9Osu zzf+zE`DeALOrvT`$Ej!C>0iY4E&{P22!=SLU*2;WLsg_!ym)N4JB>8B`AmsO6v$s> zwU=_aDc-7W2<{;J@e2vX9=M-QGS|-)-2DX*BtC5Te2pcv%t$~oS>3`rb#onknG*C;=tXG--BAXe9!1D`3!#5U1P2YjIJF-tpCWL z`BqCQTa@*>!2Wf`Ov6`(v1e)5K(ygI-D(8pRSQVQSb$N$6|I!miYW9F2VE?}{1Yut z!QSEEVs<}~sCH=bfjPm2iu8lT?ycO|0t`mCW&-@U;H-V-bm|Tf5VdTxCzpST*f)0* z;q!0F)^@e@8=Hzbua#P-$|3@)#L4XEECz?&<#E^t$jnXYP-x z1EH*gkPIAXBVa}J;}U(6k-S%gyMKw<^zUyNwe`J%+Fw8s*u5~%xN{$26l1=N`Q%-{ z-*R=w5RWZfj=WqwDrWq{YrKGSiK3FW-vX0Km6^XZmtM38{!lnxj{06Fp}s)mt6&ZH!}=NjwYRXgGH*a;kY74*mgJMMk0*=TZq zAU)7rx|L9mB=luoYsRoB%RL@!?p&e?eQw5W)k3S3+DRC8XPWMOm-pvHnhR?=iNi;& zu!0focLpq^!N}*2TMs#FtP!=bzI*mmd%Tt%d-if1?Th<(^oTp6=f-6kN?b!~+|~`k zmL>tPG0=-d+~IwsU`(gLVDxSFS4ODkcfi={=L70w9A}!nNx93$!&aw=Xg%#tD8;HH z5f5e2d5S3Sb;*!ZAcwY_K1I?q7uNzsQ!s<+t^tD)#ZVi@a-R);zivvz7#`B$s;)hD zo8kFRZBTu!F%IK;xG96*O>(>6fp6NF0okrKuWY^b3X%}VjYAo`@o_dL2E;c=H0kiX ze{r;CI`dkNg&m}cq8shKj}MQ(U!(9=J2^Ua;I;B5y6FBP{i$d2o}jI&e$9GXbN&|s zeIz_S#GS}+p?xq++3F86idHkt4Myzr598Z?e5jr~e`6$nC5hF~a~aVaEJ`h$*DL*T zxueg#UKpudQ$Y0Ly5k1K$$+r?AbxpC$&}w+ry;<`2XqwQq;dQ`g1vus=W;a>o?OJN 
zP(&yf#ZE}Wd|5La!L#av^VIUYe=+?6hZhH>O4z(8NQi7whG12)qLdbB&?yLJiYSmC zme14xkuCn=Eg7}$kh5M+qCh?#+x5;rg*1G)MWKX2#H5RS93uu_eH#;|@{Z=0 zt4KL<{Zk1u*XbR0~JW&2OeXmc#yDob34In*yY* zgNnsQC7ncsXh{sxgXUFa0-b#}N+9symh2eYwcRn(ixjG52Q%5vecEz>U9Bke>$23& ziYKCYzS!)4V;oifsk5cEVh(`e0UO6*_NhVUuI7l`=VD{ZXH6EkPW-=c2_Nv9;_-&~ z;VC}Wka2O+--V&vJ@`V7)?|A#ic(AB8+`Htiy@&asP!Q|^HqX%IhViNG(GZ)ShlWhNpN3m1qr4Yal%LPqPJEbHX_2@(@r%X-7TqgZ4$!1 zE4}kTsvpGy?iLfNUA#A>+CqCmp@K}t2O|)7kK>Y(pCTFP8PM((Q%s_mw~@sg{*+S| z&cR%+ukOGN6)@l9vZKs4U2q;1*p)!>j!{(8P_HtLO&T9`H(rQ*WNA#_9x6?ud`{al z^V5a0=a%E4JHS4#LW0q|!k(7Or6$t1qL@ja1=j==ds<{#gMQk!X4FBN&FdBK-?V#8 z&v&y31M@ZduOf=3hkBryd1Z&Yh@bd4EV`q<%^+y^{N890BY@FRsF6=(3gmtk9QYEE z{4T8~P$oOhFOR^3>@7Ww zM2PF&7X;fRq@0X=$DkxhML4e1o#fV4B1+K#!;9kO&q^{JRlS8;976@0kc6%|O??0J zQ6jT4#mtA+psLpFj{Pz$ICpY{CubD5eJ#!C;{6v!%7(5Z4=q}QE+k75wGkJ3pd0bz z@I(j|WR-3F+Qidr^-7v)0#g>zXT1G*5@U(1tC_w&CRnj@?b0hYyQaaI!=&!I47(+i zEKGB=i>$ZU>2c8)`WY|M2P0Z$`-{JD(ara;aYeW;loPjolO@coiM3!`e?iJn9b|N& zW7iJ@tHAFEVp!z2_Wob&y=gp^Z`=2sWnr0@d7dgVx6HE)p$tXJoH4A-V<==+#s)L7 zNK%GOiIkbjtdOLTNC+kK^lXR!d0wCUzMlJYU+0VG<@2txT5CD>{n(Fv-?rcH`^zEU zPawf~@N-TV>$Y zW@dcM(4-`Zat>|n@rSs8BG9F;hfs3F>MnTioiE7Iz1sC$YeIv-io(oQk~;36@JKv4 zC$jGLWJ4rB(cZ7;W%KnO?xeMZwPb(xDgL#plXhR7X#{JmjcIn+MJ3G7fAahbs-`7{ zFV+Pqq48;Y>Uu#U>XSe%t467e=&o-J>!uD!xC_HnymqM4l%3+<0D*gdo=C)2!9L{u zuCpypd7X$>i}tDLvpuJ-9ZFj5?wb)Hf!GEBHR!&?WZQN@1hx z5%s)qYEn>(H3C}j3ctrlao)_S`NyR79i4CK4-bE_6<>Jt` zeXkoADXNKDs5p*ge102mRwML=g}v~L>6m(U#ZlYpFcPhM5l`gzo)l8!T`GUtiJIj+ z4d)+`u)VNQ85jBJk2FYOK>`t#*`GmKDXYO7dx^!5^^GFMf^1YbW-X#s*%VZUh1{m3 z6VQH0e=?!Ldlmhj5norEuE)_am*GR_SJh-o94ASUZCLt}f?UF?VU%Oip45q7UoUD- zkhnc)BsCw3JtD}b)a|G0y-~JFNW07xtoZ>Wl;5Gd{$-|yrA3f=#_Y4h)a`FfzQoSy zE|OV$`wIadGD|y$0k2rn9JCGUg-GIw9$Owc5)<>zz#`@h!N|IzIu;7xnyw7 z-v+D04_H6^07?t-$Ix(U7#s<(z?>o6Wq$uu2ZJHvyCaFZN-2F|DfcM@$^aE^fsxKa zSIie;&#;Bpkgy8{oF66+7YJ7am4pGx%>M$WFhOk$m6+1-cs@$)z8ZlsymQw8MF&5C z_u@rIg<-KM4YmLCUy5DP#|MfRri`bf|8Flld;@7}crBiNy40HD|9{{A{5miveBO>` zws;#1ETdU?GRs0}i`V#@l=J_L2>t0v5Dnv+_SUmAW!X$_*oH&$|Nemj$?oPv!6Q#GYSFIFX`-!GxFc;ao>V zKOcXy5Vo9u937>$$A=488sAk20#EhYN22Fu@X-y=NJ;y;I^GA@TZx;BGQ{s9+H>2& ztKlbO(v~QZ0e%|W3}u8Xls0~YS|z5pb(v8bgr}HY(92yrPr{{_VRWhZQXRHf7=@7{ zIOBsrY+d4VS=b7`z&!5}rSEV+k^_47F=FOG{aevRmj0<3=3-3q_jmUXB`yeWwCr7~`jf}eYQ0z(f>S+E zQsU72d9qf`0kq0RNDTw=4UAFBEIbf~>jTeEJ2Vzg7zDqkCzLw6rR|On`STezVf4YK zj)33}A;}ul=1n8ZW>04fIKq*tB%<0V0Ctt0!-X`5W>h%s}@t%zkzEu7Fo?h0YG>fN;e|~r9H5+!dY)tH~ z#&$nwPMlkw2wqJrdv;GzQ~iC%Cgi@4 z`<7M~tU+Lh4rrRzsbN;JR3X657c+AiTRy*9KQ84_?|MT%;-xTJSd$QZRo!uIHV*=W zYcs)z@7Uuz_gC&&Nm*^yrz`gjy6kgW77=J1b<}+NyB1b~)lLpiw@jZ@k6=1g-gW}d zAzmM67xC%l!JYx&Ou8s?`$$6(Y%fH_7HP)P%)+I3p-y^P$u12NpwgM2^{ejmz}QgM z=BZr&ivx1^Gjmu1R!8S`6frAFunX=*prx{yVKB+Nm|XV1gg9P4K%1c&noXYr`V#GA zETU~GNmZA5^b2n8PV+dvzUe)dcJbzY;S;637<9kVvNB@R@P%og4bl%;(d=?2wJ?Pg zg;|&watKxSK>+8mDzI?R!ZD+yLq>Pff-pz=;vQ1K-fsy@*KbqtVFF^AU^gCEvKR$0@6@ zTggD=fWS3isip3@Bp3%#R-H>?=8uh34e3#9VQMOa5`iWf-HQ)D8B3YA8}u+BCl|b; zxNVoI6#A(_cR^kjb(ye-2IgXRQh1d`jnzB07x$jOCV$O6?-GX@vLZ-0_i?TwR3;(g zBC}5AgS*>zWk0(~`GTj4L%KAc%Q`Z{n{^2&*^X}&!aF6qMICCtUU$852Z>)D#)>{4 zw^e}?YQ`1-oZJyHycmD+0#w>>5cBD-WNIAH-QgVwyDRTMf6*6u73^cbq>CEcYNa#6X^>ckj)6g4}Nn(NIbtTyezCE^4r1^F$hLdsH7% zxe#`E8XfuS-2jT&?}(5Oshc7SINwpbSaTV4 zsT-L{$k>u80uX7KfGEEQJ%|ptcFaY@m7mUv)|>?aiAn((q#9g3mic7*x>5A7c$Pir z?+2iHH7H+3LyT5}KB}U}kn%5`+Wwa9{gJium>dbcVD29S`QXOp-zTJ z&Rg&jU(i`uc!02^tx^7<9u(}pux$9ACAlyHeQ?F|<~m&d7ni{8C;p_=J`ZV>~nX1zu}2jVZVL=&)I(WhF`Vy@Ys|12=J0|5bgScI!i}&8IGsv zJMdWaf#Q0g2l%RbKPm)ou8hN+xYPfU_2#DhmHeAf?d^qZypu2B^85g6%ma{E_4)uJ z>v>c9Te&yKr%OoT>-|FG 
z3ws7nb1L@LcMwb|*L@hP{%@#ri9)|X15poN@$dUU_BDBAX}bx9q?r*5SF!jBSOZ3E zRUJG&!@Tr`!!3RZq6FSns;=liITRE>W|qDr?9??8h|N$A^6`L0svk&P4)yRXc`nSQ z7bmNGmD9iKq$1RReyPU1?t%HAj<92Wn!qG@#b~kD?EEKU*f+eZhefU0j`a{5cnn?RcZcBxpNsbCc52=bVwAD5{uWm6&$UPo$5Kc`25rI7my=fQhJys z<8{5b96RaOyK@Z^LQ;AHZhan<#+(0T2#`*BR(TW$^hqmlQ278S>4`4O2Z958E=cda zJInmspLN+54`|093I3I8Vj;xxT{GIhZkCz-Tv%k$7f?gu5f9yh65EsGZIa2T zo|6vj6aB$;U+aCHp)|UD(aL$ec36^!J-zPuPPfhey%#1=9B}>yyD)`K7xE-Fgyn4r}c;zj2r||&-6jfCOthQTE#swFBq~UfJ zmW~$PplCG(%yBIa?}bhc1h^%K#M58SGkZox#cGStMtQJIU^~pE}=HR9~P?A|Bc7A@3pPQo2?n;S&+`*9LQi zBpeyr`>BH_O6($_P?Z?vKii#}>wVwsGsnUXA=L3mI8I1s#e$ugeP|0b8Z#C(csrSu z;9u*%r{AO(w#Wm&uVm|1=f!HL-alCj5N30F3CI+b*G}F&IMmHVU9`_P(sc+HN$GUU z57*ut$k+4*1KmHMD&*Px^V8^S+s$%Oh4*f28>*4?1?{Yek(S&ueff-~Ru0%*QddK^W z&Nz>UiLzB$BW46G^E}v5h&2^0K_p#0z*BprmgV&(fcP`x)St^WJY)q{%fkF9>f;|>CVfU-mRNh1#{6V)~ zCz8;rvXtBwqHGexK!F^2c5s=sA52v1qn@W`umO1dRpN5TxZdtF zKnG=lRXWHxT}BW4Yl9oVkdy%1U!MuB z2hYy-$%tMbR5pfo^Jzlv9P6&|qnUc&A7iT%rw+ExBYSmbPI`P^2v64I9FF_7%_q(3 zJt)HI!At>LT7H_iD$2vZBwf7#7kWs_!=0O>9 z{M(3#nU3)$f-+&Ph^qBY?%d(xXSL5oz|dl%bvDNa%*Im=XZDLPT_@1}Y2KR^z zVpBzICDcuFpRO>7n2Fb@Zp+gvkCO|)mUuW~Rj#EqGvMthBs>8;HCtLU(R_7if(lM4 z^0qTZ`kW==K-smH`j-Qx_=F;9iSAVdTqg&6av9jSxEamC!{0#)a#2OzLL7{-bx`qK zWEU^mB}|RIMs)j0OY_q%0cb}1Q4!1^&ggzz7g)PjT>a!2SXa`oOR<|^h^d_<#sU!T z9@z+V{#&?}?+R-3(>(re3G#jF{WOK1|Cf1%V-_*5Q2Qe0m5*?&age`g%$>D`=lI?7 z%rAxXs3YO0zS1oYb?cg5k-iqyNTBb#*J!_DktvdAX7{8N`naUb@UbvkN!quiA zo!i-!gW4QyrS^r7empcDLrD4~l3Rfk;o2$wq#G@t$rk{`k5`4QmY$CH*WZUGh4+RR zJoxT}px+l#Gt%C=851qKAS8h^6PaF0H8k@ciC+|==;iaEc0ywt71eKX+%=HvO7yB*6TgIyR2<_{kCQ>a5VISso9xr0#xOM zwp|&Z5vb+t^qj~Gwr3-o^ll7KA)3!k~ek{rFfh^w;q5qXJ6i;4=cUkSwIo;g=^I1&<2l^CFbsc~cpPgw8 zz3HD2W~n8%5S?R)fq#aCy(d2!9fL{A z;K-!36aFSiD(99IQ{XBgfE@Q98(R8|xZ>?9^iF>9cagx!C2_$4Td^kCuKWu90sjjJ`?;LwUmf~8#-moEUz4<& zi3TD0y{>(6F3BK!b2(~w-j9mr?0X+RtNa^3JY1kj+ACtpcwKuVN==OJOnln(U!%Va zWc)L9sMVu)_!W!%q1$cYy7TJRLL62wNzI9b9T$27He0>7xgH)MsVH!bt5g`t8anM* zV+7J@Iq{WGZyj1w^1x<5$}r1tPQIl=eRh(X_`!7{4tHK@TDmi~Wmx{9=JdY^04?)c z4cP)osq}!r=~e}Z_Yf%_g(KIW2#Td-E~Sxis1+ccp&&W(XqxqoZL;cgNpF&H1zGT+ z(Ba>}e%)y&vgoM$FTj`NvgvamqSWPM=p&!Oef^07W&7vYruqV1Hx=u<_PR}g{gfRc zI~Y+FpQ$Bs<><`Yu!YJ4b21qATK#^$@31FzD7a{_i=4|}K2wC7s}~V&$;h+utf+q; z_mtCT@M@uzM`2lc&Se-;1stfF{RwSRZh7fQLdIhwUz_i0HLlXo1%i z4YFk1ouui#H*n8Sqpx^7lh}%THd=o5(xI37;)LI2%n2&%|`JZE;z?$Yh( zMUiK(?0+zhc(0S1=`O<2;o3p-_OIC_*mA!U4#nLh2pZwJf5U`QD18=c&tvK$nq6Qp`T=IMc8exk+pb~faYbKDgw(I$PQJ+4OhI>(xDuW&{q!`GxOF0Z;7 z>Hzmo=`kHnf->xx`~S-%pr(7v&h_DC4PK$_`D?y!+b7W9`0l)8WiS7U9MMgv&t=oP zl8vL~4OG*UJz1C0rJrC(be-Jn>4;W2rK;eR*o;ssMkt7#y#g4+|HOu%S&Uj9$ z^|&0(zSOG#&a~!IPCHV4Hc=}tz7aS{6o?1!zFhbnaA7ael4cAA%X{skEhzU{ThuZ& zN)MD*TNMa8hZq%`q1ipk?Pd&5SJarS_G@ey)NGTsxGEN?2Rl z)Gn(C+i-auL9zre!UKPA37Y-&q{kpO0qyTOJ3S?Pf<#1_|bv};R1Z_jJ!M3 zUc36)peQv>bd~m-8gVg;bFS{5x9axE!vir_EC}H@P}1MO2jeYnDP2+O#gl(r}4RvkBDZ$>Z|?sx=}**Hk&$EKDR4QE@sK0@5rHtBH6fge(MF z(x%bi-qSga2e0GGo#bQSVb&RU@`yWImc!V3LTIcpn^z3}hc-lRan)KJX0iLAK;<>`^rLbbCd^El@xJw2D$0tO@vS)6wz&y%*6xl0x;;kCGaJ!r z@+k8H2Mg%FKd!8eeOO+iL6ApfGUEq=rRTjqeXj{4-!|YRJK&|5=FEFncf9o_fzhrR zZVuwkq;w+#8UMIGY?s zv_Z-FZP1V^RDMorfdwtCUewqx*nF8`|99l5y?XdcWR&4sXS~2*46p3jJsnGik4!j4 zudC53tv4SR#}YG;m+;6rc`xXU`a@mCBHwhPF{CW*X`7G-+gi%WC;MPsDB1x&@YOHX ze1Z|yt*jPnQ8vtD^B=^2+4YIUl;p+Mi=NT6+dH%_{FlSTU~EOKdh{lrElnrxSSYJ5 zgW!dVACqk37WB{JqqImUxJvKVJP5puQ4S^{U&jkZ;5Hf-1aH-;=zF~yeLTo5s^lFo zgBCE0v%MSdqd%@7Vpa?lt6p3B#zvTP-1N^})`BK7q~P6ipK-;??2_ZSMBB z-HD_^5a@#|(ME9PGMxT}Z^~OEWf$5vDUoU@byD^SJljz-bB4Y1FH=nMUOrJWq)}Oj z`ct)XYsMyDoLKUVvdMO@&0O0{Y!Hn)l@RIc5Zx&hJ~t@rn4s4ZB|9lgtjCQ?qE8tz zuWWa>c|D`|vT8xG%MUTR8tIkDxh~b+ry^Q+m;!D$|_Sg 
zd+(Yi^vT80^NC3<{8ue#+Q#ZfZ4Z8kyo#O~mf^cYf-GG2drbegJA(y3KoTqt(s*LA z9x3UG%AzoE%Iz0rdsM|*qii(29v!baGWh+N0(E2Y4=5NLER4ly%Ke(OO`5eCB2q2o z+&wGB_su3t5_%DKAf+_Ez-Bi?`m8>bt|1@x6qP7^Tir<+Ci}mShe@N%Cx22J?}j!P zYUzD&4*uZrRRqljHXEzo(>}c4dH>5fqRJ|OX|%0HD$$PQr+5EU?JFJL5ZbJxf6Bil z(OeJsyDqpkpgk(Zgt;ANX=fqbR^)!of{97N~+O$|MUnvJ#K#-MF{koKDf|SFL6D(@n z|9D5lAQf{YYX(%(wJp@-LratzyFc1OjYklJ>U7-a1dzt6FyEIeExld`J)r5dNPlMW z){4$n3z4ob7!6^3CJB!oUxrB`@H=`bPCd*mA?qm_){YA9)8g`tJz09|=?9kU1R*xE zkB29=;zqC(6q~}_Oa$tvg$Q|yUQVhrqeoBEId62nkgEs6w^}9jcx-6h@_OwgQ<+?){fVL<9SeMZ|0_BKscp zEd^>#_swKF8b~0KzLi7OxfgBej1UO1M~z+tGGzAiMh! z(K6K574UIR%A*M55_iWjl3Mnyl#5kv=idhxZ|Vt=xdpv|3MrOSmvt41Qd0Fi*xz#} zjQT|7ZE3Lorp<_lgxacB!zle|^s8~r?o3a7jPo=_z@gfc{r8=xt_Fdqgvp4C_q{51LEu6JV^0mf-Y`$= zRP?|g$iX=gGf7z~znH0m`^ltNKA3PkT9SuxT!!oRBmiJiPkDWF*aCibC7Qa8ehL*k zet}`^`Z?kWt}4pZsVL$@CrNZJ`Bz`EP&5uBeedg?>Gu>)DuINbbFM~YpTF&YDAMrA zwrZ@KxLTVJ%sP`hUO~pU0nE1Ob)%f)T;96Je-^T?o+c|nscT#C! zMU|jvu6$#Zf}!BFij5~yae6Y-Ocl9A_^iQ$W`1=rAk-?J&QiqmF{PExHrHSOwSBAM zEG3#9$Ns=jVo53Qu+EeeQ%(`St;H)grbd{W3x(#j7L#B`_7_h5V5$u_iNa5LfI18@ z@g{POJC-Ehy5t+tZvaf(S#xl@h-u2W+1zUW4sPMM4J9oW#4k*AHF#Yy(z1%A6LmG5 zoRobw?fNMVPk1G_ZD=utTHQuww?Yf<4Epn7dBQC|NQm_ci={WzzR|H!KsG~}RRdlL zp&MLolKc0ely;Rvo_>S$M2=U4qeW{9&47SWaF8TE6N)MlsZ*hlKT+GQAB zrOO{83<{*m+gp8LpRBeQJUv!Dup*KAD-h77GX5Nbc3L?zx%VvB}LFKge| zLAvt?+*C$OMXhS@pZ6TS!z`vl#Jhc4LmPI#!DttQH#O4n&_f>M<_Vc$L`@-I)BVd%6$!YQO#m}K2(S!G;||UCHfBiUH9@i5 zw|$z{ws`;q9cqgNygEb5%N)tpG3HmK=lcF!nH$J=f|AbNy#f{VGwmBHadbSQ6MM96 zLtpd?s8uyl)Lj~D_1E?GV$RY$%#IR~Vx7V%f!_J*2jlZ9j**GEe(%qY#J!#3b|drF zdK zGt-{AkObwZy)oVQ)7{e{bs*5-<+SEY$-~TR%lUax&GC@#62# zNZRKF{g)X4TR<6p{Lm_5$|z=*zeg?HjF18n>uGU(@=ec1+Cwr82KxHwFx8?Bm8vBEb5QFiJ4EZ%7@$p z6ot#)%zKB z2p~`z=~pqig4q}?Ml-PYzS>FPUyX6VZ*OHdR1|Z_xTN%UE6qPX?C&?Z@1Bb+itwLr z!k(cIKaPLM%Jso|-kUkG?(!9P8q3&Qrs3I&QF%b|^!v`;W!IdG73e{z$=#ZRwoqO$ zA@7(W4}Af9kWOOs zL5dXNc$AJnS7r?!5>Ls0c}U*U#ZycBw@NSYUJS7H?{9mxJX-1S)u-h$kGz+yD(<*N zSRKb$(2obJYrZ(0&$MFd|J_3(`()MU-&%lV@RPuXFejR5Pjh+;l;+j!iJCD~m5%kY zb)K?Z=PKG{N)20SzT^Jv9m%P?v68v7xk+4N=~3T7A?|fz7zAa+tK6i=f06KPcl+YF z9;8ON#!t{>U_s{pjzyAJ(3|17+3`WrQ}}H_G`Uxt|0c*QpYgw#Nj4%k$V8~os(%^m z3?;hRSo!M4!-a^~H`fJXv@dFuEF#GoV~?N?;IsdQ=myV^;ve-iD$N+5<;-q9%ivDx z9;buLURx1Bk70QAdA8gogWXA4%`W*Ea%?WT;t^ z6QQ98a;^$3^p$n5!Hd$v=A2_!CcC@G?rzr`(?>3zv@lo^QfO0-v~~IJpt(M*gC$;? 
z%h;K}W^uYVI7}@iszHccI@s@TgEjU{L|XiyH!Jghn@I&T$NTEemi^aEDy~-}ZXrn+ zhNj7fPKhufn(6oqQqI8NkyON=xwI8{r=R|WMtH>tyLzY8jNKGb^**$!s#GU0r$eH( zlP2Me6}_!>qCdJvCc_7})CUPCB}NGgyoE*hpVe18-#Y*b*5s{5LKH$gf<6Q0^14$H@J^EF$!+znK1ai%6{I9gS1+b*B)E z$bGnwZ7VIF(tDjVC?b`s$^8|lk8ubb?++)wT}3B-mzG7foC_h?T1RHfPQ{R_lbVR8 z*0h+oB&dJBC(rVi*p%6<6BmQIWn1|2NT>MC`iVO6osUiXjwEb#HG=Up~wtErH zw?~B$&xdW-W_eKH#L0$5Ns%)`>$?yb?G)H=mDiH}gRWL#<5WucB+ z-BlEC+V$bUeQjc1=U_QKU`xMeK=GpD+PsAiCg=M6ll9l!KxakzdSYMrZfV-S8}H6( z367dE1w{pWk7GGg4_gnzadb51ONITP%T@eFv@<6zoqHrbvt91VcZUhQXo>QEJ)se6 z87kDiq$csD2WG0gx13*VzikdBCHe_bVW}@_{N*cusIw;flZ`L|Dz$n?W8E%fXI-ZB zX2*zQmkXrFKA^>h@><#DwdLOR9Bbc?X>J9+b#bCYXXDA@o~g0Ie|?n0woyXbeS*XM z*|@Zj#mrOkCn^tOJNH)a=bdkgRvV!!PC;gz*@4z6yW+VltRPRg?$3-Yz(VTw*A1I} z?&cu^)$4w2b8tPb>BvEY=gs`+iCVy5BQ2?r^-m{8@+A#Vf48B@!SC3sOl53dadMz` z;5KL?SPG$;aqn;KH1iFgpN2@=tbj8rTvA_~l8>NC75b|sX2TZfd!|atUv}xkj#Bcr z-6)+YYE~76MNzj6B#)grmcn;LqTDP)+ICi9_7s`Zc-3DJ4)IjC4R_0p?O;xiS2xC} zmcj@n(J&`}3 zJkgM{<@jHdvM2%#`;P~X;0p<|skmZq?Leay$R@)hdCD$L$)S@9Q9>fr&DbQ(M{9Na zCybCOHdX*f94aF8U;OEZufhjLS^hHJs2$o#9niK?EHZFAk)MxvcFNCUaG%l*Nmq~T zD6AXR-J>dccmKra5bCsyQrAUD#G13)$niKy7hE)OQ&f6%r{bJ3b4&N9SiU=2b#acQ zH-wieC%_p#bJPn_+CAbQ(M`#Gnkt;{FvggkCXx?xEh|l^`sTsD$vXvS#!Pokuth zY)^$m#JirBut@0$790GXYd5cwxoc$S5i2t@CQ0O)SPEYyHW9)dWu;_(>|%S{pDJVM zdg*(1+F|27OdVjFcx;#H(+ahEDd)Z#W}d|m!|!Se z(OZ=HacH)s-hlxA!;f_rO-~AHWV8@ZO%iIds7W|V(q+^`Ijl%5AH4QOn(mA-VG1jr zo~8%amwr#aeeDY)_KY53s^aj!S>x+1T%l+LMYdaEX!gSb4(O`~&S1}25(dxM8p}ng z6=92`)s7h-DVES6+~vJftM>42a}g+9^x;q(u3dD*&>>3gm^!a}xc0GdgD+TArpBSa zBh#gx{n&pX?#6^;!L)%P(n|lIh2WTQH$L8Q?b=SwGP?gj3a){@qjn6RiD;?wi8MW)_=ZJ7B}5`w1J1 zfqedWTr-%45idk9RHa=(?BxEByJPTLF%G_=u;wI?#HA@kE5hsy_My)uETVrNj2VUu zXh?_?sfJLJ*&r@Tu^K~zv3y;yM-)?qPux8CqSjRU4pDzPM*qADZ{rft3~%H1@ya>~ zul_N6d;qCJ4$U7*iP5 z3=&d2u-Dx|G_;Tbny~lp{V0Z?Qou*hYukb;&ZM3I_A!h544oz&P_x0RsUr>biW>`{ zwCud~@hkvn1|L;;GlVJi zLUF}eOaKfj;dS#^)VpJ0YQ!C<)<-%PPuf#Td&T2GmAVX~IEm zW&YwOoc^WpcZI;dV|eq>^sxwFdnB++>VZ-~8ngKxb-sltS`>xET&foz@{&Prcdi)L z1;beQRuC^J1-V`K5b~h;mAhK=6)Y}QFoi(!Ckqa8XwMkg$-YU%kV$UR|9Ie39`m+l{Ed&7LIu=ymsDeVCetz_YEIaZz&mmfy~zx z@*+6-4Xg;IfzmlSqD5mi?`9~6n=i}}5tEdqH>Y`BdIFV~>a;GcfjEu5U@Z0#+83ekK-sa@$)hEEfu&rx`zu|pcp)o)B(z2yP6X{J+4n;Qf{fe9`MfhK!P#L2TMw; z;>>NsNhlnFg*t$z!-~m`SnXo zz+Vy$knE zWYnEh-8l!)Z(7YDZbQ(BvS1UbT5;M%>UTThb>4+NNK{wMcSQQb_9RvEDE9y86LGuz zh~|X;C!>ej;5zBdpy5)?fQ$A;6iD}JcCAXVkl9$2LR6O2DoE!>RoB76x(MCe zp9^+1c!kgeov)vOpY|1rXI)pBy{vEyko3*tM%opZ{{d#J z4Eo66v05k^*U~lUdA-_tb86pc#KLC5`{Ev0tgb^r=UDWgemJQ+JN4yUEpOa*X-F1K zv4Q%IgQi-|_^9^9y{mDEhoeRMXSL6${8a-QO|6gfAXLOiBk{LV-rr6md5^D{R}iSN z2OQYCK=@Ar~V9 zQKTV8F#Eon&X+1bIb~;pcz=T6f9aMts|ily9^^uq3p{1Oi-L8`3H0l=8uySMhm*^b z8PbL8NbWMsYfmYV6pCOTmcyTysnJp`&J=q(Y7uJ0vnQ8VAnXK>m$Ptwl?Tt1h9IDA z_d@y&4-$hjTJX~E=Y?w_(PITS0FRBvTK(IwaTu!UE3_=ysxZ3%Qi7~w$;$6J!Tm8M2VN9e9zmQ^qIDnCrjsGBF60J`lh~h(%=E|qygOunal-=Z^4N)`gun@bFuq|0U znlLjWl?wzlg-a2LJAw92*wtyux95nEJ(+TEQ2#b50ORQW>U#k(n|*+PT1#n4rV zjmhVp^jnJr-pHUuq&d5zk#ac&kllzb%?e|)9o}&wj%vWci%a}(DKwxv!@}_O5(x#{ z149TJy+!^VThD0ivG~Xr&H5YHK9KOn zad2XP{YBLm?)r7AZA*j#Yt!ED~!f*sNAt)}6lA|8ebjl88 zr?G_W%$wW2@Wgz^QD|JvKIFr-WgoGuE{Hfq7L&_zc*Or zU;fbcO#WUdfPhcGr0^z8pL{?VaxJaYE)Dy(RA01`SS#uD)eMU$z4#?~D5s`76L?$B z+#5`2e@)-hhk&vOv(IhBTd9~aQB-wK5q#=L&0a9jY;N7(q;`-lZ+>JJQr9wNEXcQc z=4qqG8Zhqb6px^tJ((9%@x9HTiTQXb-MMLoDYCQ>=Li02&nFiqnwtEzqIp8BzEu|S z-s0mo0@xOkiNJ(BM9+S(g)MMd$FsIQa6d^R-SiZ4PTuWu5%hd?5@q-{h~Tpzyz7&% z6$EN7az8W-1<+<99nqkV1Lvi+Cr|hc-4#jukN0=kO+S=TpklgQVv3&aw{II9(~_TJ z{ZuF0;;E#+rE%LfS@{Ft3wEoc|H!4@z@Pf2xs4=Jv^=@L_~OG;0aB7y>Wsvp*p;Zs zT#%TKiPB&Suo0W=qvgBuGlI#m+e+mN<7MNM 
z;;uj9kf<@OXg2&shGkNs6TbQb7L;LeXpa#VIIZ#+q+}9&Fmd!BK53M`giM?Y zgFmI5oFZlK;AGkK74YS1#0KLCMclR;_m=L;Bx(lKvu@3|C?Y0G&84~xb(H!)exsgU zJ#a9Z9Ou~jIK3;rR$sBJFPwUO%tp}r05~-@n^Pk~rPR^6$v0@_Y@qn;n7g$FVPNJ> zYu;AM1eMz&0>SGnA4bGSa_s{dZ+!hi_SmQUDQV|SZO57p{4O*oq?C?i+A`^WOCw)M zD6VJBvVoJ3du3h7(IcKYQIl+P5s0{baIxJa3lKw0SFjB4Sg!q9`ImT~-9_sg#p6B9 zJ9C~hh_~EFWIGLXAAu1s6w+v98MqKF-YfjFAw9&}Gja z%&jzz)w!ph1|Zd+V->xxEJF51+tmk4xA!anEji=UMaCe&lyE|N&)ZQNL;>9?C(g(( zX>@(Twz+ls9GO+MXgXKI%7^J`07G4+?I-;Dw_5Zpmt+3xgiDzIHa}}%Zn7(%4;@P` zOhr`gH+VL5gE3LJ1*8*p!DhqX&-=G><)$&2`(yo@;-AF7im2=zHmOcUkz*)G!J5Q& z6LUUr9Sln^dQ@=j!Ua~Md)Hc|uk?6b(mY}lWe5hcNdKLPYgaBFJ)*qjWxketbGNNi z?RFQ3KbISu@rtdLZIvW9-q)ebIL=kb%h5_OxzfREGfZr$r+|K$?icSl@}NOAUuJEi ze#V#9UGgl#NEb^8qYTY{(yZhE1Uc+$bGWCl(Cg>O55DxU@Ti}OYup&n%2#OdJed|W z07(s=?s5U@Q@Y?5)`$9OTwiuwo6ld4nn}F^s;1t5KT@jjI7-iiu{&Dm`P)ftD-P!k z#;MjR&GbmiMB8!)W%veIk^OF`(WSgBcEkXcArNG;{^pEpRdP(I95cE=p+;tjFG_#X zJVK%17i}abC3Qqwl|syxjc^;QfCQ0~-8~7umYSH~iOe|1sh8hJ9g?4Ocs>qVY&-H{ z8+IFVh&yWw^zfpQ)MYkN6q_DDZa|GZEi;jWe^FKujU?VyY@Qo=^k)cxDQye&VAQefE?LS*-5chBG#L*=dM-# ze%pCUl)w28oWg50sG@jl(s$1uAAgC}`392O%GR*M)M?;tIf2^Z7N^Lr#eE(gutY>s zb!+_6m<%HKP>Y^!#E6)3Iwx_BnE?rT0VTB}sHDP-m9%vAr@WQ$ye5@=~)(Ks8`pqQg zkJ80XP1h<-jfq6+@E!{3Bka9_-<)|_)iQ_%s}wplh)I2)gqrJy z>>)RbVL}=kz5sHy)Rr~j#WRp(24U2_;%nY;-^hTY#wXr7+-e=oGjX}_X8}5>32GOzH&;cFUo%TAf)@;b8a^{{@x{nR_1AmxH>?2r~D2OKgHuTS>zV*2Ugmqn|&IvwtfUBHn)p=FUs z8l)|P`h<__k!R=CaI6a72@1VtyjF49fh>V1`oLx^az}A0ih)Z1{P-wuMZR?xDGQ2Y zwb7P~fq!Tdsu!ZyL@(Bmg|neft~iRGFvz+>!ld-8#@O1hC8RTow+-?hDmLHm>yF<_ zmFn}BFjnO=#1M)01%3=7J1Z`5F5fw$DJLvhq3mA3DKpv#C(*`?$6lIqmU$$r%sc0s zGNiC-4DoICdBW>CH&5M`-2DSKl8cn#ox0rxEiSJmgyrHfLrPq}%#%H>^=Q|X;=KUo zO4Dp4>;mBSr;Q!S!^e2<$S9IVU+C+yxbM39wRmLy7$PeVePO5Ja5ub2IonQv0XzXQ zIt#Ta?YPzn%bTa{+c69C6btAmGBeNSSHwzDESPgS4XMzYoRIuGcPey7#Ezcy^V4)Hfm>Nexn$2+662DsAoou+4&q6h$A!*pB86v_tYPZ7F#sK)nCLM56p;O7t*kd99aJ zYP(%ojyKJ^7Tj_Q7~HZg*LQroxecRmyu#4yjRD_#DC$zFj97A2Yx0mmfzt046${WP zHZrxa+cPLgsPF+9(?#O&&NIk3y%G|Z>c$L7aZO)OlVaF`J7GdqqPtf#Iekn34aGt+ z9lnR~s1L=010d3Byt-=S`*u+O(EImvZy|6+-0m){7{7%ENMqiS7$3kHyw8<^WkFG2 zF^UP#({|^YzS0I~=x8OD5S;P7FZor&`BKV^ISj?XY=FLd01sbuzh7j2@}1U?qMa^8 z+DbhEr6R@g8C+&qUVT2J7x4>IU7ZtLR6)|Ipk$c{5@qws|AzY8dIw??M8q84tK`kf zd6d#9uQUhW^wMPDFH|3GF@wY*sk~F?EQ4YQt!C}K`VnW@{Cp#Ar=jjuy;Q!Lc!xqH zW3uk$_FE~R?`PsqmbWxB@G-5~nzxzRP5DAu=125pvq(3n_KeuaN)QrsFQ_E?^6w2* ze>xqe(3kfSi(mQ5q8rLB`tu$=9@noJU_4`SvF@6~);iqTmLd`()MHjIx&@#)zZlAA zD2__Jo2C^x(@1^2+?uY~R59i!623CJIsz$lOfLDCpE(pzp2$l}m^Eqyds+ zDoO8txFf39xSzQEDW$7$S;_P=!nS051}>x99T(IGk}n*5 z!IPS;@A=FGB2e@ZSgQ+m?k%Uk(VOs?>7m8A>y|PiT+5!Ptx51)33gDQnvXfl<0_oL z^KUJ{6$*QmFaq&?_i+b4GxE?~+gaR_-g9g3y))l9HjF|O7$QO$62@7Mw!A`2M%3aJ z6;9;#7mB&+sM=B)8KlJt1>LtUZ z--r_lJh^mmo#RK{-i5|(HD38;wNoBTar5DMWG`2)D$i%if6V*=Q4>r`Cv971=T7!j zhjOD4Om-R5m36@3b+&o?mgPMgSU%n*c(D=Ag@tr@H`svbZ&MouA#jZM63&ldxqSc9 zUMoE8wRdI6X_bWNF+sGjx>}7D`St52m@C9cBEYS71rb++zRre+CKp-4yDoN>e6A&Z zRL!Rff@}92a_N1$(YEg=jz~H6;s;su>dKEcq9aI71cnJEb&ya6gx<$mqrm_*cPJpP zokI?=7Taj-n`nkgWv>Pe!(|`YrM@*L|28k$5rc8+QdvweKvW;xpRma#Cf*R;Y_<}8 ztP)y3vpajF8dM4#Rw{Ia>?5B!y|8}roySq+QF2gS6@RA~xsD)ngGADx-Xz0F`_ThQ zA^=yN5`t72^@eQPnN<%TE7b}Aq=wl>a?zw{vW$Z0qy=I0nZ99~t=);G`U_19?m2qD z^8Pq;5ZOCzE=!~tcAyytUO-ZSL+bia=D^XFztImLr7c)tt}Ts!LI}nD{W3%8r`aCW zT5d(F@q<#&f|=UnkL=v1>skcvpR5p{u~QJ%xqbO5O8s?G>X;ZBO70foQAsc!Fs3kh z?Y)uZd|~>_1rZTVf(M%2k0C(2?ntYtQjg8JPX1nUG_glEr-K*;J(|dUtfunCpg@d- z3X};s^f>}Zaj(zX&Dg4|DN~^->Wp5(!}_qA6`9cXO6SZ4kFa0%8f4$(Z0*8a9|66g z{V7>dh!FXdMuv!(DQUvFC^>K16X^3NXh`ZJPFTD*GluXX`A9=dkJpLWD82G>`w1G_ zKT}Uju%VdzK+hGfGfEY1d~??av@YcFft%!qc}y9B6WGGKJ=1r&gIKM 
zPEZO^UdDh4CBn+Jo?;m#k$ry&U5Vx(y?=nKMhbQIjj0J z_-yaHN}@<{f2PC^;zj?{NZy+L##W`u7AF#t54hb>@p5-q+~x@;|$pEynGX#B*v_d_Cj`uhM8*b z_9WQYKC)a2rZBydofl1D0pUwhUY}Ij_;uX~0%ud``Q=C;&rlF?znJb*KtO-U>lX>` zZ?H1ln!hcB7vc*ytt$taWJImpFX9O(2yh4jh*Q)cN^cjXBRIZyyUv1HEz`l=AYpMJ z)4d9nAP~Dl9oL=|M|m=W;H5R0PyIbYxkruC1dH96C+ax)pNH*jBVF^h?q9qQy7Xyb zdQys^zJwn8it&#CCfSrj-QjYLu?(zRZt`4%_pf9ogttv1$o41D1}DD)XuHlrPLas+ z?x6<2iF|}M95C|>Rvx}7E02<_8{|$JH*b$9O&Zg+lHbBee`rj`!6vKfc@VFpA%nyCLr;|D$I zEHuQJ|BgVvo}Rr67HWRMlSoNJ>^qWY5WiG%Gs!_))QU%glhzCo?T>WnKP(k~p0RsQ zi-sm2ZUde#k$pb`Z97&dlkb5McWIMl@t1XW%JCD};48E-TO(*Q<*Mr^TP7nL#5s6P z-V!q=UsN~#Fg>o8dk8s=wCF9u#DRvHTs0c|yykM#7bj~E*=%9k7I^4bkyv5U)He)v z7=4g2He)g6a3J7Rcgn-^P_yx;n{KjulM>VMJ#=jy!{qimqtL4;WM~xcc=W~@^PamDu&N|x@+#3EGS`cT9(XtL3`gPXYOpn%G^p-GTo3FGRT^;xX22&(* z4q=$0z0=}u9sIdaI;kBU-!ijUD>nxgFpK2jKyZ|1pz%d&iH zATrVC(67Ts7qMgE54|E638|RgUu!_>@scIok9ONS$Q*mCNX!dBP5#f;linagkuPTv z3<;~Xf?*rs_k7l?e0qNQsF9bfqD$+RJ9vqlb$9;K8(&N>6!*Q2;)nJN@GyI>HW5@c zT%4<9Q)hVf^6!Q41m~WmN8aD|_p=|$*1dBE?$6BM z9@;~?kwrL5t*tW2xwqP#7d`Qn=xzFbNKcS1nYWyhgzl?7Dkk*1YQ5K!_=BEmpS2Etx4Aop z^{JF{jRps?)k`k9i8!srSI@bF_RpK~=6&bg_fJPu>T#r*v(XNy$>dIscARzRY+8b` zh#kcIEIo@4>T^|6@gi7@{K+gapCodA8in4V4k!lh!a|mrIswtSN|h%(6-bc zH|?{Qd=@No9~K+vZC|_1IjY-yH#W4RkaGLft$0{Q^(~n2HyVH`z$kBTVQSu3OmT=S zx1V4T&T6oRoSosxLo_8>T4lX{t*A9uapc*LIB3)@E72+oy^jW!E~iT``eo=Q2#^E$ zpXxqkQPex&QR7uKTdr3f6+X#?QcZ-ZndF*iKFO}v|D@He9U#sz;HU-U?J9$1`Nl)LA%U53Xm?6j2|&n&c;O*xubZLOjYmxHOnEnLR@y zCrCxJ zFH)tBX9G|N^P1wsVe?G-y+wNvm7Qjtcj&pXTko;@zweF?4eD=ZG76=tCE?$0KSi6d zL0ExapHT|wo_8w)CWg%-;U&+~d!j3qF%G?2RIcMH6EzDZ3V!*}dxF$*0DHn9!<46Bw@&aG9pcjgS=yymRAWUFD#%Fz=kH;kI z5&mpHtF(w>7W4~Hd$=q*|5g>6=?#6)=H7n3m% zhzZU-W6HyjAgNo7AjN*|g^xq_bQ_1<^J?|ccCB>BL}HE(A=oU>SandjMkL-CB^v#d z;V~n`Mp|GRJW2^=AzV%QPO%;D#q~_V$8Qd&O(N;0DMKu3In#Tq52kNEuFzu;#1yz@ zp<-4@`*Q>-E{ZMcc64ROZa3PfT-?~Fb*$ZMcNfFViEukyb0uVD3gEI3(h+*!024ET zM`2>d;Ne-(y}F|$mxu?SF)bL7H!crvJfF&XBeb40z>b9=gkzPxgDaUs=d@SzPYWY_ zVu-rSB_-!g*~e7LE?kOW2B|?2zcbGR#NmGjlf<~zd8{bIcDy88tKR3@pdYGSCvn06 zR4VthQesW6FZ0vAED4KCplB0*>Fhq8lOUrZ;Pyk^{wM8G3*t$pkhJA;FY7@^mJ~I* zG+TD-B1OUk!n&&nW}nY4pnR4D-6d7MuQD^sg^)>PS(%d zZv~fyO&{zwXNijHc9J$XAdX5G3m+o~(OUv^junvnVDSVX4x5LpbbAlRfgmoB3iQj} zF8H;$an3Qu-SSBC12fU*#loi9nV|h2#O)Day7RrV$M?^$rrJa$l<`-9+I%jZz%4Z~ z39V6Uu2P9d0E@Y8cP-n~`{(xeDT;RZFJYwnM$wX9K*yD19j;xIPahtBF8Xd8wSVn7 zhFla}#DE{3PF$xCE;p;Zt%w8(_cI~$YhJ2cV9P&!{)2LH9moA=q)`7;+B;<1;ieJg z=I>#wBwX4cldFc3@x(L~%-#DMFFhM16q2LXP20O}KUy$>?J~b^Hy#5z4~+qj6*Ph-BY|>%GKSeoeMAgJ|XJ^ zWHCNT&F7G(3z~MFplxnPC#8YHU#$r-;{zi249 zB1#o@Unq0E=Dr)dL8RnVdk6D&po_#4Qm{P2;{68AN%B9e2$MW)4QG+Mp=23B>3MIi zx-p2es|6DE3i-4hny4+8Orv%&=Ppw*T*m$c)QPU8NcWR{+N|^yI*LUMIvw?|)u=&N zg(RRnZ)R`e2_mn8&tXUDe&;8816PS`YSdN8Bs)i>B0QVROgN+h)M(~HNYUiTsK05_ z^lN-<38~L9=->)$rjeB}xIf_GsB-jbAcgQFma1psTMG4E3R7#oB)gN1l}m@_j8%K> zl(*~?av!;L`zK@~9-Lm#&!HC@!kD{;W?8f;T%y5I+ih!H252rx^mOgbgoWAH7apM)OYRBnc4ZX4P2>gw;dBopud?Qv9HSKY8N!%9QS!Lzo7X0#rHG-#hc>{u3;s9w& zWYO4xMT3N=a|M)AWL%gvYkoSrX$zpuP+nPUqKusN5x$$1jzz-W@$=T-0THy{^6=Fx z>Y2O3=O#KEJz+Jp1-UGBWT?zD=tWI&X!!JGNM^ez|LK-ZdZNa8H1YnIC4)Lcz;%h9 zfVkH*uUT6)_A#)o#J%h6tZ!eu?2M5xp}MzKh$AbWNp%(`$F^$rp~iJwu~&wxd5bTi zWGDs0bj%}ynziUS-Sr#?tEjicn4G5;;{%PXa3)oeVd{DyZcVNtCzZ&gK2k>>Tg&JDl5|sBE?KyIz7@`9D=&pB~Vz;AcoFp!~ggJ-vYJT zOo#WSglJRDZEJ$g2ND~%LX>Lf%h+(a6H468S&Lc^*@#(ub$rowlNcY+&wOww=v?+o zyj}RzQixROy<^<+Q+b84w%AM5vuA#-zoT=Vhvn;ClxtjKExR5MGvw{}ytu^ED|e^% zw5ucOBTd+w!eb$_ArjA$k^aQs8{<&R@xMiCTjdwZ%0Z*67?BbT! 
zGey>Y1<~w6G$WJKP8jV5wHuzg^+q>!%}(kz!zsVVC=li@ZxK zjKX7cw?msL{NFF~8D{WZ6YwS}__F?yIaVRXvL|W6U#fV>>z_-RCxj?JWPTQK$-)Xj z(E@fww3kYkjtwmnwR-oLdkU!-hJ@j9Ey!P*DAX%|zumP*DO{n87Kb)1HnApLC9wf7 zD2(OucGow(cGt~eJHls|_IrE>h# z>?$20PxL}D!z9lz5#aVc_^=^|meR=NeMUd?57)ylp*KSd0O1slJSmCQ{O8~3O%oV!;`$BEtb4)nk+K2rLmPaf)!MgrzcQ(oj;rJX z3o-SgCd#eT5GOjCO#qF&Y40f}(H~a)ebJKCXz&Y*u{kLGb!=nzS|Ou(n(}aB+516E7hP?_*ALFw?JeAXj)n)Z+B9i(Y*X4bmaEWXAKrTQm(2 zHl3&bBT|GP?xMgb3aw*-u4OxVH9EW-l$0gWBZ=^zeS~NC?<-8FDtvM? z!VR=%|31$oUtpp2XA^3PB;R^y^?TJL=4ovV+7ify`FfIXAKjd6&nnjA74_+diRO6y zwvhue&_)eC4FCFhKJ76sF$qxNQV?prxTk|R4KBk12Th+Lgo{m}v;Sh%dI{uX`A*rOLjjM6)`4vA)yIK#=ILD-yos z_~I@-*PZR~3cHc+HE||#oCqBjZLWZ-9&Q;1y`nj+k40CQ)g8;ImdkZbeh6!zNi!29 zQVsRGocpTO)Oz3EzX@mANo{#`oO8`inGXwQs##0uk%-A>Fb(TUGKXGSGwL4HTkN1~ z^ygXIK|)P%9?cbIOe)`~F2jl@ zk6&L$7#CR6A-pRuoJ)8SebomIhH_9qrh!xR7_+ZsKY}TScH-lQ^)`4PuBRdeY1sYl zO(B!p75sctK#K7S8f=U^Kr*)4AoCDWBw-VyE06kTrYf2e0k7nwCp?_MSfe(k)zCvLpp zciG~60vV9W^?JnVJU=?({?4^ux*uyjiCL?q%x7x7td=@stv1I?wYdjo`}55p5amL0 z<(n7%TC4FhD=|!Afl#f_KAC42i_YK@JCSN6Ef5W*tMN?I%?W@!0({r%%=RtJ2UO=k!f?nOkVC6^#U+m6~ zFmHwYtR-_Jy*T6rT>)pCwd#^DK6&p*GLnTcA{8D?wV2*qm=+uWRk6|M4VBBzv88Hqi zJi_*&(lUBZpnxU_%AE}G?v^rx%FqjNa_4QlR{gr{>G6Im9PE;Hp$;6T7juU6&0~T+ zGEOrf;YlCgSiyr)q1E(w-A^_es+N`_ULbRM0jd&?Qm9a}AWapfwjo_d3si-_Ma}Th zXE7GW8OAwV zcYFP0bDaBeGZqUYG6|0~J{14l0*F)90f0$&2hcBEKmu~K5>k1L(0C=X73hxFa~F+# z)zqdV#M(lBTe+L8t6z*WP6~uEVZ&tB00Zb&$P{GgesQy9%!so#`k$?nQWPR?1oS}VJj1L{4WbYHhwS2uvHg-f={`+h@%@{gpt^^^tZfC zHh}Ovj^YGk0}jQ(%wG`3es{jbQ81(rB8TTx6s@_7G_T=|ISCkF>6~zRMIKdshipcS z1un%M1@HWXGQ-x(zTi(>XpJVKq6X=-4ZG0Oc^DJU=y+5%Vz!*g+Ka4RW6a0$aT6z5Md}T5G^Tr~ksNc*tid6S#`+Fg!Eo9LU zJBh01FI^Pgz_(8xyF+nk>hK5Br%}k<#nv3zxuT4xZhEDkpX3j)hf5}u%`4zi^7ndM zm1rY2oBVg*AFw`It`c(3)w@sTJT4Hk(#r4d z4>$6foc(Ul6R=&DxGBGG-M0HRs~>s98V!EF_7ex&PBi=ux~d{%G*5S>Y7Sy=Xd;29 z%jyX{bYdj5ykA4wJqy!H%{!Q6Av^Y-d94v^;%0VJ)Gmxou|0rf6C+8x5tW?BJBWe` zZN+XIL1U3tBuMheSpe&G*=3=}pi^QT1tT<(v7Db@8}*e85hd;$uY;8r5OqCY&;Dm7 z#dmxl|2?vH&S8fi37Q&_QqbIud!53_WP~;+x1A5Wjyzus7a59_f_RMiy=um*1I~z& zb*%h!i;jywY4@XD?Rm7r;p30#hl` zUdA9>jrDMl0ab*`O)sZp$8jVN25V_P#Ug2u$(ghe>bB;dl^PDEk5OQJ^Xn_@V&ttZ zr@ktTQIO!A@Yb_F!IiN6_V(>|K4{@zfXrp7faWXX^~;#MHQXOU<@yu~nS`!yyC22| z1-6!Z>X>|I(U!on38Y#|@LWl??uDa-6H*|$=6>nReo#V=Yj@D4MK!-Lxa}!s>D$rT z@P_+%6VkeqzpK``22p#|McIuNIX1Khg{FjSP<7s`_reLR3@xQK#;JbTc~#me zxx}0yq>Zn@<^BXokd$3U+ZkmsWoV$YE6^-91w^TAuBx6KNSGF(Oi zSU&4u4nScAcy>88Yq${-uZh*<&iurzPF*WDWgujT=U{U-P2^*m8d*kn>uNW7N2*Tf8@=hk zzgHv4^Fv&UjdYG&TN! 
zO0Oq)a+#k87y4fzy`|d7C`L5e(9Zhona~4Ka4osD>tKfINg@7-y-04_1NFWA479|- zT`dH|=nz3C8W4GX(H!!Nk|l8_aA(cQlA%Rtqf-JSSRGCYGF6+B^?xABq_A>%ugzX- z42D0QsImQJisYMa?{6$cLiw_pwCt3+nZ!tozfeo2xyg9tF*(t7ZrzHvg93jXncZYn*xC z+2iF?ZuZrI6pW?5aZtzLX0lAUkF884|BZb=)3lk*6_$_h$U|=)U3noco3{u3b3QA9 zyrW1vkY2!Qx;0{isI|pR6Bm~aa2GeXif43~16VB>5=qMXeYra!gm508r9sM=M}x`@ zIgCM4Ve9F3v;@{dR!cbbuaLBPqx8`qgIr)iB8iptwIYX^NC90>$f+)e7$fDayKs=KPVyXg_ z7@6ZLn|Uu6pe)kB$8Cfi)sRwST!aDr<PM&9n82dhlH3*PXSkp_Is^Htb}4Q= z>>HEJavma;bz7aexMb8w;`bVnVT&smYLA9BnN5O~kA3a_XyTd}7M88Z=#$A{jUaK} zT*SJU@=b3AjrsLen4Y<#@N!pE%%ivw!PgeAQYtNmw(V+U^V9yfw0QT8*!oSkiP|H* ze!|yU|5Rf8OOC^BSIS>@ec$|w&V%L>A@c6w z3uLdL8XL7Ew;nUD)8-5)vCJX3Ux3d!{Q%M#@geFBJWN4;4}R)Z5QXJa@fyV~+htU& zf2Ik5K&j+)+ga5*FCi;)fJ>bDiDAL)m;;x`2?sx)^~-My-@{aN?a1%E6q3m=FDm1+ z$@(p}^=Zf<3?=u6KjgW40j;yFL%AP%l@4;YtEk`cj9O=-0FO}~Bl}jg` zqRf@q2H;e!I%DWoU~4KhBKagS6c}6RFRHA*g7l)o+z4VXIn6^9dIv04fD(aqUzBpG$sKdPN3K^>>KHvGL!GsJDMa5E)Ix# zLgoHeG@PA#sJwqQ9M{tU2-_Wsl6*Y{m5unXFq)IQ3}`Zp5VaZ1WUK{I~_LQD0Sv2Qy6!UEGtloQgAA z&mSK#8-IVE=Q<@uPhFPtjKqLRRqrmw0nbl8vK75QJC=*c_w}gsdnJ zoEin8e}P2oO9`F)!uGP>?0NKggkUhjbxP)%!GnGhiu#z}y=vchvY4tM9gDQTED#3l zQwfxTM4`_OxJGZ&tWCMW$lOMxQ%HU%lh6||vv%xcy3XIQ2;GOu(Q$AX$w#5t^a{`_Ptp!wpSfhp8rcCI&l11F>FX2HyQ`2s;9~a(f){Ju^xe@$ zh2W!*hmRD{&cO051u)s$sk;;{SEvcAqLCI;l{L5arfK(tF;gT@>^T#TAC zm;mm_Q6d8{2^cOXF7aE|L_Q++w-R-BHUIzJe{N7QdMTz>iFZFSlty)p! zs9i2heRmJvsiVX#|3nKC4}W)CYBo&rF=j#<)zF67c|G&}@S%=Uy@1)DYd^Cb1l`ka z43?K_@O-asBNrOEzlPR5nI)x5=iXTi=^1w_OxuOrJFE91ZeWgq^d((XqI2#suHr2$ z!V$pZ@;y~?Qv)vWcC==QtS=sd*}rRdC;^ zrDG(Vxl&DrDh~ZzDD_{^%mqJFA|=9fzV!Lt>nmt#c{qrGf#eCSDUKKkGfK@4)2z&gTkj98#@H=$VA!cYQoyzy1w zDA=&_%Kqu;bE%N$As7UBlflwFX+%R)*PU^EyIn0GtCmjAIjVgfJ38OPy9Uq|}YiAZ)3*UbS;>^nZKE{WmgC!#rGHes<`}N)DL`Z}i84 z2W(qmp;xYhrp-*NXOxQd8qMqaLBsvmFD3V|65^?bPR%art8IEpzbCyrb-gb7*wP@+3caO6@z2;aRL^WTI-;CN2feC9KpgnSiV;QS()i84}hk zjY)hmu10L)7sVOU!ao{K4OMpYBC4X1e~4X zx8elTdEiuPMtM-en0p*k`W`ZPTn{Sakv#fvVKhv<FWuI4LYL%`v^Z;_n8yKiWS$GV*#o#@i}iE30#CLf>!7I%i(su*S4 zU@_rLa+r!qmM_BEzO&&_5HuanmP1xXwWP=K(En#0CP}qB2D}f%~!ze7-S;!(% z23|k{mK#Bmyzl(X@q9Q^Y&^Sy!ko*TRyjeH&UT{0-J$wKi$6e@><$+N!LQIqnx2Fa z5QSqCV*h}Z18J|;zEai6B7eC^LzOT*n?B=yy6rQfbc=6G`*Rcqb)J1~SpjjEWFl@d z@KGC+4Mko-lfiN{-$N{dacgqJy-3P&LH@=Ah()IoUAcijIZbauofM_O@ZajAi zasV1lvxNL*$QcWx-4TU!?bvBuZUG``T<+R`X%HwLUg(*4K@{+fh?P_*L^{vUIDRKtWhFSS?L)vG$~nas{e6etbfF;w zpuH4PLj`D@ZROP}ba;?QOm@kI`mdX+`nXs`GCCbjr;JoR_P%x%!Gmg?6c19fn75U> zG~fB8JQ#B;I})3kt02D;vi(w)jEs~sY$t7jDPqM`Xju{$r7Jqtu>uX$AXd7R$OLwFckyESlD}F~i#9hT6XE}f&J|KB#{JiNVPfLYV zWR2;Fri~DEI`Y=*I}8ieY11JDu7lwNhqd%e5A+RA$0pr%YsS3M3Vvb%Ulx=oNrR=7 zUrI^eKdfo}T`Vqc#~Wm}X}|xVb~o!2TDMdboq9LK@NrIHV0gZI1NUdS!@6v4NL0M~ zxu4MUy0IU1n88xC{OGtGfuQ<}CKq=5fXgepS&8&(#r}!(l<^2<5Ur*T_Aw(BR`Rc9 zE@S)i{fKq97OT`XZ;TC#yV2z9u21K{yV_yb7ki;))YQ7_6S9%3Qq1C?Z0DPw^J`S{ zj=xPx>zQqHkQpm7Q~{Lb8@utE09NBU9u5T}##2g8HuCODhl(#hKW+G9PA3;nJX5r$ z^2MZwPNmpX5)3bmDZ97IaRsKo_B);=tkDi9*VsDN9gZsIXpkvx8P=IT?yE?5F`LsR zSxkI#T}=>2eMEpG{&AkeIt5o1Ko+@#v>B`DDRTY z^|gu`iU)J>;AD8Q&2pPtnD1w-Fqj$rWx!wh0l4Z#H1LIa}PpneQBLzwm)nGUXNpppM4bu+c#EJ502|OPB#sIKnXF8jEs9qhC0<07@GR;D;Svu<)@i#-N$;>_gCJX#8L*_@8K@I)iFC3tj{8h@#T(7 zu@Q7`wE~B&PCd0(Q#L;w2!G#3z~hp8*WdBg1f54vu=YVKcGADb$9H+boHo8m1hpmb zwToZ{j>L1%o}^oaA|xRZknj1{9wZaHl`L@Cl)NS zTxZXeR|gkv&5{$wigD~6QzX_}emOFIeCsXMG-LS<4W3SI50cq ziEIa}PQk%&N37k)Nod-+c5Bdwkd+0+w43|9TPrE3o7?hUQjhJ*hg-{X{~+V08cKTa z>mK`RbAI-E`zs+yGQ~w%?M=e6+4uOtSv&|KLY(AT+yJ3_g`9+SZmUlYYHlSeWmyS5 zi6CGmqflH%W1RTYbN zHTK!!JH0W3k3oKk1s1GFb}aZSz&!nyHiZk#d`ZEx{q02>*jlJq z?KEp-qv0LQg@BoMofZ4*y^g6hXp{TNckaYRsCj`xTWGoYo$l6;Z|}l!Zv+&~2P~3C 
zz2I{1^(8BJBEaL~Tk)$Dx{l=FG&$-&%A>uMrNx=(gqFp=icdlZSQ=7|t3f64F&4mW zoX-Z9dM*Z$f4FJ`pRI~Y4(D0VfoMGI@p^w^=aK+*hlzAK3X2Y7wwn+E+r~*}^!$xQ&YgHrVFs`~&1G*XTU@#f}Wpe1cN~ zTCb>hlxQXz+|n%sFXY9gX;X(T*?EEXJj>W%YSid7>usOOg}y(5WfLN#(_S-UFNj$t z4G+P-hBsL46w+{8oRh<4bz@7X)Wysza{Mw^m2OW4QeE%nE7TKadit}428iarx1nr8ju5^Gw)&W`8@4MAXszCsu zq=w*9>R+kkrOvWpelUcT5bOBvUmHjW4aMm8CY9@K@>$Qz?FB}}nt%4GtNnvU5tJ*O z*)HnShno{a+xjeS1lpSJYX`JZ^0J7fQtUE-U%YS@TTsfp=4pV4%eCjD1_K}QZR*QO zg}ab5gqG`?#Y{k^M~AC5Yxn&Y2OcyTm;L4mLn4Cf6XEqUK}jhbUh7M5{JGM&OatlY zc()&s-T#4MLblYeh)2=87lJ5o>7o~n)j#jNzb|d(WS*eHVOTpLGZRyQZj8jdu0Liq(8{OHJGvcbO-UTP? z?B^wVz2qx`u3x*N^Mm#*aD=!PObD7<7!9j9JPgH)8B?vcf0MV?E)&`l{%SSkNU+yk zTy!+%t3!^Yo3)5es1NBsvWzD~g??|;XcT9ZLA2o&pM7Ps{)*t0RCCi|8DGfOPZsGA zejq`%QvxAX%R{KGO+M0`&ZiV`+N=?OYn0OH-@y^TW?S?hQcxAU~)3y~NF<$6BWg zihvB^lI(5%YV$!^M3ZjfhOB)4TYxssl}z7Zj@$vejSa_`k6*1wC2E#Sy@_6RC(5T{ z(qq}H$K-@9nofDC?|wZ8bC=XmYVS@k$`s+xMZ*@sL1R7Y8rlPAAKmBmujMf_@u9L> z)mJCUdc9VNb%5>9p5I(LV!3H9zpm;T=VDGUKZ%5Y?Af-w-F69~thiX8+vo$wuB;Av z>_2!kr_$6cEuoIN_+9!t5>OuBfcYJ1!Sf_xw4^;K=kMJY%S&EY7C0GGviwFZQ-(Xv-|JWDRdo{y+4Q$28bieNFFl&bi!v<(=+Zh_WNG4 zYidSU$GKMW7UT7qmAw0(oM|>3ou&zMsd}w)9N^2X5#9FbQ9SH#oC{u{K}DafuKvLt zV*9@=K)+~*;G$V_`II={C-R!BWcOCSgWEfD7I;<8jk-f#Bty+ZF5TJm?9AdoIzYsQ z7=5gyy`yJ#$5$QQZ!GD+y7NW|&M5JISBVjiG1v9QuHfDSy)U(*KW}wXUCp6h$Ief8 zGrym=m|oOy0rg$J?Eh&-^{|; zL97#duKn&2fK7C>ZWBr{hHPm^X$=2dVsrnac6!6O1zIYej35sjWwL|&QCjc$ zLym8R(Bb%#i8tRDem)_dxU6=GiB67NwD7mowZkUwcKxt1`~$RN&ZV#RDfh-g3|4gM zK%j-6PvxBIeH-Yk#>JT3B+r~mr|-hKvt-AWOvV$PvQf5_a%!St6QV>T=fcM5Utl$5 zegswP$=NZoKYRUAf5~kf&N2?6C~rnBKOw?7^3=YdyB3Z15}O90e!hKw+3NK~j?=+0 z>qMnzktKHp>5gtSNacv{gHG-msh(V@2jX+c%5e9m=L`v{ls%zWuW0HUIY#6`EMOg3~G$v$pz!QDCx<@ZQl!7uQ@gi6$~TM z&5|)XlvK=;%=x^g27~B-ZguLKw$KdEp%EH;fMk7HY&ccR3yd3o32*`gKUevC6IN#r zK4J=@{DZ0+##zc66oWe`K;J1zt+Gjj^Mvb$p_mB_0k4+FSt}I(8{w=4_Imf*SD*jj zHwgmD7y;+j!waWi`d2<2=<7Kq+Mp053HI-#Cm^*R%ez*Qp+2;7ytF`p76=}ciBh(z|{Zpr==pWt7TNz`IP?a+Z*qhXzkFX&J;r< zwiO6_`C*$0(sQgB;mP>8^AFxjPha9N!K0ZYhl}8a<&Wl@!|XJ`ZV)&bgtN>utf@=| zqTm>?vA&P_=odoQEz2KN0@@vghHxe2Pmf)E)Gi>Tb*a_QMX+R4#J5!Uygl&B)qs?8w%sf^?ICu0@GgjzXf@c-!`<#`XvV0aq9!ykeTjhSEld0P8JsLBdbg$-c*j zw-{7}Nsgm&Y4pC9m<95!j|#4se5WU%674?iXg`<^jr(unjs69;>D8a2u&VtI7zMm^ zYn@ZvZ-f;ko4{m@%!mT~5G(Y+3(GPc=lMr$!hi+h6dV9fVYk_>I!f(uaA4fWW*KlCwDL2~~K zWFYHhpA8*W4l29M-@(xMS`u<0n^D^krcG>%?@8Qy{SsaZZvb?|eTCQ={ahdEL5y*K zWap>L-*Y4m_=$%2Yf(C_Lm~4HoB~z+-_!$(&-xH0__5;)?vOBv}TNAnRb zLV|!#XvkI;f=}ZFg=%#BbY-76^?+N$UlG)V`{7@KcyC}*?pV%R_AT*0!|Y%FR8oQA zd_Vfx`hP>)O7O{oFt@<}gtqgLS+`E8e*Q1h=$~KdVWFvFXI8uwRr?Ljx`_4dYe^kQ zDNR9LfZ=Pku&pf0)$1Hw0BM#ZG>>E8+gE&+J^Ha)cy!ed=QV~#XG8gr<`+|Aq zkD}4V-4=RYP~cPc-jRt2p??S8$XY==M{O+G>#k^hK3OsPV5r%PSULk1m$5~ z_o)!rVQk%Wz2809ory6rQvH zai4aM02qWq`H4Iljs~xgI`|C!{f<>Ax$sF`{LUd4^Zs-UEI8z0T^**l7Wkc}$2VbH ze-Hn1{n-1YcMY;xQ0`QOr;>Q*U^tlz3li*WpDl!X2X*uk?@E>#D1yu7gPQ>#mdT1t z#cyt88Wi$Y?4KIC1hqFmzuE_+vK_N>iph-sAK!e1cKaYy*$L3EIV4&mPP?`Mwc=2C z^(xl#a@l01pCJy-AnZ1SmS^VLdX|ZzO8J}j)F+Y5956^Mc?9S1Mtc#uCnfozL+SIi znYtPw0F?`~3(IUjc#J4rV)y~4RjX|VBNjIN}bggl6mk*;PJf6y!Elc0C_y7ta_Ydsc+pAUp=#^9%jfokB3+F%&^!3dh222Uu0e7F7gCW-oSlt*w)*SO2)*e)Y*jYAU#uWh)#~{T$jTVOU z-D_lm9NaAFE>h~S-CJmT0r$j6!S9e_1IZDy{iQCmbzXtFVGzakd;pIB1z3SH4Hg?W zw8H&o*Z`Mq2Kc`cGObF1USopB09&m9RQW0m$|JI|s2d{8s)F;utx&_DQj`UCg&N|{ z`~vJ@*#EM`+dwL`f@C$eAp8hgnJE;Mwo|--*C5_>9$@bx1cIUzn{RJlB(jYjLw4=m zdB+#6ax0aaK5%|rQaYmDRs{FQhh`a1Mit$*Ci(UtLs0lN6UHwS#N>&(uwE1UY?jKK z!SvDp<3i4nL5URrdQ!&qk$lp%I!09_VzYRa_m!SN5c;+gIZjooz&0TfH@U#raS3K8 z5AFk-Rpi)Fih-vLN(jcdNyeU76Rm9e5|jXFlf^R#Z4_E-y$-a0ixx;AcoScN zwdMEt77DLKO=T3gKOeK4NPc}q{N=HbI3F8NM==B**MnU!o@NUa-d6D2ZjrZ&Wm)W= 
zLCc9%djRh0b{!xOmIpiwUW1QE74BFW*o42x8%QUEcjE=Xh+`=T82x$$7jTC@87Jyr;m{C}YevGjLCmMZ9&UH8yRvxp( zMDvXq3}wU}Uejyg4cdoA-k%cXc1FiIBJ4rRm38c}bTnLBihBvo9n81uj*KX+e?mV& zW43haWyA(~1#I|BR-r~wKB^g13ksvGxPQiMqA2mmXKAEnww7yA9Wyvuk*Ew)x+^EY zUh8UrStmC~i#ND(RO?$=9sVjWz@cEBQLNv@Gpo3%7LXHrk!W$3bsURfq}tWmxQ*Ua z|95W7>r0B!Sb9S+8Yw@9q~Dq`k5}R62I)x#xsc0SB%;A>398A)Bm8g&1%ROlEd>Iz z$e8;};fi8hD_cO4cI`Sq!uiwucwoU;XCJ&gGgaX=#-PiARHUgJs{qUVp^{1OoTY#X^^j^lGGC|yc5*H3?>Ng=%qWGK*dvF--*S07)fEZvQ zn-s+_uHE@2$%P$Qmyg{s^pL==&nNhZwAu>5T9sy9U+~1T;t`Wl)Q8xu?-D6_e03KM<0=D$ z+{hM}X02#!m0o=t4Q4wruh+gsi-9vN>WBRb)(@DgD-SHZfsaF_e6TT+Z|&4@?w(dF zLW$pO6nR5rl!6DINyYstwjvKB->OUg@JvU4$Lez$IUq&YOmgTcZ?4avvi=0>x(iBw zZ#Zg0S+gdGkw8EwEZ+}XIPIbsMspp%RvA_Y=@XD_kX5VUDY{GR) zJEN8$)A$NZrN$wJ&E~b0z^3Ui#fF$Cs;vx}9Nnq?5baL*<%)BP8^&sLfcPLa**EG* zglnI=?d)S9q!AH$b86?>jO%KDMUj^gDz;-jVqk!^eS%dE@~EOmoFZ-2wr{7NZxfD^kF>@(Uc>9y}RcxM?r zr0`ruOVBGog4CTi`>Gs7C{xq?1&@)^Gpr(Y^Sop3qgs;!Abfkc!`(r(xUoj~R!mqP zZQ-9W7uC~yUnCqEb2yY;F5K2(>o2cf?s74ZQ`#bdpcZ)r)PG;WTF3v#*jqqlxwh@P z62eP2ymU*5NQZQXN_QwJAPs_qprq2>Axb0C5(CO+?T-}iN2*KwYw{(=EchzYHTI$O;9SdLo2^+V#uU&1(SI=lvr*~cyj z8l`>Eo0vsVNj*XwVd*zz@^rDz|3*@8ko<(Ok78y1TMjMtvLO@P^d-%AHf28 zd2cKp`i>Jf)Y|-r69pJoqA%id%Jx4)=(R35mY^#7gygBw2?~UhM&fhIxpw|gAe8BB zDmajWPGL~orCs?ia?myl!+adj%yfjl4F|Df3ifz4?DYapluU4&Qb}O*R{Vt*&I+9M(k`r`wQYj zoS&`Wh_RI&x3%)N$E^S46fvZRJI9p4Y3E}Q`rsz4zRiP+>6v1isgeH9 zdcBNjO%*nMH}$c)0e7v`d;B_y?*|AC8s`NY z#J6g{J3n4DxKFvpYPZwx=FS94fNxK2#fjZ*78qZ*kjEKw-D|Cu{8EKvJb8L#v3ZnfDDny z^tJO)N%9P9nLAh0m2$CpE7(x#=@)I#AES7=blkiKJ+S?evwMKcjp~k^5e^ag2U)m9 ztb>nlt>#_dIosxr&kwxlsk^s-}|r(vSxUck#*mC@;_&B^$5jKmmy8r$^c z5OX;nvU$R67W(KO~d`34rzHlWJ?GRhBshh|C~03Vh5=jQz5=}!tP|qy z=GL`2Wc+{(5GRD&dv%qei*Tv#^RvkU1L)pY~YqsC{j z8?IX?XKPdCJh6H*WW0v4+v|J2Ck( zfo5+dG(8>UJtjq+-gnap=;Ri%Y{4BTXe}^x3C$^x-fa1Ydr^>d6!&$n`~yJ@&N)yQ zTBmt5v_mUBgyNeXO-~*Yqr%|7niMG}+4a!5^q7i?<7=LvZKTuK(+EVRp4>xtmud1Z zjG9Y3xFy4YnR1am<;qA;2q$rzvtnyMeV1$*y$^@Qz9JAAJzi#!-iGe%Yi{8>p z0N$v+`ZMmfN>L~jNjjlDU~@MP%E&vIp6TG@Q9Pn`c$qsy;+^AgK>&IuBk4#f$Q$@k!V;{FJx0aI{Vz>b|8_hcP(sWLwY_f^S8Y=-*}0 zm&Nxf^(&o+SK)cVojcd++HTD*5t1lsJUNg88>@T%I=I$1a~-e3L9@ICIR_{kc zkFQkE9R^T^G?S_4z3Y2v?l6A%HjvmWiP!y0)w?6fP2Qz*rjz|)DH=wWrj=@b(M#p3 zXt~vlBZ^S`5n<`WPM5-G#?@>(ZrICwFpu=gxqe%_9Mx?aZ+*d(|KlL|$o(&;!QW@M zA-n#PjS1VvX%rPYK|jkF@#^k|;ADzECOcO2GEmZS!v4FTUBS=6$aq&W9+bOXxb3Xyh2)H?q4O z3ANoRU4lLf#jkBTIw-<2fBB6R5XzkW-cO0-I=MDXA?<-&wn$kVBJbvPbg_QQUOS1| zNS5loIJxud-pxFMOG0h^7jh~``P+q(_Cbj*c4`i1?xlD$(_8I;NgnET-|lP!*?ay9;y4YkRqaeye*( zx9FA!C&eQ};y4Q&w!Dzev^P5elyiQnc6#%#_S>qV?NBF`fO>ZaL`aaSIGkA8_~*rW9g z2{H?45Ksl~X$L#eX~R0g0{8o^MFxjKj_$nUhc{kJPhqSOiKksRG9~%m`FY}bj%Wih zgT!c+TKENyg%L!w;eE-vuNh&`xnl+KSURLBWqW3K2+(EJt}1A(yX(n zpe+n|mj-*Gg_`O%dB`I60E}-u`H=;R%WRuRtBpzJJlZG1z=sJUL>Bd-=Sg3#yk5yJ z*Dsx6-r{o!kFjy^O2y4*LR*$erIf0%x_0b`oljslH@QfNiw;Fv0o8BT;+(_iMCx9o zD6jklDy?76`Fv0cbW-{n@-$h;(r=BBS$2*tRDGUE&dbIHS`=cs0TqHAFBqU3B;G^X zDs7SMvraOw;G#;`zITuXj!YQqH3WqZr32%rzRY|P4OrAjdgeJ*ZLy&k}8{TT(U9&BSx+Vr)qCSP?4E^@AJo45yUzX=`9{5Yt8hZdtQwD?dSnrUD! zbHf_c%ntc_6%Aka9+=lL^n896tI5V4f_q1;1cJ#yWu9DPfSd%>8*Te+3&ok!uriu)*VD_7uN{z3i*v??ayoMn70R? 
z+U0c4WNn`)O#2mE-P`^3P+a|`fF(N1BAz5x4$bUmKX@bP(2)MhV8lDG!?it4*%Y8Z zCCcrFKC+}=wY*{sMxO=HYt-l}x>+Xc9=JPb#kiJs*cJHvxYc4|CGX9ZtNv({I$90H zhm(pMeFXsHe3$lT8AZ!T!qfXv!GHrtC3ShiZe-WT^IH=nMy{|QvdV&~;A-J@?I!2D zh}h<;=LVps&}1|{e3p$$4P)q%%J_RqIz<{e+)+jM8I9{(i}zZ-$D^Ony^jG@vu8ymXv(Yv;XPm&sOMaO3-%Vjl{S_(jCjV?p?4N zDc$+t{L**(HbrAQyt6&u76>l4TBY1Cy|MA9r(ns;&O>IY6h#ujdQrdA`)*L<R%ZXefTa7H!$+E;NC ztg%0M5pG&MPV>>ODxo#Y3wo{@hJ)|e^p2VH7}|yIcXvy3FvBV7z_c(6hDJn}8=wVc z8gT^Kn64)5sZLqMiUvfqLL9EB;%k$7P5OM~Z1yn2ycVEZbfdTBClib_H0VTRPY?8; z{nj>SdMtIBm=KryNB#EG;bP11d8mE04mIJ0iy zSiZ1{msWLcf-NJn6>K?ZC!Zkh0<-9D(!44ykcoS;ka8irs?~sdr7Thw8a6;8|gCzNsMs1{7gWTLR zn4wg@_@dvuYT?(t+0VvCFGsYXJ!0oZ(E}}{1<@N3w_a}61;rcwh6_xiH!f$@Rf+uV zD`3m($s*4b3)0b9|5Nx0%_JR&4y4qVwaWw0sfX!W1t>g5vl($VFSjm@lJQaZZd{J z|FvC9X1$Wn+bB@mfPY|@Mjb=x^wY-liy;DQnX;;v-o50?*u19qs*^(pt`DiY+^=&x zr7SUBs-JY_<>0@4HnpeU_hh$cB_vS;W)(FW_IOvj0~Mv0S`@f>ljUyud^E(N5vCR# zmitna5xX_`PHRmQu9L;yV??dC zuvD%nbVJdjCG2EQbmwEx#*H>t&u#C8h{G9hDBWBO;p&AK!J$v44sOa9h;+*jJFpYz zdluHzHDn&P>+UVT2^CyHScsh;M^eTBZZY813@_sKn1!+?`bZphbXkMP%fy3@nMNoP zIEcD;EGFK71H-8FkHxSfi{J+PD|(iW#&C1FR?5{ri&Taie*oCMpx@leqhB)hi(oTt zk~@!2-Cen$6BW+-u4OHl;Sxb8vABN2^!+5;uSt9l#SVDO{`0M_|LMXiM&HJqbL6jf3 z$6N7_piMkOhuY2U2+)uvq%K2*uhJl}H+qs_jB|sG`;*{a3Kz-NaWAuUtGX*2tc^)zg-MEF|yQhQZ#NRHrtdcu7KY7eC9t@?Qu_J}6?KQ;O zwd=iy?4eW`zP;aO|6Lco*|d=bv=CZY+ANoh_v5R#=KQBNTVKW(--HnjXm@Tmf47SF z1R`+GcHjE|qD^d>NZhA^5bNVIsq8w&_^chddLJE>{>3E zwC~O}1F+@dkEYL+ZmDant~7@RVR#Py?NRw*WMJ{Ds|M~vMq2tr#b}2lXTl8d#3^F! zzEM)+Gw%y_Z~z4{4Y5=i@GZ$dxrNh$MFjqU(A0Mq-h2 zcWcU_RB*%gK>mx$k1|99uo&^+`249#RNV5gCGKV4AMf2})PJIN5Z~Z^Xr0XFGH|ay zQc2Jaw;(7VM6x$G7tLPVZpbYa3RI19&4h%6?^QQ1<2##tznU0RigI0(_PVFT&srz| zmTHs3ylh{3k&sInF-IwX=Fc>^n!#UJS#+=D_9b9Y5YnM_Ka1z0zIe^Tv`2&RLI)9GxF_OomY34xK+41Ig zH#&Th2w&H|x%$K2K@)P`NDj?8FT4)2V^V}L-}+5brrUj@b=@Ivl3#p*u6(^SZVz%P zty;}{(^AP?dA*XTk8Ha*_l{RsQh4~{YDWtc0NVkS;jGRqXC_LP^CWvt9+wwiznPIh zWb5DtG~K;axpD8Y+a;7<=`ct*@OE>SAOf+jM^;*3ykF(XAtf)el(&b}pw^9uZ>YLV z6#~1!M_D%i*VchwF44*#k10B5M zRJb+z+VJW}b>qv?%Mj5uxPz(sy#Y)dUR7^hB>C+pUtrR^VhtuiVzK?u%g>NbuxwfF z?uZFt=79S65mDN*3~m_V6J4F|!sd;|Lok1TGlj-^-7$*bPAO%Qqv{PSo}&x_JTwxp&qC#Rbk;>x9WL3D>BN zg)79hbY=<}p@)kIudC@jHpax5NPD&O@?E5BAb?>mCNkiuQ(gSu1^h z+g?ijeWLrp0Ri#;dUa=3BmEywoCrQ0aHT%>(sa8icfxF`3oo&`@M0o94SmEzFFXD#btdy@Vd@1{~ zZ$lx!G&1v{$$jBzxJ;MgFx`VB5+$sq6BB~a$^^8rXZc%pzMLdXCA@hCKgZQqk+sw- zma4Ku6SfmNu{8}e>@Nsu-{wEBxXg&qFlN=p(htp`h)NA~evOnN;|q$=&dhb2_`L7M z@fX=6(nX(vFq7V&Vn5B6E>(nm(;6G~t1sq-HI@=YAThF1w(ngZpXX`&kWQ6T+_4&F<1}AV`AH`xPebW7- z6nLM-c5DHg<(9YY79_1%yUQpqE0N0wR+m)Qy_0?^ZhkmK<}iQ3Ji-{{uf~u;TkM-2 zDi5Z!=H5wbfa{)cyerORtq|tN-b-7vSVdDOuQd;%UI@p-zuTUextKx_LcH)b*J0HH zm*h|f=d#!T%IK8c0RPg|H&ZKrivJ;!{s&uD78#)E-q%MR&@XSd*!CxLiVau(GOzwu zX6GV#qJpgoQ_`8rvJ<&hirwAmI{QXh@~Px|zd++Abg%kf|I1b?@KaXyvG28)Xg_uX z6v0*4WlEqPPU4yHS+w{KzoqlSa7z@1u+M#%bF0l08~(L%{U3fbPqv!dxpd}uu8c5X zepBWNQ^e{~#y!VYkQpP&B7yDMCUM~02Jja)N&bZ|EV~6i9;N?*)ZHD6RXW4Gf8m|F zA3spkdV8PnBi6rtUq8$c84<8hBdUjB@0`(WV+7}ff8KKG04vee4wi8jMJ&I6f2K-? 
zis%5?!up5h3%EM&!v!H5&in|yDtrVE;}4+|y`2fp93SUXID6mPSI;0&F8~a$5Ln@P zRal_c_RNbeUb8i@*Q883XFbLn}z`JFd|@Q0Xa)0FlL?j zn)TcYZfVG82dphT%Y~QkkH96f5{{K6;{tMoi{uip?!+E{;cV;;S=ZU%_A{L*nq*vo z41Fr{1~Sgg(<0YYB=W8tY@WOvQ@`ny+p{Gmom(pL+g;8d6iy}v z)bIxx%Z+1%v+`k>OQ8ys`;U9@BfhYQILCI16QAMRKVrtP%M28nPnnL-p4J-ucYu~n z=QgrQ1z;Pr|I5mHByR-2)9(KcV`T9ljw_?n+?tlJt-U?bAXuWIQ&m`p_xohCx-pF7 zX+l8j84$Dd&SMFm!GhU!2c0YpsrA{kvx-}WHf`Sr6t)bjo*6z%zc~gnEgL14ik7&0 zwMP`2BA<1IJAzpFvWRCcRMmAc?rXvwjwwjr)^1YIwM~3$FQo!XuNweNyt62Y1Xb*F*vPua%8j)kf0eFxJ z3exG-%&b&%?nm?@j#dH0;4o%rj#2^pp-z|9%zX$yijQn>!skWis&oDecW1~p`p<_uJJBMo3d zi`W#4@uX27r;UCI6XGI0oV&oYhmeA_4U854MGF4$@Otcjvn$cb-5>5%Y4X5eFf=mI z{AyC+9Hc#b0%H5?@vP_)D0betYgMTHBunI@Irf)p*L`oumY5~~!Ry+#wcN#rp4zwc zF=)Z_UJzV7vPh<#0A=XuJ zVetldKy@aTj9vAwg{lDVI3t4DSSKC9;Fe1cwAs){gRE4=!Zcx()}|q2bs*B)*TyC$Uzd zQ+gdOq?*9wid@5yIW~AIZ!Lr*%Gt-u?DesyYrxJD)OK(2?e8uE?!(0kO0UkoC)}U% z@9&&{;XJv@uzF=(xQCc}AJO@cuwU9L=QzCELdM-Qhal>MqfeAvC%?D8qs)$D6sSCH z*aVy5#(DqXLHsnQb|DZoc4xs%Tar+cf6YBcVua>Jn>5S~;#p10kXOrFw!-iy1-x2k z{&Hty@6*^8wF}@9Jp`{7JgysH()O_heDR(ciN69@BU(jmGOta*PV2^eUxVH1vwgEV zyC}efW2%XV9#_axh5W`#ohw+%))=Un|a>*OSzkoRBcd zi+m=u)3o;iU@~7b0qvl%#AKS~cC{Z%O-+Hs!M+GL74ysiI6a!!U%+_)zI1P1RzK;8 zJ?S7=1yDr{@j^O0`YwD_3DE6?PS4ym8IxutTNx>Rkc2udu-GjU4D%MOLLlTuIcOr* zKLIn&EAM(}u3fr%KSo{>d4fHDQ4TnD^-#{x$3}mdC_%;`c`Rrdg!ya0iBq(HncE}p zN){r3f>BFX;)>p%MwIQGK_to#d4^8+pFo%WXc)7t8XNlRX1;}1f{%|cTF;k}tf?T6 zF`+Bih<`Odc4-+dag70HVz~%PA=W^7Pk9_ike3&>#n&wg#RP+l$Bi5|9%IiQXwpTl+A#J^xqRBSz#7~zNRHA z$OVt;Y?*grKUgcQFhzxh1@Pv~Tj8gb7N2FZbo9fcFVOXaC^X}sRo?j3O1xAyy<77e zH^uhFPy!cjUaw{H>knJ$T)uu}MQl%>i5zStAL|?a#Bc=V3fH=}MmfbKN+lcYFk{^j zbB9m5@rT(SP8@WgAX?KbKzj!w1`Q)&p>pI+X+^xY=isBv7jaxH4tbxS z`Ai|fV)!s!W2@DIxP4zxjMB@_S!y8xJZ>UY*fd_;q{lCptD5XwL$V_|9&?GYT9=~C zk6`W0{GUrzn@?`Y0m{5oL4(_s+;kdn(-J#cu<5_aLOwov zOU0M}O%|@vyYUic4NnjweRXzHab)LpjVy0_%Yxv8QFbk8VI`k#Kr$rV*h`M0I|lx; zYjo@NUg+ahW?)|57*8^f@2=a_uOsMNeSW@Qonib0#2++ zXWSgQUye&~u@{^%oZ7i|n9kHImD;v+dd9nX+AM?0t6LiKEQct#v_ueXqcujSpJIFM za9tnivw3Gi&F2?CE$p&Ii3jcT{;}2~y!S1hU`7Cn1ghb+K zhf=7iRL~cM^8$!%%g_qKDo@vHh-avRvt$aRmmp5uoq5`O+V2CJYEt&kXULeOB`aJJ zJF!5XaMrKN^9u3EWUK1qsmvw+>bc9cMdiw#w2kz{M~+0AHx=H&Q1`?y%%G}ZPN-K!z#b3 z5!cM$`y=9N44&cBa6Jm5S%!*7+G*6PRx4vL6Gxy2J=82HXq+Y2yj4u^yr8kCLM>6s;*N>NOUxtc%0@juhY{nP% zRP) z+H8Qhcs8y9XDfhoFE3GGox|UUj`CYN44vJT7xJMDv6s;%eV*SD=#!8b7}syq_y-m< zudTaq#-UI4>X9^{Arp)5!CG(pbx`1DM5vIAA>YZJ6y$(wWUW+RjmyA_WU8$3cw2x? 
zaW}D*CZEQ_BV=3ZceLwZfvaEFD0L6U<#f2IkI`lCW?EMw20-IrPAYYQL#C5-MbyK( z+d7t(hS)bdu>J1o@#_F0*AF@F&4Pa?6bfYAFIuiQmwy>AHD6)Id!z3^x^{6~;k&*h zn+`AbQv?an%aXaRR!;ipDMOc5odqu-o7_~eqV#C-UMOLEap~L!&yG5OIJsqL8EI9i z#e!3Qy;tGs&wE)#88V2iIaBbaGbEXIO;9ty_tbWR zf?HC-g{(L!63%;YM<9H4|9d9*pVD80O6U1xxOax)&wFs!&OxL`u#1Se4W7GX9sTF1 zsXYsZXZ%c|QP^XD%}@}(^9LAME;)i0i)0~er`q>VQsNjBXjH&S@;NSUB$c!pc;?jk z&CZRkz4z@j|4wqTZnHN8m&uWadh)LE>jmRhBBa#z)?Re&cp70H<+(>E!c8-BK9tge?;W#dcD+u3(s>|`luZ6Y;#-5>42|~CBKp$Xn>+uO0 zC44Oyl);B9ie;eH0{VcAcj^_)R9W)n{erehL-k%ycKSazJ<{rmbdbIx6k%W8=dBS$ zlHPIF_yO14jis23ElNk*Pmq)xg@y6t!QZ4XbyeM92oNw?QK;C}SgrjG7@lEep@Fu+ z=M~~La`D14CJ>Gon!F;e@qV25n|o(^C}DX%kW@e3^0(G>Bsudj@3(lHJx5Z^uTZqt z=d*5o23^lT}s~a`LaGUF50|3Wm^BVG7oo+=JzZl;aIbw;4W_2 z_IYwx-^EUo#V>aE<7G?*ibpSDhY8Ck{gfXGCRFoWhp24WVQ{Y-tVPK$;iV>n zkDWz+T30IE^%t6ASAvBGYeF?y z0#T5A+A0cJaaRaZTQzhLgn5yZ#1in6VD?Zfw@YH-sS*>nsvxdCL}XC)XwU6q7+1h< z8x-TgeQ$^IQ}}H*_t5=xk3))pfw$@GE{0E^N-Z?L3h4AhLfQrq9JtGrYy$rqx#Im6 zS$!@eTViQP%};Bw_h=C(StcoHb+rxogec&Q(0Ub&ipO^gvJu8NjT{w)pS5l*4s}|* zXT>a5El6D#b3OzEc$dj*Ui+(c!FCtNi^Yg&(!6koGSmPT>@U6pUm1QJi z|8++}$a`89QO^yeLPqq#Fq22qV@4E>g?q4RW9QRy91a(KIGWg)5A?n$5f=9=e)8Vz z&ZUc=k5^YEwraYTS?dMptkGVcJ9x9<#zgrH)3N%a+wWdTd(~Q5_ef!_bU{~bE7yB1 zNw~{@WtW!KQ;3!u6}LY8J-+3lJ^+RI*qTmmQZ!~0v=dgOZQa@KH#Mm z!W(ooXreqwfr492GBU|d@^jYHJ`TKB{=IwQBF zD#*ZIVH>q8sSs9XxsS=|I?B>O#u|bmh)DhknJ5kO6ydp@eWP%xh}}W9u6q10Ads8d zuh)o-!<=XIGfWJ`vPDU<>%0e>;MvITDZgkdi~f>p#`(vhDP$ZAje(n*hfE`hc264S zegY3ZFQH9-Ea}R{g^DDUrp*ud%H!PXfkU(xns|n_qFr^*!ZITGZJji9sp$g-bI#@Z zE|xE~x(n#pt#~eZ#d-umvu#grm^wS+>oW;#O?$U`tVm|3ZeGqg1JX^kD_K<%?3MXO zl$d|LV4{V>K5C;IVcA?((~yFFj3{hx!;n^OsYnulSOvL_FPzu2WIqJi<`LO@1YO z*QnBj?0D`;7TW8xuv@SnfqxuT`O78I00D;-f4t(CMvr6UpP*6)=6v}Z(E8)gzXdlJ zr8vp0eYPNbgc;(M1|2oZTaRbZl<>9f5B2o0Y))P-d;ofN(VQ=I5nmK4$&xe$xr-a% zW6?Q?_zd@57P9LjMbs?LD{d!FMea&0I`1{!w4Hte_|9eTmb9?WohObv)$KAcsb`= z=&m^&4**@(G@=Thqre6&@g4G157_NrCeUv=74u>*~?+fIo zozAED8*vSO0F2gsise(0MoLHibd2!J3egH|Hrvs2yLeo=R-nD4x@(^ettQqHu#C3& zL-0+Wbd$JP3I~sTFyQV$;0T0gGq|HJ{JAhjv=jky6L%=R9c;U5!qtq}&Zk$iZ$E=rQn>$*47kpT8LL;+2{<-0&{}B!C`afQkLw(6Bb-_8 z6)*9BN@^OcUx%uraaTYRCdMl`an~Y+*2!^*))R!8I4?ZBY}am(fhuCx*=mU!^9MWj zkjNab)9YO0J4m>83n@2(-e$ZbG?c4j)q$Bg$ls<1&*ka!!%MP;$lp{h=(tEm2APZu z1n=~_Sl!weA~SvTH0+7m$@6GF+PB+*MTB$(odshsk@Tyha(Kf~k&tlucG3hzPh6>tC zD6}gJ_ctu;L3np6yc4OgX39sdytwR$xyIcqkj#fGqi=Ad9FFLW6I6K+(vGcN%cPF{ zUb(fAAIt9pa@+MWHRc!e?*#XX-cxy1-7WOH)pSfiJ9;F9(mc6}0boSBo}w+QLHqPJwAUi-#2|6i+d@Dv_3xpE9ZmBM z$%jGg3QVTB&tO_2{>h8v5Yc8^Itx~o5ZukUZaVKR)ZcJy(MA*E#X?%d_X{?b2&CWRm6`Q5 zj2Tm5K=AM+ZB1Y`(^ozHbPzMooESSa)xO<}P_Exga**xt$rSv%?TH?5UGBC_*3oO| z&aOiZD}ZQQojz6=T8v~%h`HcwSi~~BdAl!5f@m?6MMn>Z$fnkxs>|dsndekcVsTzs zf+h86tEe6R_z(>DUcYtw2^^{T4v&EQy!57}AWS~PzK2?npwP9;(@AomKhOEF4wmjS z*P>2gu#e>Ks{LJ8xF4E#W^om7S^+I@X7%aY#lPKmw2XH9hSfJG8K%)*u8Jb!0I^H=i zAY!Q;CrsGuflIMeBBy(Exl8BA^E~5UW+3&PB^o<45qX?k?Z^z_;pr?}w6~ILJaGsM zcub$9!7p87w1#3_;)=J?rR!QODAS{hy$#BBx%Z?G%2BI|EGWs6dBr}YY&uAn9!crS zbd+2bz3@=r$r;5N3_p!=PRj|@(wZeDXm(zzV(XDjUHC{7@h6wL0)-`Rc^5R}_l30y zk0q<22~v~(25H#gQ>fxmj*8y?An@G0sZnZ4kbekV%ZxK+wx8YOGMwvG0w={4>M+=6$ z%$vj9$?mg)d=fMQ)Kq>?!yUfAkyp}6c1ew_YAe4{js@m+DhE_bgchd^Xj9P9qnYo= zVoakgklwRYRevnS{K*%tpYppE*15asRyS(U`;3#-ru9ARX8sUpPn1u)k?XCcd6`1m z8?h2`{%tiZ@F%>~tB0f~N5GCFo~v^5Qqx@B+t$5CZ4fvaps+=ozI!PSn>hE?BkrJP zSWnCs$bZG~-3L9^k!|O7`+6}Xqy>Q)L;CL61R`)%o;+#YmT|oGx^a3}>WamEbLF6| zkAEPiKAb_Rs~TBGUr|>P!e99DMKA4tK4SNNFO__suYsfRk=xD~piGIs<`2qb3Vdw~ zJZ>a&LZ>Y3Aba?`1 z!poXgFVgt0z-KksBNvnjB~11xapPWOdg#X%i$0j->DhOZie&sTc9(J!x}YsibIrB( zv7s{R?ihS8kg9=^J-O7OFgBd%v?o(ZTAn69YQ$9Jdr1en28vMgh_ZdLTAj1A`$+S` 
[GIT binary patch data omitted]

literal 0
HcmV?d00001

diff --git a/images/phone_list.png b/images/phone_list.png
new file mode 100644
index 0000000000000000000000000000000000000000..e2efc37b003baf0a8376e80dea1b32923f5fe558
GIT binary patch
literal 16772
[GIT binary patch data omitted]

literal 0
HcmV?d00001

diff --git a/images/run_benchmark.png b/images/run_benchmark.png
new file mode 100644
index 0000000000000000000000000000000000000000..e3d349b936d8082646569eb0b38d0dc6bcadada8
GIT binary patch
literal 136012
[GIT binary patch data omitted]
z*B)AHk-MY&FdqL-pkL%NW~?3(dsgchw!|gXYXhF=gXs=$yg8=&jPGwy1l4Z*+Sg24H=;{Cw~6>f{S8UYAs%sVtf$ z#LeQeoq7Nar<9Pl;l%pXM@JppUD%oF!IiPyw65`yv5UX|5l<~SW`t}%xosDB2h(bFCSX!!r*EpdI@p`0kU1*}K{beyvp2074gz)ZOw_A@|im@xTku`+V7&1q~tic!eQ zMg@Bg3A&fB3-O*_m*c1KiG5S9NmKB9gb-@OsibL4t(Oiqm`i|;o_zUCnEAu6ssijT z=dkK6_Q@w@gG0X~yYmIT8-VRJ2PIhIMq?yF3WEk@Km0!H5) zi`I@gEVlxY-T>dio2mFeIHXGte7t}P9srhkuOUCMsi6m2>>CO|V%;rLI$WGc_g9)V z#D2yUXjIg|;oBbkqu`Xp=HWS0G2K?UW#x`|92 z0LETSh@?V5hy5rsv3~Xc9K0%s!Vd$iAzbXW4(*YnHs%ZYjy9^k$-6(D z{|+-bewm7=i8qPU<~}`N$349-09?}BPH>IrPuCho@JIFYdIw(M+*~G(8y)sDE%eC> zU3Q++cBc#EEb}aI(Vd|nRiKf{)-Ev#(1iv1RfQc3$14HmVb;thqRCuR%v{xPMiNz}@QLGnl2}A#-CEKg0_$x&~$jiryzVIfb%c%)W<++F6(f7tTX4 zpfKJ!w>_9)^_rfhV<&UeUAYzP6eu{yPK9Qf2=5S#(K6u}Z1M0**y2RS46fvb!o^vO zWf>ZvcTv=Jgg`Tj%pYa&!Dyr^aqBHw{wM4SMDDW_sCSgdV%%dbkgtZ-x*cO~VxYQy z_B^*1aYe+39T8+bQggn{)_OBqPiS_9Kjw54_+UKQRQ~X88R(-|zben{Fa897FcZ|U zVA~j0_o_)4rd;!2XFPGX+c4?0B6xdW#X67YP1&yz>Jkw|4`y{TP;I*MVp;QO9q}1V z6%IMSF9K?)>sW`VJ1+e>&pA8DnD?di^*jVVf^gFyl!s!IQ#>5dm7G4Y8uLSMn%ODEFcuEP_wb3u86s z&F#G7ZPfWl0?TA2x!oL*=bFD#D^IHmtgGiIa~K)uEU%B17G0GQ*gr(N7%kK^oNCxC zw*-dR83KDVMNCn*36y4}GS3DA|8uD4;Dx;SC|zRI+3-Ej#o#*qasQ9S{HKTE&q|lO zgTlVBXj!T5&Yym;{zgs`5C{fH>mIWP`#>3dP|pMLZC}hud4i)a>}D@Hf;QWk6XM9@gI>ZfXFd^KDj%mt$p-)(rJ;a14^R%Vq2{n zB$!x##Kwq31d+~}c= zY-p%UjUeiP<}GHNsdB{&J1#CKXh%JFmqslJ)R;WihFP*xmAI?!f>>aMK$%F5HJb^m zH(1s!p%mLBW>81vf`GH5p3dEXCfJ>=f`^()z|x3Dl_@7;kE$N3&htm6QQyg3R(ZhZ zFc{(p3+s*)Hh+dPVoB~~@$)HfudlNCkpkC6}{b>I6%<3>dWI=TQb~ zq&eU6m+N?OWj|FLP5knZB{ZkqDFvk-r?mWP+_F6uw^v0P+@b?D!@h#f9D7eTDr3)F ztU%8(d-P7Yo?_`<5g(iku%2DEeoOUMA3MkExkheM;$C+Uics6nf^nvUH)^(f^+CNP z|7_J*ij6vU$q$7?U9?StHY=4oZ+9uA2>SwQnqt^h;&)J9?Rz>eF`!j5V6|7Sx6yNb z_|xn~!fVAX>AC{&jx?C$ob(KgF*3(G-=z3)HRLx{YXc3XQ(?XZq&EZIR^mm5# zkv5u*SL>^zm%+F=rQW4Nai;nsf>Ta6{31&(P~{D$lstb+w1{AioyO}6V-T$I|7PWWcH^^fcpE(v)7DUASG zUK@Aho2&;=U~;cj?d!B>!ZEJ=t`7EFlOOhwsl5z%WjJpt;Nsd$(P(J}o|@Ej`M(2S#%>yWu8kNXh4yq zH#51054Z#2gze@GE>?%RWBHw#-&?u(rJ~syl-eocik4f0?6V5vpK9bw<;$n{i8m=N zhQ5UI@#iBp;$6Se0!3U6l&b7AgGBY$$766RKLTa>boKMz4mCCUUb(7mB?H}uOyiOy zy4wPVa;;48$Me&Sg#ypmh*}?8*mt0=pM>mN%c3!kH88n;O60Kko&cxyopMZf=8b6b z-utB4>wlW}OKQzFp7>rt;!1fr7x&Blsg&CLNbalk*AiE!Z+*|QIceJf=F+QD`Pd;@T zcbY!OZu!nP-&%mMkFVnlo&Y&TB+&(B*K3ZLq>yX1QFZ7P3g+<6ZLy>AVbiy^_hT+# zF4RCGV!zvZ*vUm?HVZfJujM*lqG$wSacr?btOjd?*EY=pIt}#ryzIiQ)?-RTSg?=L zAn~bmKWaJ%!8rCh7_NL(t=6Xk@nSx?EM~eLnx3D#Lst@40cJz0506{%ko0c!yjS7_ z(f|j=J=XgrWnscYoVEW-H7k(0#dL$pIcG0kKUI9^Q7jkn!iB8;|y3nLz7PT;ax1?A8x<&0dno{ZL{8>oJl zkL7&`j4`+jz&sldYgRGDMLI<_gQ9_vMlhiuNY<)ET~c@Nvhmvya!I@nARUzQ@XO{! 
z@4MO09tHsJGdY87w@2S`LR8|l4s28~ku|C`!fHo_dqOaK;-~*QqxWUQ+>s2}nXLvL zltqV1G~P_CKnQCWsMl*d>0MyYu)Uo1E7ILgBByhsm#DS!zG|9S=a@({a4!n59=-F` zqp!7meSrWke-|Y&cdY=3in#1Fq~fmit>RcA$E$jGkasCGiMab|E z!Cs_J80*9A4sld6eCx7t#S&lMIH5poXU53neJPtN!qHTL@^?7#B~9@!Un_s=&~TjW zs5h0pC<1j8dg{ju!>Vef8^$=s8`+Alp4;s$%Y#Ad_XC+W(tzFegB@Ne;{3JE2YCx6*hkbbErvO=FzO_``ZM=#-Mkvf_k$6; z^C+JKq}KZMD~QFF20MLH6g8@(Y1Q$tg%dY9_v(1uevKZ|B1qHJSiv7!m9aUd_WKh% z{?;^=TUWpRY=hdO-o_qO8L;vLk&+D}Uf1A#)8$$P_WD|2)K2l49MxsE3yx4G*vi>!q}&sX?I&bDax((@K`Ik{$A$uJ;PVaH&aZI z(8Nu9HJAJ_6QgywfHYbBn|JeU_}s$cxBTka9fD^{SSIF`kZo5t(M;QAN2!LJ2q&!A zrG{jxNl|j!Gk0X{kCWI{ahwj&GV`!9W}?whmJc|@zU{E;7`wM} zjmOD1GWX^A)BZ^hwQD#~`~lKs<_BXkdp#MXl|in_fgIG4#h}rtt=`QLDRYFu{PtSn zjV(PAgOlvf;fPXz}ub4%)Z9{+hBOf zCB3-eQ@2<|`ED$kKYhA-?nTKsB|xhL(?1fvMBohH{&u2iMIDcLUf%gYNyS42j!ZDG ziER)?<^VK^=z*DFstzu+E4&Ulx1p_9inBe%P*cRW3poKJxep>*YFEy_KT^9E8}*M|&V?Rlv5)R+SSF^vs)t^$z@j65HPkRHAn(aCGFnZC!O2+y~mHme#s6<3at}8xk2f=0Pb=Tw zGl0%45C%5pPj4GD2}@`@B#2cN?22wQ6I#a1!seLvACtHxFhP})X;6GSA9*q;oh0N^zgF&45t1Wu6#;Z+MwhU>K zt}hp83B!pirRuHunG?>C?WY(`n<+;*?j1G2ah6h!|@AY0my6T*LVF2HAnn^4k)|YTTe5BXztPf=y>@0mqE2|%Fqk0NoN)NOFhG$}6^P+pBDMiWcM@wZmQlqvazcm?0_30Cb>}UPU0+=5T zl%Wmw(RX&kL(fnPclI@MWySa=TO7NR=V|X%J#W!s_o`06dKtA#o32ATx|b7zXNObom;GL0h zBtsLPKT)lWIo;JY61!ceWqT~Je`JbqXVd}NO0K+@MmRlY7Wybn%;~;EC6Q1$CzJmi ze7=|};kEVZO4n}#ktG7tqcW&LDsDxs*6_Ap!2q@TZ<^3MuAW`8m9o%Z*HFjTV6%#j zn~NEYiE6!UCt^Pkz$E_2nm8g5A@z&&iDoySX`alDI+qls>T8i1{3k*L^2ZZGf5dVM z$`q>OsOPd!cp|YoPrhi4!>x~MNJ<*sEHO!s6fd=6-4X(-HWjdOP$!2@=Byk1tq0AH zD}*}JIRX?O+Yo9)JAF!-vXb?jA#LF?Ku#-g`L=peeK{i9E`y}~*8VV}(GrKFqmQqx zb-(wl2$Bh|uq=R?eQyE3X(Yk@OYkw zy}cuhh8kLb2888=>1ZDTf%4r3+cYK#EQ&(w*KT|f0+$`RGXR!@F|!e$%S@`fv?Sgz zO$&U*_6^j!25x`Qlx^N^7gG5)O%l=BngYs8qUi_$wiOcQ-uL89T&PQSnzP?0t$@U~ z6>odJ0H5?8*7=+ifK^$x-eIIrx!v&}J$^VTr;88mXQ_*O8H_za&6fPu!Se@B<*jWT z0Mq?wUL&lv+k9V8ztPNKtN?hTa;E%Vwz(q%*cJ=(T}GG~bn$lqWpB;nz^u^H<59i) zPTf1K(VSvmK`e`VfC$&c&nzSuwv$!XZ2d$-|qf2`@d2Wa_4#Qmi07U zsQ|xC2vOWKO|JJalx$zGPhAgdF2L$8^@`H0@B=rOageKX5-X^aE~3{z5AbH(S8 zK1(c*M0pDZQ_^XGkLeWmBv(ip#fA^E_1x8`7vIynt{1h&d>nqf>2hbyM_^K^%nFVt zk8Px$qU=EY;*i9DG*5L-=`DDW^ohO5H7_qXz9yaJMszymz~qe0umhI zM&lm)zoCz|??p5LDt_q_`W_!pD)<-lF=qG~`nU$14*_m2k&+nl(ipZC^>1zZbb37S zfO`lyZn94J0zIqlkejzD^^SOH&Gvl}31{fY0FV(K|H8YIUSU|~>;F_pj!BTksTGEY z1XDEn4y<>5`%)rJl@+&U9GFsQ8!cjS@o$Kmc{+#7P`knv!n-eYARr^PLVFk(r-k|1 zgFH3$q2@uww+71RA64CV9fDgxvr+p|B{v#LtRx~E)0K2pgQlOj$8B!x@fRf~#paKh>czpsjys{Y{AGYNvriKOpm?ZP zh3%RkIo6oJ$IRPxz1|w5)eH;Ylq3R-AW1 zk~1ij!VT<}BDaROr7IAv*W&PSc2ee4Uc<%r57wLXGwY2BOCZ*z)`%`)`?8A>Z@mL_ zbhRMrHTw)eWn`N_o`)1$EdR(JrLqi1_^X0LWPRMhuL> zBC9Exqws z@c6!0v^jHfV$M=nk|CP;744rHa`z6{oJb0oVv`Rhvxp!1BC*HhVnD|Ruh8~kQz*=Q zKjfucJ>IWWD-%GX56G<)5DC6K5eGll7AalyW5jEzDQ^O!nQA=GT)EsHPl>w{dsXzm2X=n>$P=afb9&AypQrKChlByWl z9}liKEPF-`yyM0@cKCwy7JB98qU80B>R=_zk@xJ-|uK z^68c;)4IbYn>;W`g#H_p*gz^pCEG0iC2>X@K@X_FBPFU&fwuxEC08WdhWjlu<%#{C z&y0l(_Dt2pr?zoGU3j=LM}_buGlkRXwU*g+@y^izcJdh6qb`7N^)yZPNjdxv#6(t0 zb!Vd8QGSBa*Gp;MyLTd7ZaZ*koNhnHU0$cV!5e6PUl_vtx!I(SFb=_=%I9V+(NWTQ zIM}mrjw>w(xI6%m?U`0u!S+Ne-ILMf=^gA{>533LCm=A*ICrfpBrEmQK$Y%kuuu3g z;_~ZK@Xu3bC=v@GCbn<{{3|h0lwZ!oLxCvf5Yy+wcZrye;O*&h(UKsNu@UerPz+F7 z&Q&=Elmh@Wq|h1w0lwq)(}~lI%8N!}jC9j1aXz@;kaPCL7giAkYNXrPt!oqC@=DeN zB&ryhd|6%|@^fh=vkR+Wy?TPEI#_z35#JPh@T{VbaR76p#S#L?;wit}Gqrg5E0K9? 
z+@|hy6T^3C3hLtX%`)SgU6gX{?=oE4-j7v>qwa`y zGN1K_o+;l{UnBIxzIsz3sTwgY&Lo+j4BfB7Q?8e>m2Km-l6#Nc+QidT+e7TH&vE);~*2uXWBlDO&zkC*XOkKzR~Z0Y1OU)&2g-LG<+K(kA@Qo{2vwtfizX2 z>STWq-nAE^iAJ-1-dn?y4(0R1g+{T1qzDXpmMx^Q2vkz@<aJ6_6|Jbn<3>&k_8vJ?OL^#j5~s`joA9mBHbRBk{RMX@2MmQhh#x zrfi6tiFBWO7k`6+Aj1Su)~lfa0BJkleVC5e-HP;@r%g3aUdacJpYdJgv!rAlq19XF zYu0w*`epHJ;?*<)_H~mPLnrSxT31!{Isqz!CMKCG%$Zj;i`)mUS4*KEz%;tmp=Bi3 zvao0hbh`Y)Ix_5deN9@3KcP&Yqn6CK&*Z8xH2+}{_Kj5p&q}oWTmD%?QcQ91kIM%8 ze!0szQ7=MzoZkQk)!r;erz|x*Y8;C;A83RYEzSpshH>aJ=BvmRe-UXwK8+sV3iq2etE%Sxln>+5N| zI$AgC0A8`8;)yZ}nq{l&38tJMX#{YJz5q4r4e$6}6ns|d98pWs2EIdUO6!*4!{Zaj za8qJ%RZc&4;ZKlZ^Q%8WhWf0Bz;J6$=z2S`)=`)%GiC`}`Z!1n)I!as*CuuoQ1y2| zvK7}7n~NLGcRP11V43bm*POL*u!aY76sML>P;e?9OmX$pvr46PA_)5`nQYSGr5!SW zHB=ORp=^Tx_>-NV&4yOHF@4Ewz@sY+J#_Xw9gSn&(=N$H%GvfQUMHa{Z;NIOli7Ld zhEP}@)Db3^%NVbR08G~VTinl;HkevdaxyG`Q++-HbuZ~8|AIEm;`>NGq%)YHclfKl zl~HvyKh*K$yR~{b8&GAbg5uA2Qt6HrGkuUVQ3z?Q)N6hJ^6B=4tf){ey*T=70O@&;mcS2-3I%Yi11$xbYygeEh(t*l< z0wq+&nPow9rXe;X>OuNop#fbW@5izc^F1suy~r}LPJHjlOkT~}{y2gnbG5l2phj^V z<8&1|5ec)USm-z89;PSk)gFYas@POTYR80(1p{&XEs%mDKPZ|2aBZvFAN=^zt)pAG zm0Us%t95M;4%z7z*YO}H+;oFn{`!hBUb5I=N!b2{PcNOmau zC1EniWHpq9d+aY~qt%+r%ZZSlafb&bk$B$&Gmer!z`vLDkM`RX)B}vd#(TDCx}Bl1 zjv~%F&Oz=sofbcJm+CLyyB%`K9H(GRv<*V)I+3cUBO}`!Jm>6pa&k80^!eQjM?fO%BQhJ)A!&yO>(S&L|MePPa&l@J@F!LyL z+LxT8XPlehlgH-UGi&*@{-0@{Wg%NZ>2HXJdLLqjriMNE%x*iI>qOio;*mFKv}QIq z2DYxaKCw4O8^t2D9WWrBylqC{>Cu%z4F-}CqOG63kB6L_9L^HjGb(b*HO=k4Ec}ix(l+#U{c}2V- z(WBtp;w%$!_Tk>%DKBEpZ4%ihJsC_AhAU&(q_gxV42IBCPBvET13NOr_@3qZ5(N5Z zD6an}Sy!RuUF)xM@*&Nt*gm;>bPwu5VNP|>-dv{3g6gO6iKo85)wuj^s_8yz>)BfR zvQQ5mnEIL-0OeIl3HghDR*eDypVqRb(Ry80a`(#&&s(l;;njUT_IEqJPj>5R)X)j_ zpm>Y236*#DX)1Z~uPPm?n=1mpJ~V26k+_KW6JjW^jI*aF8_8kT*sT`eBm(2%H!SM= zx65|XYIKq#I;MiNHI3bIvvRl2J_(xaw%>@-;G^p{j7J#v`oYKI-D5E6o$ei-9RQbb zyBsag^bP!MGN5Ob2D7riJ15a(+CaGX=CYRsq;96!r(lD1maunNTkB%%0COV=z;;#v zq_vaXSB|nW>DC9sWgw|DVb%_ZbbWAV+(f7#!0LpFO8b`VP~zLxt7>tPa|(!-1vKGg zMSAZr89q%t(W#!ewdE)D2wD9QCY7@vlDOjbx-V1Too)@}vtfP@v#28MZ4hTes(1b~ z45;+qQT5|@*fb4omrjb6cblpb89AR)V-+3FewJY;{Hk(9hXH3uu#;?#|o|D?pS&c1S&2G=Lag2Rx* z)>hYDM=)1OVF`O_)Mhr4OwPQ3c@FM%_NrOxjI5750ED8BBwm|)$5kNR{cxslylnOf zo$}O$df2F;k29C=HF01>l(oQZq$jUy_+cL@L;70?|J`#sxJoc5!H4Pdw>f zdp6LrdNb7R`OGM+INfL6yg-S08~t3tydS0(0*q4dc7Akr0x}~8x8_e<`R~GvojDE_ zIbz?_fntks5zv4!6V;UI`cQM%&j}V5qP_f+ud#n~*?|@O)fU>f4Hny!R%FRA|-r6vf7Y=BbqOd@g8Gf-^sqjxvH|FeZt|Qn-$Lr5=y^`y9^j#n`S$K&t|Qr#c~hhP1L0DX$>u z4C?i%lBwwDFh77vG9hxXs$^#VGa-#}#|4@Pw&fX37~J{~G@(cRzo7}WrV`!~FF}K$ zd!5%x22jphUj~{nUP5R8ylVLz_hsMeY29yEkvo;B=AuU8EvS*oYz+0N@*@D{LH3d$ zG62SiyfI)3%Cdk@teosV3k^eYS|;i5yRksvK_&b<`2Y*W`Q>97aewj*Fz!>B_)=Sr zD78$@g$fz5r=3AQCJT@eM2p$&@{P!4&77uT;eE^7IfCG@Xx^=}DG&4&W;go3$=c9s zPuaC3g;u>-a(5`8!d&<%3vOl56i(83JfQEUb_?v0&B*L?N)$dyCQqXAzxGzYc_Ic{ zSUW6V2Sjo}C!P6akmJ?d3O?(mF}T%_eqrN zlU#GPM_;9i$O#~TPUK`Tq;dP4S#Uzm0zYBW8l{?nu3Q(V*KR~^1H){i({;PKkk`C< zPA%EM;Vn{iG3o<R_ff;$3ScGFoG4geaQPrA~sGNHM~AIlzKkO9$M;`3~%T@ zHE;k&18D^?@2Gy8YosY(q2C~denN(H{6*cIEVO~&og)*p*K0L_DK%>CrcFXSN_9QR zh3vICNhL50k32D1D858Kk;VnC`q1m4f5HRn zB~`6~!j=SM-^7xqdgFzZw2ItuQm)TWYhV3eA%ZojPY}V+Z;V?W`=u-q&vzqn&H;eJ zV(f{wdnYRACcA#X&G4ev?AVW@#d2L=1)GUwKp}44)=oQUYh&2TK(pya42vEfKTt0e zeWlHmDfOx*trX)oeD~0OHB$^FmJw z%R`Q#%UWnQ603*_TT8bx3=K=s`r;^Od5>-tMkiuTda+yJD5FsUiekDRL8yoTX>fg7Av*)wqAI0kLpE(( zmNGGp@U1f-Ow*ARPY8+k>@0>wk1Ho}EKaMZ?}{Zh=|o^%p9q?0N})X{<&2{!Hn53} zenyU&PEftqxabaFp&32g_l9+DtmpO{%3$FgxRQkUoel$)lr_u$Is>Q^p3i`(xE+e+ zBfdKxoSjJx)2-TPgZOY1=99X}hXv8;Vu0`&Ox+b|S(sf5qzoI!mOOI;`i8RZVGOVt0}+XX%QZDO^Krb;}1lD_6}w{audHp@;Q)ACR0DON?cEW(k||KJq3AzVO=HXKlRiVuI$Xai7?qos0#Qx}#}96H_l 
z(7MC9#Di;~Nc7j&bRFddg}Medll0p^ED!Eq8#l@2`|fd>8E(e++E^faSE+o55UB*l zmJtifniR`F%?66W=cmHEkj&-Z8o7$t`YfydodGnY&;Ae52&uW-lEG=FHr0NtUuetj0n(zo^afW_-a3E ze{gf&z`)=hsGzhoTI(~qH@G3uF3;Mfuu@?&TdcDXD$;5#nq!1foCJn+6d<5&A1*ZK zFI4gA8_vcd48N8Jy-ql-+h53+{)uIBwvIQ;kg5`JSu?%HE*5CJ<^QW#*s7gqdOrFt2sVcH7# zSILm?9zg!WefZx{fvDAD#;9mVx$!Ln?`_X*d0nk^z@@q!_aDm|r(`fG; zDlR=?!!lG1t}`Mp;lkvauozr)KH;ZUcxC&*v<(6+gTAopLzjTB(|gaC$^I= z?$1}c7<}O6s)Qf#JU{Ls*rB(+;K7#=T3>UK2AUu`#8NS18pUbrHM<;@^EFjmJnja8 z%sJcb9}`3KZavoi2mrJ!@x#6R`Wp>EK*`au?xI!T@HsRb6qVrlbe)+Pa+#f<;i+l! zPIvH74GE7sv+crax}e)B4R4~G>dsWg@H8@UufK`4c>IemuL>hWex?4TC#&A+*F~L< zTS+X}p?Y0PkKo}_yPV&>qvXQrUJJPA z+eOu6Ml=8XXDZW#%(Q*7E2^u^W@}Q0Pg~VgjH^X5>o+YY)k#!0jRT3Zao_J6oD_z+ zcK4L+r$*KaZaLYUe7>hP-hFNn1l>L^`9%6|Tru?WlV7L7@Gu}HNB2dM&Q#0&oSQ7< z^cyikAl~0{uEs1^-qL*xycD8kw{~=H48&zsIIP}r;$hFYjgPLpK=j)B&U1gnpT1(n z(|1$IKIcyGiADF*v_&*y%GWz2)3kA|l!bpnKmv?Wkh8R_ z#JeOykhs4pTUM_Hnz(w~8@auvsiak}oN`h_6XvtX0&A4US~Fc=^DSPlS6Q>%jMZAz ze08G*^ZGHqzao(ZYNgai3%ewZ3Q6l#BeG17O z4+4Go-ErJ+-D7)brH!QALuQH6fdbr98ni!FFHwE8V7`(%AYpn8ezgt%1TX|mmvpz- zlM#N`msgSDTho^$U+4Y55WzZ|!UZ$)1%iq$sY$G4;lfW?p#!3=&iHIme;VJo21l z(RGyQQWAQ42c77@GX!U?=^N&;yFI0csN%DSu3yP@JUs5VhIpY%BwTvgqoRt7L%p#J zzFP7+@>x&41ZL4Nb5~b4FSkDaxOX+{NlkZkUI%72dLcoPAUnbmhO{87E;vC|^_~K= z8y#Ctcqo{OiDg9}#*`JWr@S^Ar8{#qJ-((3XkFr|2zr&u^0VGH9YegQx=p%IPUEB7 z_4F-ofrW|t+PBu3FLbYzc}ry8W`%S7{;Y_WStH>LbtGBXl>c#CW1aI1?`<__s)Jsa z(`NI!(d{F>hSB9#VI395n)FQ5Mu%MTDh{ zCbAr#Y46V>r#;uLIjS4nvJOeN0o8nk1W;Dg46k1Te(KYMA=qCk6YPz>2nL_7GvP0GtGf)@gl<|@VAy(NVWLUdG=fIhh^CIR3k~IDYqH1p=a+#!q!<#<(t_aZRegp9>4gJD&rOhq#L-eWO!)yO9 zmBP-oiArrIrxP)1kqu6L<))G|lY7_jM-SDT))8m1+%SD1`=p4R?G`~i_mc3U+2nJH}Gpl@Z~GLjvz_!$S+_KW3*BTjyYS983R z#pU~w!Wvd}S0s;9u3gV~7ruVBkC~9F_DyA8W@ybCYbG3yxy+5Z3?AE)|3RhsPOy~n zaZL5LE6+D^BV6x#?Yuvd`!hdU?SsusuJV{y$Vqv;NCY`0W38g9V55>+S#b@Q6Io;) zSxn5_=A<%TMm;`e*&nSU?o;xsA+rz0p zD)LbCI|g4zA*M4gSK70NY)KRGn6+^WZ9MKTm&~4)b@TyaosH&9kHLj8?(AJodFxyD zetc$OD;E7<2qm2_7jH`G+S%RRa=;@V#O{nz$jYN@<{Ix0e1J1XFW0gH2_4#pKq7#dm1vMYR%p z*kRgj;4=T!SZxq77>^4$@|5q@Or1oiTd$P zA6_JCOI#c6HtEFSWJ#H8URzhX<=bJsOc;xu-H@E6P7r}|y~S+A=gP`JV*|&FM_sX# zlZOeEcsCb3PN(sbO5)%5tG<$>txiYCi;|)3tui?Gw(v=>{x)%Y;qSTfvbX-O6Qlc0 zY5OghXkH`P6?KA~C|;q+!x4t(t3qDy=zVk~i|dZ6Im-L6CgAfTjR>bx(e>6e$&hk` zp>gPe*Kv3-yZn{4_;!$mp4^CJHixoF!;ToO{R_zG2oH8cZ=` z{_cBKY)WZ&OC+kTvJtEg;BpFI&yTU&yfbmIiq-4UiB(QWyypMPlao&4_^wkBPKJRT zst@fhH?aOO^zw3MrtLe=&&MC}MBvmD-xmEC10Z1%&*Nk2juvalO4NcsS&AmDG0WK^ z=Rv0jZib^V6he`kFg@86l07lm2uBq=84O#esH=Bjy~YJ8=U)*2h9~1Ahri}8c^%x< z|9){kY1>ThTb4ta`8tZ>>0Y&cn`u|P%&Gslm$M%MaV& ze4j8{=4)X#<*QP&H!yu4>lMTS5q0QS@hFVx3$*3TfHV%67xqpEm-h@H@QHFJh;gHA zK3I=hUqEf6^N**u*mpq~j=?*q(RymCIU^?Rhvu9x<>p}OIroj@-=2E$?=URdkHM5; zIs0MdBeZkkCxDSuWTbfbTzj?*x<_?5<=r!0ibXEWLy{}-^Us^@*D>+$wz$(dKzoR3Vi#$%i(cEO&b;HT$_W`dIrgHks|)2zNBes8 zoAg#XZDkHLvTPQ58{14E?a_vD_qe{lD^n2I%;q`D@g z287;@X1|8@fAREC)`g-Ac0483c|1%TFg~K|>a0?^CXTK%p$5`Gm3Wc(Gt=4chTq%q zdk3zqp{13oUJ#kzUaxWP!Y867mW^3F)P}cxY4t;Lwb&a;=seAhpcm{5_L>rZZc+X?lT(YZm_=$%P$OH?~1rm-q~d;v{` zA3(Ps$m?^?aPca(d{$+6GMe%jtZ@=WAPNoj;RVzY@k?Oiix$JX-Is$(QA*>;eYCnT z#T|Ko zFT>!c=MUT8UjkYi4kA7GdAVZ-4${A_Ca4f|VX?pBTjRU+(q#R}+U%1hOqG|$yFM0w zzfjd;QJD)XY!7vyi0mMv`k{WtcCMFxnJnVnAf?`35d+fo57wxI4dz)|89hEQZ?NF3 zA#eORD6ax?dcUD8bf_O&<=pw5Z+BW6NXDuW{G}LfkgE4r_xQIV_K07r5RV4mfgH<_^ zHmyj|8!(fEE}tgD?{h#QdHYPz^m+Qi$jI*Huq$mRx_F0hjn%QOlG>_1l=K{E@$9dU zq?uYEgSFw76=mgtGrQS7`5$9_Y`) zNOfJE6?zp!39D=f>DoZ+8R^YOiY-D07J33G_W%vPg-VsDOLy0M!{He~ zIb5s3$yu|vZPCASzDWTrb(K#q(wU399XfTNEc~MhA#k_)9dj6Dkt@hw{0Qeon*e^^ zKn7f`6`wl%)0_Fn10wK3u<`U;l#QqKhi|I1Vb;i*HyG%c3I+<5J&eCzLIWQY@Q6Sy 
zz`REZW;(U1e_UTqb{9jKLNuaD6fYOs~3aP`Bzx%&< zKat-&H=%()UcEIkU|2mPO8P;*TJ){w#V$tcrG1HwCR;YyTVhJe3zXcvT(6n=t4(5< z3sTPV@};oc!F^ne3e}|UTq6Ef8J(Zg+e8EPBCk@CBnOG>*5G&!3~ z3DmyY@pYtvY(ko?Wbx+e^8WDFi<0yGi0#|LjjB|1>|y*TO1WW@@{u^X^Vn z(wbC!B&XvW-R-ThUIJmt;FUu=`rR3@w))RvDc@Nu6e@QjxGKEP7p{N2=lzD1%OjaTIe)2H>RMOk zS!>B+y8VKbE*8yH%BMom>;8G9jBjf6dYl{aJ!PLX zyocE>&_xJ_QnKc7%|^`_!$s5Lz1!Gg6?xpcih=D^j}9#35*WDsedo;iy4%JqwPGTb z7KB#0wl9%QdvecMZE}Rtj<^29$LrS@-`2tkpe3Z%leyIxOB!@a{GsC!8IvLGBPV_P z*9L?5uH|9|7>7jcSX&f0hm7b{p{)<%^2c(C)aRYC+m6PJ zW9>|#}Ey6Y**(=>`?#K zV0$GXxq%)4QShx)kMdx}Yyyq3Th%q-cJw&qE6D4AvpoCx`CvDx&U`8;uR^{%(ed}b zi092+Fr~vG8A);Gmlf+BErF1ygo>>0ICI$7N|QGoa| zsj6~QxVg4fj>kA}bNX{usQ{bXT(=QV>~>aUd|BkMFaw9jjJ|6j^Aq{Vs5IXdcddnq zxw(TxABiD@+(2i{r$V)SL`mf)%RT&od)>~o(wTq>1&hfN#<$x`!DCF-+c{`GqK&eV zdDsc)1CJLFMX^+-(!;Cevg7SGQm76_p+DZu`O3J$?GI)YE21h9ROy#>%C}R4Q)ZL> z7vx{CR3V#p3f|a)5gUwB6Z(h7s{+k7bp4{0url-wY7~0rdT&-)YV71Jhcg6-h~kNW z=Qh3}%C*Qm0XFKW_x^AEH0#YJ@0(my3;Sxf;_|_V4=8#AiFRG1t@R_&QEZUDv*;X> zbWU4n_(iJ<*dx{hF3%?Iv2WaSND~47;iUc_hvW>JJv9go2K?pN^OCo<1f_E3u5Vr0 zPA|wY;N{Enc{wb_3(Lu!=8Jg>Zt`hM;nOT;8W!b7?8P>(Wy`d>Uwlm%Lu*)I)b(9F z%otqm_1bxH6}0~Q_V(Yxp|E=K31g4m)!IUv_BW|=m__K}Rf^nsYq6I7#KQ%f>zm6c-enOL8Z*jNY$p;DO}HvU0lT<`-5 zN6Z8neoPqI6XHsv)4a|Rd2Ge^IT@TjL(r8hFIzVmJWeEs!z!I7HXjY9V2*m2k@FdB zatV~cC5Z$T^xm+fVSg!JKBaAraL&Ya<&E7N7T=jFOp^@WDJjkA!yO6~p`126Dq@)& zo@**T44>)YuZXxNf>HYh3mA{qEtf zR^boX8`Rw3AbW%f^W?nV#*^gb-+GN+I>0neS6dlTI$LVI`571d z$&Bt-?p{5D%P2fV^=P5zsvhg}O`{dbPAZmMYC(zz6$tI*xZS~0*Ds+I{F(DoH;|wG zM7I-@eIS6hscBKIae1HLCEa$Zaj&X$Ap=t;RUf5ko-JWS4`n|;*KDq=N)uQ~D)I!F zT$Nw#Z?-dcXNMJ%roBgId!QDrXjYt+BlsSBt50IF<3Qh!Sz+dU_G`2@qv}Gq{>xpc zig-np!t7F>Bz{kAKZ#5PO~s=+@^U|q$*jX1m2n)vU)8`7B62=H?BJ|^%3+)cCn|Y= z8rUMA#%S7rdS<=gmb8YZWS#m>^Rs+e-C7Qwl7q(1cxjxFYYfWj1Phg7>A+y0L4BTe z$U#~4v6SL$3W}K5XgOAY=xtQP8%q&k5tIwTnE6C~y+bv^<|(-mTJG zGo(^_8%-G;j$hjPnFug7)f!LwIAX_{XfAG&-(ec!xI9e2Lmq0ZS$TafwF&z7JnqfX z=>;2^)$Eek$nJ+b$$9OsIZN!qJ+`2M$TMe5Q!0F;MhtArom%x<3`#^!k^;=%?CEMP zCbQ&Q7dGSCRmTfDBq6QWPL!IpsbYf{$<=Q0NIhJTQ#2eyyz?dvC8?YfAe4;E4|5)M4MRGOgJP0koq((gC)NMyY|S=j10Hs^O&g7JM*24_5W2W=95iNA_NWI;$f zeu-uqHL(C-;x}7vI^qR6P6=|VRoplqNv|*G-JltCJJ2_Jp^4FhyugWK%Vmp}?8=;2 zWM|hHL$d0MBySh5_BDU!Y!vEf`6+)NL$+q~IIyojYM|5lI$ zr#@(;mE(IGU{p6Lo5-KRANRh(zr!4bFc{q>xUq4#CMkAIPEoH7scTuVk&@;Ge9^t= zw&4JC(g?iFSE{U(t^J8jn;7(nLq)T&+ifwYe5#myYdhL}q^ZLez^Bd?;B78M&!ev2 zSEX}g!bFLByFZ@h>1l;ZtB9EoQ%~#2pMd_8 zm%C1_80^#C*Z;;_=X`TdsNbWFE$X;7lm0ZBIZKaT#f6$Lz!jILOwT5nojQv2TQu{H zGK4!iJu7L(OBJO{!kw*Is-IT?y5ezzK=Vs+Zd(Ide;;#42#r7D`iLH7&y2_AK4k9( zIq6T?;{S}E3cf)jz(YKpcRmsYeTkGa{2?DY-E5SQy-=PotR#!CX4^CR$=NDDV`~5X zSgVg#KRr#35mvb;&lD=AFlipoc>zI?qly>>IgIWYqi>}?P}!f?q68!Ro;9ec#>U7^ zcB()@CpQso3s-{LLd)^=%wqO4-qWqTAbOP_>8(u&D8EvzPj`<~4Pi?TPs}ETJsTvp zQ}Y=wvn2OJnWtyHY+3TjPZ%nP`77`e3w`@{R0<&4cHxC!qK_SB=c~M*_Sa;!=+z<| z=EQse@zi2wg}s9iyS9YMN#qKVk9h5i7hsLy^SOo6X}4Nnk5oLgQho{+y+V!Efh!~g zRe$&?{rbZP!2w{1Xd`Z@w$-UO$#gcT{jO&CFj)qS>o>IHnj870@Uhl2Zt;RNt9j&_ zxk5laX>M~lp}ZQlzV>AjhFJL4UX8;~@f-I`Weid6jj1E96^4z76XHVVe44~6+pCw& zTfAz5jVs^gYLe16g2K`DIv=QBJp7L%|37^TqI~&Nu<`M?_nSWW8Ix2u#m{ld$)cjR zpXDV}dNTfI0h9^c+us;2NfW1@6-BZO;=@9U669rN!IeTV^8B*nWDxY`F0k=*Qkd$X zY%3KdZm~+StvkDjG)0svcN91$2#~<;b=&D`(X`*~s=vXBPn0jy%6DaP*^no-zYY+w z;`F$7Jtw)>w{gw)yeOKG6*rom`2^=+Lt}grX;X=D4Lw;V(|&K9X?eLtFg^DMVjSGS z=Rtj(%4&dKYbMsT*dwdeC|Rpdc(G=I24WCbblQ9EW699OZ$b$|GthsFDTqx;6gJ7s zMtxC4Eyl^|+uH1wCHi)k+XZX9o33X{^>;z(&xuFqguxW)Askj^9L)j23KE=Ezx+!Z z^UWj_%lq1y=9yGn0JQ| z-J-(2@N2T#WT}$o80o+s6jwtxG=R8?hDj7JAVG)@gDSn|4aVA(*JK#ajtkEzzlH zeXL3-*~gvhi)|KjQgsD11|eo#7h(e83^Tc$TU4-pp19%O6KFoUZc9aSt1LLysA 
zN``Hfm3k1}X0?LUVl`DHwGFGJ@=MG{GU9B)Y!ZKXn1s}oYC!)E15SP#nJx^KF?g|{ z$?WQB+c8M0o4I`k0BEjbzW6CvXcE`G-=xwx>^iI?I$ljMexc=1w|J0XlJU4VkX!U1 zH=wHZMzkxpNMv2q)dyGQ3(gV~=10q1njl9<>Bsx(E=h*RXce5oZx!P6Et4!yfpYD} ztA6jhq$%s=k=x&cmb{pEavwXj9hW3E+FaOkbImkuloi?Bww!wu>u`BQ9Zo`3_iBSx z!G~U}pcBgzTSYjVB`g0R61w)7O%2t}Lnp2cinY=H;uM(x}O%>Obn?~nMHi{9UumCYH9j)?_0Q`QvK ziAgE)vd|c7D;ot9rDvkwiS{-~ZBe2}6|Sw95a3F+5|sk+0JeI!b`x&bN$&7yxS`pJ z@6R&%ANwCjAV|yQhKzuQFQd}RN18=%G0+m4?I-ZSK_A_w(>%GOPRhIftDY!Dim`s@ zL%ttNH4T{W2hr|C&m{NVUY2hZ5bOaw*03kv~m4E$$UR&iKEC{qD z<^rI_RRa+AD5Mmj38JNM!aheAR8p1t+7jBlh*I>gCH` z^+;%t3mz_ew|q3+Ae97H627B@$$oj#Bmh~ihMb}`mIiEp(q79f!PvJjceq4VRcKP**7#< zy1ZlX5K+WkD-TXPi|sDyv7cg}bx=ubPs|7aYT;Kd$3xV#cF*x0Ox8>=i#=#fPLY8w zjaTDfn%K2ZtJ)_+i!gUQQsq0wUm1z9%xbKiwc2;hC<#C+0F5%#+Xzy}dw#6&c`>O5 z&JPYkVq&>2Y6niNIhd6dIj{BaB3Ind&BrpHt&2ae5Y@r7xwRa)5ZtR8U!rj-pYz>- zUW{s%699X%HL1XthE9)ryQ+D8pH6Bv6sD+myw#PIk#c)i^w){k4i8}wV~UJp19ID5 zV}RVUEh~s>Gw-KdT$d6Sr(f?n>U?I)bHL?2W{LM?)>5=IT9O_h+2Z-?Ubz4YAu1?``leK=H=9vlRNP3e$ zR*z^}`|6d&5~#BOB{eyJhN|u2y7JdmOBxK1G=ofBc=onBl;oMiX(`+F;mFatXkU%7 zdEee0ZV`W1h-=Bba_$3q|4t9YT2~mG$A38_#o*F|KbfZ==cKLnI;+6qh4$ zZ?TS8z90$#BNpP6D8g=Gp>xf%=3ihaP~^9hKe$|TQTZwg0`GEtubn=W!hAg4ZHJ8& zB_T)Y9-e?%EJ|~Z*Pv%K4p9d)t>Xt=#+xq{2ic8E8qjKbU7D><{JHjUp7W=6`>1ML zHgk4E9<*U1^F|>p*Mr=ShfKL<#)Y;iXDJQi_Yd(VuN~;_yRtZo>QP|& z=`jbh{#Z3Ay04PqE)8l(kuN)Uq3j|fQQF#`N>IRSwND<_X*1C1M)>ct2_GRuE0yxY zgNw{WMW@DILNhz29NbIdG3W*9pfoYL6L^!WsE@;jVZY^c6`QGCIN~8dx;*Z5WHPm& z;df~9W`CtH?qYwoIei%KVqJ{UZshl&Q4U}u#tQx-G^{F{kr$r=E}{$6t8)vpEgXZl zyx1!5`4z*ARV&giqw@wLD6|_3i_Jr;5$)86BM@z!?^Z9%VzB+Ow$CSjcvK~orNpuA zG37Ou01ZBB!iAuqxE1-K&e<$&iLZK+q*J5eo-KZe*>q0+ttS2fpwmV~9GsS_bJ?$z z#N@f!2$R%*QM$II{Q6jD$eZu#>f`4DNI{-4V1MQmvPncHwe$TkKwKpd?z4ynac5<- zmF-ex$xLW6sZ>TdcGu)z0of zeONuF9dgR@|9Sp<_^{ju-iD5tTNv+}K~DTXtJSwsK0M2q*5aw;C(1U7{iO>Y8KHU3IUs$FZL^0H;5YLMLn4legoa4{L; zbf~}MYKRxJyA!2&H(0l(9ut|qXm~4Q7XBLW>5_3Vi9Mg z*PggZPAja7ur{SgP(su~AGm9FAh2Tt=2BLp?!ZRGu?q0~tR&xNzyFo*lt=mScwe4D zL(OW|a1g>NbFg@ua1*^Hn+?cfe=Frut%K*gfDDg_BfR$EdMJffb+2MY4+?L-HdDh$ ztRT{u$5$E%hD`~<_ge`I%_0=!!_7|u8^3S6%jL_POhtUPH{CuVlzfa`W4c}s zqg28iNxl3zzkI(CW>^8Ix8RM+Tjad|Bct?ZCQm*T7TB+~y6Kp(+31)s4E(%K;cq1F z0(GfF3z00gR^#Tfeim}<(4?~|gnoNyw9KBiAQX9_R&Dy0iqhl%BksS0n%w#*P}qi| z2uhcZD7~rl-la+CAYDOvlK`O!2uK$YNa&$M=)FfkrAcTKI-w{STIjuVAKr7$_uc!> z-1~a&KX=AqWF|wNXV_aabI?VidS+NUKGyEmO^y2qf_L4#O$3n<@Xp<_J8KGC zCu~sCU!OkPNVy+1sq+M-)6+bXZR|~>Z6ZDZ)yz52m>BR67RdN+>$0(KbyWxHF+k8H zJQv1|v8I(aZ&${D4d*%1^ehOiNo9XlsUsa6_;bE{1!CcIaNl~v9QLa2#`6-v9#?|sUF=_Q#py^|MV=Y-1 zt`E!8jX^LC5H0$#Z|)$|9OG*)P=@!`YSZ*Bb^-2b$E1iQwz1E)j|`vr#*yfuymDa5 zIUyPY(QC;@&m3RR7*dJ3UD?BpCOdVo{vd)sLX>0xr!)6ATdU+0zUxwc&vAi;8MnVb zH5PUc6cy@F^T0!XZS;2f5sA4-4c?67K>|BI%v-mA?U6OFrlEH}^`8H`mJh#0V^4ZE z8cWZ+2(3DvUXPjSt9YN0d|n9->Z!=+%P7=(s+yBgx04Gb zK8I)#ZJ3hIiuzww1DFBXA5v5*@|h|?54xaUxWpIJ(3Y6a)Z5PJ+*YisB<-hzlSZ$( z7(8+OtWxO}t!+=g_po+hE;+scIE^?dC<8rTQ{j(hQR zcKB}pDBK}B!RS#dOF3e?Cbs1%Y~AfC)qs+p8{6|(Km}DZeBIk!n8sp9A(As&qs$RUu?z71941FnFY$|EM1!R^`-hqT$@2XkGadvmujW#GHy3P7(63Tj2Wmq- z+0@#aPbkDf^se*7ND z>*6Pa%94qC?IJ^C*C_x-dScifGjFk;Qn8v(pvUrdhZd%xv6gVZ@zlzkdw&Q5CWs42 zWE;<$o6t!CXVCc(A$vh>8qif?i`}<-fo5oD5wPB%{1sl)mR$d`$#z~@)zA4vS;p8G z&BuSAws7-K5Jh0p>wv;N){B}MdmvnAK5l}KXKL0K?F_W`)EDO=es-c zd08d1NaS5`^`Xj#8*|ctwuC(*ym^u1O`e&4x@_Z-c30((QmSyH$tIIqY#9pattYEd z8uFy5#%oiBzWjZ(e8!EeO{&^U9MMvvYQKh?r^RaW02$f?(3zbWkqai3AN<5|%2(;O zNt3&sH7hh;5?n^jO=5BS>FVQaR^i34#8Z*%+ub#yirT<4w=ME$j?Nn^P8UidGoxvy zGM>>d{-`3va69fYWzG5{zoFLp(cgjaHB$q$YMFT~Oz-i6v!sE19Mgd!Bu>pAxD_a| zqq5ah)J%YsSaU00C<7iP9E@Xi)^0f8L%649?-Wwp(Uv0qbg~x8HC#?g*4_{2we9Oa 
zqhJ-*Pc;_yy;f^?%+j|qcG?R~H2YcMrLi`-YeXTX;?*8lks-KNV`r^LTbwD$rKDoe zr>l*|M>etecU?0erc^na-Ozg}k^vm46s5=>Otw-2SGQ~L^`QFlG6dImm%O#4d zIdJbn`XwOrKnIE~(l0 z(0lnn8G(Z?=MvcgZ!qwH*eZyjD?m&uHqW!1dnFxdd9M zwKqo#CJAou04aja;BGiaUG80@0bbAQ@nD2<4qM6qQ`00DM`J2f*64sCPm47hALA%u z6rw>SD`LGE!ny~r5cl}1Uy8eRIgU$mEfEJqUasReKT?UlKXzL#KyybZfmF2@mDpH= zXr5NwBU~Rb$wOeOE*?y)zJQ1=pvE(S&itl|HnC$@;*8L6nyLsGuD54xr30zc4b=oR zkf6fftD%)bLPOAwAm`8XKV1m)OIu%@u|sj)WBLyeC;_dA@s2h0o>4-6i!jRfi6vL9 z<6v5}Ndur%F{^KfCd&XSh3|gN>`w)vMk|ey^;oL3&E%UkC{ku~0iEDuc2WOYw{)O-%JM^g7xCVWr%0r8K1 zALe|JHaJi6>E6Q(%&n3X;0=}AItd>w=NM>n9n6wr_8)+^t0(dQm@Esv>|no2PoBu@ zvNqWLW@t*nI`Iv)Lq>Rl^R(kZwiY@>+521PDkE#zLsnjs9N=2HA|0PNQD=!(THm7; zm?Fk)_<^hlK9&|vnLavk2fMQ6KKQhW9B%zo<)DAW-rRX<3EH zd8g0oqdk?T{1zkvzkPP>oj&g@DL0YRY?P&rc!zJ{Vkkc0%bVmeCNezZkBZRA9&9b2 z$~2bW3jpjQG0Q!lA@nMD^)p~hyQ?FGL6gIya$Rf2>eIjiHox@n%vSn1QwV`h50J8}s@TsqnFk&>igQ;4#3Q#5sYxKZft7 z4L$d(hZF-x5OzxE_C`ttB|{0~FmFz1ynVHQxfB0qGhTOJ3NrFRLyClj-&p%LU2SVw z5d(Jf#!Ju}L$s!2M9e@Z;`_C5{J8*l8zCbvjBs<5iFsBH3xdU4S(*}Nr9J()~cOBhkLXO7+7eY9ma6SXNpMDWZUxOP1v7;7q*rvf+!8Q4_Udfayhh zoUgQb+0CBULaJ4M70{o5 z-fa8^patAPsAXCa=IA8-hZwj{loMT7NXKX3@pIR!>D_#tZZ*%lGqpvW2I>r|Gg)zx zIpXd3wM5uZ+$?{)aZeQ@SOOs4sTX7GqTfLswt5d~wgk?9VeCKup5Qw^+1`U>CYlL)6m=;*p(JRi-wnS6zgwR zDFYdtX99&>>~GO{Q~}h1mMpla>nW}K6V)WW!8zec{T%lpVe-pu|CbY;Ap3jKo;;FOX8sJZBfaxIP)>hr<~C_<1@pT zb>-)K8#zsqmJib!d|4!vv`+@J4J@d|3NloPxd_v4yF^J|D1Gi2VkvSAN6xO$`7D2x z8%T@Qwf@t|$i{aqBp96Xzz#6up$s}d67R2!hO-dSDN)5R6xZlJH;<5;IxU=sg(*%+ z>$##vOS&2%CLLdGdVl(nO=l+?vi2J%j(g?#;ueM(xn7JYb94{sJN*EN`DPU?!jUW* z40PgRATK}-`h~m3!$}V4B})6%xc7?i7o*J@<{*4+$NQOpQO@f(n|@RdNl^OG= z`^3v%`6yS7YjTB0AAm^w>BFYvbS8=Kd+Xy&HwNy00Y9ZL*u_j%?(78f$;pKy1h$u)!100 zW0=gFy90|&LO~4@Jkiu{xZ(8tXxeOf98(>qWk42HUiR&&Ug{WLhlICwKLMEJO*RLt z8g47!bVGe7>zrh&yJD)sJ}Ho~2J*UC#cQE@fU;<&mkPoBA{$=U?68fm+Fab>2 zBAu9jFnvXsv}uvI_O(M!mMPUy^E~q|Z0cIO=)BPcDTW zp?5XrCEE(gV|UvX{n?xKwo?urgee;9L>WDp@z_3- zWYjVVvNm!bz*iQByFtrj-L@dC=qzM2NshRv&M{W&bbm-eN9@Tb`77?cgSDG#Q*_qb z4Y_C3kYH7;PgpEdncGhON8N8ywi$r<6riRA(IdlxKpm^leTPNwDYJwPX$x=XRtxK7 z?qPh$uPJL_Cqp%E-;X4UO87mj2(fjV-XpgzpY^SKDR44WHDX6irh~DquUn;NURM!0 zI;wd+@DQ=~M#04;9D|$Geg9PMSlfSr@t_%N1Zfm6e+=>>z^1K5vK*^=Cv}2iPT&Hr zhP>=17m-6zsgf?BwN|2G zm1+WOp7R)J7~qb)ladwmox3L7ea}zvI;PdDiaqpSCp;<5?r^D}EgJ>OzVVK=3k;+m zsiqRCKZJ5>R@eC+y%X=ehD6?sfg8BgteuF$7K5&ymYXVt>9Q$5@(Ulr&sbzo7UoPB zzxckl(-D~bn5aUlY*V1(;*iQZOY`0zj>rDX@yMSQJu%PiYf?ljOke$^%Y>?{@XPPK z&tcr7Ym#4NQ}2#2XX>e~d|&?l#-#0-IN=H4P<_1IuJzK*+^9X-i4ke6%s~88nRvqN z83AKi)hovI>9UXcmLng)9AVx5C{;G=Z70SwnrloEr&zQXnU2^e2!oj1Q?99D%C%!x z7C`gJNSxX7?np(AQawY7IBnANB{kaa+ znSNDf2wePKMbQc5OwT)Q{+ya3%Umv+G$Se75xuGN!On~)Ng?hIr0eT^$Vu{u#`@E0 z;l0-emqmdI`1SQVdq8Q4;3fW+tXW4d3aUIXMyfPUMu%}&t5K`b8P?`b1ztP$n-=Uj z)9v>qrm|PKVa(JUZT@xPDMVe|=cuU;g(ytW5=c;pGBOxnE(`mjg4T+lSuWP+g@wJ) z5)IiFc{hjF74_XpLEeYpfJ>2)Fc#ehY}op*7BF-TAY!5P!6COILh^~1FgXO4VCr9F zxBB0#0~q}av4>D!rN&KW*@0GWGwVZfNc|&V0jO@!TgD{7_{}I4#}}aB4OEtYG}=Ek%}a8v$-1QSUJaB%)8Rl6M)Z|tYj>bw11-=6 z4~LUFa<$Oa4BEZZH$t<;M!rN!F+tb5c`OaRI&2@2YN0*F4%meTj#GvwlLTNjenZdi zF%wswSNI_@WL;K{_0Kvg$x&3~#|a0LsIzLJNcbLBQ%_JCwMw9Gu{$E!SHI}9QUK_y z9_~G)d($QFp+I{BS!eHP%aYuwJrT>1fEP5AyObckJt2~7rcS;?D~4@;bxVm)r8Gsr zh7s{BkqP9nuJzs?Lo1mn)WM3NrYN>Q`~Rl{{7)&{l>n5|7tSnWg(*$q9#}-&RXo5+ zrb4{*qfVfYUMQ|gL}KT7`W7HNDNtFBC}p_6h%t z>-sM<$$Lg2N*2+{wmuf8HHl6L*S|LC7Rif#(vI@3MgG++E5 zXOsVW+kbuqURvbOzUjZ8@Bh=6eh)|jQv~cZkV|@ZdYey_ceh74V;FVI^`cbFTK5ZL z>Z53$3uKP=rT)O7$K1|W)STNg&B$5rk1z%T;BamK8E@kt__}V2|OpAWy_h0c8bzi3LsQlhP zn+I@i$3SJ5x%b3-kA;JcYx{tOxRr$DCBFqztpj@8>Gxk`iuvkakM-qJ2I=Z%k9-Jk zhQvq1o8Rci-1!%4QWvHyzll 
zOnGL-!*V*+w&!9X_bgFZF6E1i=|v1iTAv6|AgMRWoV>F!GZ zw1f5W49i&K7pu(?g2o>aaipJc*8A<6cD&~6EI+U`^}drigFz>nZjn|T&!ebEMkMqo z0G{GDr|91f<^QD``m6DE9$vF#f!br5@YF278s10L_gchGty0Mj!Nnownw<17WR$Xe`VksoS&R$u%e^SK)1xOZ=v#rn(dS_k$x)qr= zH?17(PPoRL=noT}&y6F z3Vs0RzgA%UH@z320{XH6Y{`>%8GeKwyW>hUBznK+a-YO#5sSro8STn80tplx+`em$ zY@ug>5vg!$owlr`9X20&FI&Lom1tl_l&`{{6*B|#kD)`j0$PN-3NL*H4m zS^RUttaLgIr_*fdtV)dQHV4mLQpJ4AY8f_brk5Z7@y!3>OS0ZL<$&UV4JK+H`u?H& zN_$suf`dDDPdf-K zXv%X-Q}`pY^Q!ajyVz;Ie$)Zi02%hB13;wSE^9IpW9VT1EYH$P8)SNwVeFON7xuTv z+L(M?<(5-0IJl$3?c%E(dCvrc%P7Y^ML~ciWgxy(e4Q-;BFH@I)pidd_F%yciB-@0 zx)l_FLDo~FQuV*r!T-nAlRV`;pf%}B70d>1cml54i5l)-1Q(6?vJPl!McBteHMK5* zTgz<=;;HRYBs;gnknOnoVqa#+-Xoj7vg^>*=U=e=VL{}?T=YHNeI6D0?~3HZcDrv4 zu|H!47kvTTco_<%xDC_ke8mv*aO)vJ5_TOx0 znP8G9x@==b3fk0@h+6{A!y1F0M=Rv;7H|+CA6I_ntXEhd+qkkiT<6xWV@Esjp5D@O zD54Kp0HvEkA@8nWf}Zc=)PU;p^7s*FjoZQ-fyW-QB5L9+UF$$fLqTT6_hOR(jebrC z^6dE54`4t7ww%b|Ck@b6g*#xNAWd%W-(z?GJ_<%|>+px=u77sHnft(#3pq$TQ%@p8 zNjglL=@b`VGV1G`w!4jbXBnH@Q6s>$6u*E)I>~ONM7fYEIjwUid1y_@x(g{yv65?P zH5iXj=U!7um&`ER0j}ih10;jH&ud^EXD(V@XY~7b9uCDDL8GROfPQq)2z@ z*>hkY0~}nt6I07}y+Rw)2!Mj|lsGS9ffgI&+;m?aAB_MUAav{;OtK?uA}*bPKke@AvRjD3gQLZXZ|?(r8G44!Gu| z8e_=F6AOIQnKt65RB6c-Wkr@}1E+b2rlJV|Ti2m$uAGWH+2ENv6u+TI-r-C=8h zI5H<4Eux}!=dh|V^GPFF)VBVHSsH_khqua*kK+1%v2M?mOKtv&Tv|Lw80;#jn4FJ#^Ou1SkmsAP&n@XF0wm=0w8 zCwZhngZ-{)b__tR(}8sU2zvapg?>idctH+~d%0!}kh~1w5jJu(yurmJO#E6?50pM4 zdiD)U9AvOWBCFVMmamvRk)WHms*#wI@Y-Pi4GWO;gOnA>f`;1et{xGnH>J$f86oxo z^TK@&9`-C}5%(4Cg*Q5ELKde=E40%o)5j{y2eVxIvMj_i3W8yootm*wGadPf7*|4( zn*d4u*b2NmWf=ifkp#x0a4x3w68+uv!YRo_UxMBnrc3~e`5^}XmupoB+IjYO(aWmU zA(}d@Y)ZNv%*;4_g9U&XZ6mVSNF%5vcgNir`nF73060d9IJkK7-$-73Z~=;Yb|W=U z@hUM%WY*shS6uF-iK%sXqE0Ft<{>EF3J{D&tDeZWb3Krx_`StN#p_VmeD=gi-p$pY z>g7&wo4FqWc71hitj41G$(3eAAwRWYwB_njNlqd&bH>Rj>D>oe)?|hVB8KYwcAd)z zh5Jjrx>~$eNH3xE6v2BZ7qc0r$`s(%&4*{coxYf8K>?O$3sHb~jhR2%ekn%d;)oaJjO^WI?!%*kpcPoEa;$j_8$-jN*mpJ%4+m=FMXF8!;zJDvu4m zVC{U?M+yHXtW(w`ToTH*o>p@MDGY@^i$u_*xRuS9%N-7Aq4La6f>lT}2p|#E=~+;- z{RMSV;bI;8HC8Nyv}A_wRRYGOpY4CBd;O~$zW9JHleKP85+I`M<#d|81pw25{z&Be zoSI>SIeBCmh_jAP)Jdt~(N*~m+W^WheCdB=%T0hSF==9iK1%8TR>b3o`P5)iI$4E@ zn?x6AtON1^Sd#LwSxmDcAdA%etng+J3a{P`6j3H9%F~F~#Gc%OT77Z@Q;;;tbHxl7mF-a+8|Pp4TmxM0Lvd zp8%>>)F;nA!35J@7F=4myS4g%2s@qWR@3U_?QzF`lfkYKjea`rzcYOPH-qpWUdGc5 zzkv)j5`&+^yEwo>N>TjsaOXXq^z(?^Pp820tuu6S=7fiIZmwF%zm0v2UE-vu7Upyp zmMG4>;?APwGFqNJb{UbjpiYlyx9$<#$n7oZN_a>^j8bk=B(A@m8klWfWW*Y#9Rh5$ zit5@bcGTh0O0>9;y$$8d6opEl(46J`^7{@DZP8O#lWgX9cy(G?DgW@d4#%67Wezk;-jV1Y5W@xHJY_LXY)a@)FghUBQL)Iz-`@abojZEAx z2_EBz6yQ)q?o4JTNPDT~-#ClzkIE?B^~=zT8P{>9G`B8&(qLsSb@;Y}g(}IQ`r<$) zP(kmqUWnhHEt^Fx2f+~@5DINxtDf$GMSJVW zwtU}-4wtE)=XUUF{<1o{{|x86Aj?0*)c^TdzyS=%f;0x0A@(Yn4|FsxwT55VujZ9t zd_Sd2_&bf|tB*yil#se^NSZ`fs;k*xWZGGx=f>;ev4CTKw;Hcn_fGf4kyWEf{J_FM zZp9~|Xb%V{oep4Z&NXf~APSjt)s-$a+y$s|)RsEhlm6%Mr(;0yrw_*iUo2WP#Nwys zDm4vkyuAOLt+`0jaj6pEFjw<%&Z5ibD&p450+!Psh=U5OyY`C|on)?_RrsaC4>c+K zd8>g=IH5v3j4IHNNpJT2D%-A|;5#&K#Nn9_)Ff5dSly)@C$Pf-n*9Ol3!og}?6v!x z%Mi6FM+bX;5K1TpeUvF#R_w(G6!P9{*LkE3!1##z$@ML^Ag$R(0tm?I00#W?(7>J0 z=B(=;gFLY16Ng0*{Pb(-jEKmvDJE3gGZ)kc1^y%G1un5*KmbLosm`&>nZV7%|A%wn zua9}EU(N&(i~BlAL4>49f!@VYSM92x75rF7&xW0uSpX-c@2ItI)j#KDIQyjNs;{VZ z3OMhWz?lI{7XV!SGxUDlgp9_eFpj_QvUOS4{7;sd?%cVnW8&Q2F|_`Ob?-IT4&h4Z z5ao_l5%$(HRn2AMgJ=#P0Ep3xTuvpFB-u?Rla-0nctcSTp*#~Wd~AE=hE(h6C-_W} z&*yt$20D~&>@IMak|Ynqj(`>rqBD{UxClz8>JrgtKDfKfOXqwIkgK^*KgjdqW#!lP zDL3IOsVdI<_DZd$X^=XE+T;01rcM~N&9IP?TGnjyh(p$#{hJ`%ghrd*Qp3<*~GpaPu!NN3@_ z&|y31_-1ffvCSI)%G7AYmEfxPSgjbps=ax&T&mP{wAb=%>Fb$|bWB-MUkjjND7s21 zn|M{i*Z`1@H)k%hBcEAee^b-%0sJvpM$%l>8Fa9%z$dh>D2j9JnZ8Uf32J!&`FBc* 
zvEwLKRQ;e>{h)ui)yO*WsY&Un)f$DEqWnaLZB8hMkTj;H{75nmC?pc4iK|b5q6^Q* zGyY-_Y6x=@9ztEi@L>O{c^H2_z1n&Q8?wKl(006=dl`=Da_}(b)?$g=BGDi}AnRm) zZqUbL03>WJ^&C~F2?!at0&VHTf0%DXfZ$m(V zsjjU^q&aOY;^0L+W^=NAA2kB(ZT=@V0o3M(RutT`pL>?xPS@FY+8R<;C)FGqT`k0y z7Fw48!tUrB){329C1^PH18EB_x&+wXeW1dl^L;U0sH5a;+X_?X`@zwIqPKf*xJ2`@ zfa3#&Ig$bx3aaA+bGuS^#Ig>l=b45 zw-6%ecIbwFmBxBr%E517{9Nj`gk&2dg`9`7v==3o*Ih9{b9QI?A556vxr@)Jta%y( zqw$@&cl$oAPO4<8P!Mp+&<6Cx&E9(|B4P<|sb?%{JTzop$NpMNdI*RQagWPm% z_}e})AYA+V9!B^)-J;OFI$z~e^~RDd ziHCr7p4ZTGejtDYSOIh66Uzq+3H0Ft0y^dDyzF#Sw|np6?PbP>^#Mwk#VSQSrcv%0-n#MzfR5!WNn3ofW}3JAPt0(! zPjq$ltmACjnfOFq%^pAt_@Hf8eiY@Qtv2p7b8Hdl7V(df~ zcO9qtQk%`z>lS;C87XG{B>nSAIPL(*tRB7o4rph}Z%hBz)V%*W`O`@^ks2W5T#CLl zd2VWBbpz_5}|AlCq!Yn4P)U;%A;M(EJM*jT`>G* zlMNx)xD{9Qri_ZX+wlsasLOhlBWX;c{b+%VneXA%kzc#{z&UMquPEwsdAWG>`~Z>n z(~M$h!`#io9%hD5_)zopX#_kJBuuNP6NG=f8boDhg357Y{stcZpfooXgMvP1>U2~adKl~$nq8&!eGJCmhzoxKYp^+tL&QWckzKj%(^9Ijc9#s;z zghTh=dXMh*I*gP#!C#TMXdwhkmE^g)cui`-TGa##uY>XTQUqRWzM6bguJy^gK_n)X zh8(5*6K|-~D6PAl@CnYPYufR2W!zNME6lI?G%Mk+KWqYp*ER+y`t-^|@Dw_B z=kw`!%A+P%{l+u>gfq;RAzP_JjxQn+Y8rE$mmnhqKF_bHKwQ$FDw;mY?_6~3_*dS| zs`XO0rIb_z&B-SP~5hvw#aE% znxMePI)uNX`ZqM=l^dt3nJ+$os|eI+gmRDKyW;StFS9Yu*5xjB^IJ3Fn}Bzt5(-NN z>{FP9n$|Gg$8hV}GB}iRU8|Q4#mSZZYqgI2yLlZzpHFu3*jla`FSvcNtocToWsDpJ z1nKRAc5g0&)=WX+DM{O(@ZR&X0Rszs&r{g^>s0j47(<`Fc>|5h7BnGN z!7I%ha2eK&zW!={w)up5s+dzjLB8Mt)5@{!4iDPw zdVpY8`{V?uCiFP{2F#@|N{jMX6GGI}C3V?@#dH!as}FLg6jofu-&t7mPB>nZ6%l(} z@P{+upO3X(vcLL4(_d4nQVl~#xeE+{YM1R0h*@iBKT}iZy5W~HJ(<-A=D^JB2wsiV z>62YU`&X}uM0BXs7$!=!#ZsPu*}`)Y^e&Bs5DMyLg%?0tmE<0Pe7TxD63A+UU_lvN zkp#0qb|7qABOx#BiNntMqurlb%{|HZSYU%|wOyLqAc$sDi zML$12fYGeB+oM5f4AII^P{_7aV@M5CppXIUoR)7Rc}Awz9eek=GT{vPZ^O!*G%^ti z57M~W5xt=VpoI9Xa8I!U7T{;NsIDzq~CxHC_c@OW)fS~ zdeDRIG$|p=G6oC+2^363TfyUR{%bbiTf0Y`|OQM>mno11#tiYZ0`O z4Y_2Zz1hkx3moO<0G98Z$+vWcJYD^MVx%8l*si9=+5Z7N>Fwg2Dr|mVAUvW5pS0Hf z8TnYtONuYkO^MPIP5cdV6BEuaGt!HeVVKIAi$eonA>+o*N=kxYq0E7yJxL@J8w05_o<9*`;so!5?L-Yf<8&Hf?S~GsyB>k&Fp~^`&J_{6`L2&~9n}b$p8T@u2Ho9S_s9ni zc;x18s+t>@+@T^&fac0qewd#W6a5`{r=1)IRkpCG&drOCbJMlt;Srq_9wSL9*6RtJ z(RtlG*(BB(SK%$SrX27yQ#xF~GYIP8E0;4o>-Y&$dm}HIqA<{cxadxzX|(o5O>J%Q z)0DfovUMjuInSCHW;R5z|MEaY9?ZB*C!m)mq2APMBx8E1K*p|d#OBfXL`lMJCp#yv zG8tKYXK{P9XSaUqeADvm8iNk@`}wFFqtJ8iaGER*mXMH}G^*H@3WTXvQ*{qA!`9?T z)Bj?bBAWxV{HeleKfzCcX#3tZe8%@z{^yU)YqxjV*b)=rwp^rzaJP!3rlNkmbBiZs z>(M)?w-mFnZSf;j>5X&cfik$3@pU)$;R%k+#L{ctx75?)w5=l*Zt3ckvw50R$@ip{ zjSQ=3+9pGFjdinNJUz^%l*KW9%!U z{`YGEZ_ZmK9TNzCIvD#*SNCI)#;=AVoB$+|lK*;fo714(XGEe+t$SISF(PrB>-4-t zKu3s`R}rma*|aigq^0YsTBK{E9%EUT9bXnqNk`5uDlEkH)HH6yUf#1|DZ;(f(XlT7 zYl|P-6H{I4j~^c#_QG|G?7#v=lZ<_zrNLT(y{PVIAYtDu9Uf>xs#J2i1jz$Fa6^&w zgT3Cy5h2{LM~w$3lO*45$e(l)@HUc+KXNkD?)73s@tR*))*Chr!3pwi-aJetnlxbj zcJqJs^Z#imyjQ=odz6``Ui6Ip#(n84SO4@!1~DVsB412k_#ZyRub6zq1(IEck<c;&|s;hxu$sM*N{D0MmtiTg` z$-<=^KW_t1n66>@;!k_(KcCAWUaFUA9IN7c%=K7a zrCg_b#D2sSPhA-V)o@>{ics;|+|1H1cU&;HFr#O$9E{a+&(SL?n|uw4Gd!D;%~=_xIx^N zcbX{@2V5cF%Jq3-lfEm7XdITNNqP(#9GKFL4SZ2Q^Q>~0@?^X>e2?&7jGGGPR~&3Y zJFvny5y_F!Bd3XaryZjDo8Qo^A7Mvcjp)f49{(eU>Lo^4y`jU4ni5mfSH@0lWzAp{ zUB}dCB5Dht6=5_%S$-&*^y>jg7$0wyV&NbYM)& zv|>!p5d&&UaAt>^jfF#tq9xYa*ze^0k-B(l_`GRCZ;g(vQzI|tfX>fUd$oEjik9pl z-cdQt4lf6LVs!_z@4l({)2*<8UsZ2rURW&6sU}L$XYRF2czXBthR?q-Utybnb`qWb z$*3MD;j0`Iv$4LIE(oc4q3Q3>?KtpDw$a3Y43Jpv87qg;I5>DfAl>;UC*0GX3V$&A zzi?j54+OhzgNOb7Vs^$qOwo;ddvsq@rY9-qXo3|*II)TgIvZVIR3Ctq)KqHuAWY9& zEE5MiNT9ft^@ROsA|r%EB`+hRbl~;td8{&xgS>f4&^ve&_HFSbZ+gSX9?5&2y2Um- zfet7Wp~^ z*!$%yc=Y!w=%utD&%X2N{GH7EyuRSO%=Rx-UT5EVdBMx*G~qq-sRjZMkw+d8zw_1k z`(L)YmIHy&O>gU*A?^pR83`iFR%$&}GpkZ{8wshk?wu#F$i?o>TzLzm+8N#H5F*3q 
zq%zkXmp!(@f&lK~b8?vtxrmFtdIc zD*CLyIKZ+XLVQO@;dA;8Y(j4abK#>9Oj?GRN7>Q4V)l$H{{CC~Qu~uNz2Vz-osSL9`)o+97y3Z8TxVj{0nmule$@)xXk9FLI)MGsO0STTO&tvR@do_dJStw0dH&z5m^jZ%acADeaak~U4i)${B{N-E;5-qZAz6=11Qoq)(9_+{Y2=aC=&z)b5$TdVW|I+FeBSD5pCwRmHr+^&eSM@U zxF~aDW7Z#2kW@J9U%B{RM^6{0M*SzdpegB+tBi=^vmSp@#pgt-2}LQXRkn+gUP}el zsWEwoDA>`EYn0}gkb-|bAKM70JEfUh)tU_hU>227wM$S4?hnuyyxKyCNrU3iyKxwBr!LzKQpT-R=?a5LO@RKI^#;tCK88i0l zI>8zywQhZ#rK3eTg*nf^$DJm3UKw*5MRn5o(B13Q{NGdH>J4$mP5&!bu3TIg-(v^= zlb5+V9h#wInR0NU3bq3ai6|$Y0|A+n4HCuQ4ef9Xq}9bt3o8@0wnFXUR^Dz>{oy#ItsUZ1xq^ zR&8R)D>2;^hV>3C?p1b;efB~Q#6E#zKhAw1Yiv<8f##{G#j{7jXjou=o zy1-N9$szUD_hT*az)PF4O2KT^F zz?q}eOerfYfC)xMR> z$aGmQTg0R;sDU3NO-;H-cUrI%vGc6z%IT&(eHci0UXPUV#w&4m+;D=K6AjcU(eWo&iX%yU0M!ZnR*X9(i;;-YW$$-d~eqqudoSO4CbO4BnG+ z`?>b4!rH^5fSpe=_Uw2(%2|aQE>=hGLvMg7o>$!yek%ss4sFhd*v&pq5wcrKKiC(q z9vdC9?pI#VYOAtWFG~~ifA~u?yJ20XYYkHa4SjykLume@|6`MV=I-+3grkq^0%vA7H%z8s@h>F%Q|Sv*-xE>Xd3JQ1BQr&vKc;e{ARxI=y({mq_sP^X~e`4=1@u zwCo-A7oYWmXeTO>ECeVx@~uppOm`cf%q;`zn}zW(as!R^fTwK&6=-xY>-q7%8=+cS zB52J}RiDr;7X#c=z4%Rcd1he0zGC!)rW}9LuzbIb##U)^MmI4bl5B+Q9a>sWP5mRo zqY;4{RR>>Wyc9Ql6x!7C9PiBvX~~K8SxnQ4xyYVxkA_|j?1w9v#&4S)S1H*R#))lrgrmDry52gbsF+yv)k;x>;}F&-m9ku zO&Qqukj*!$wUr+bQT;xyt(bm~X(AWx40WYck|6vI5Wk2Y$Cfxj)aUX<3PWolmXFjN z@X5MSSN3;Nr*vyuCts}z+P@_!)S|_COE$6t!TT?%CEPyku1*aSTCR<*J+nR^1M4zq zTaEyeA{NMFR-K-#ji>(>EoFJ^K&cls%N~KF-xZnbKS$U5P*TQS45bcD%_hIX46v#X zGuoO=G)t+^jE*_|(Pkwi$GDa`Wpd`DdZNeuW_gmZ>h;e4k0j^k3XP1bZofSPK4*{o zj>9WJF|k>Hg+BE4AqOi_Ksjr)pE}8FYL+<`FKmBrel$IpKVVcqo>zwV=H`rF>fJRA z@<9&UXH^ZyqRq){XM;?OxSM8GcB9ZC>j9Kb>0^)8!`qZPRd=jw`Bz^W)mx4f{n{@q zZ7oG(YTX4oWTpL#TKsa}wXo5aZ5^=pgEz--?>AG)GB{J~uuv4S_A#X$jCi-$Z{{{Kuxj`cTR7W#TsPfU zd-(ICE`{na`ZlU`=nHTe(8PL^A-Py_lU^$*p~h*HpF-#4G<74w-6_0NE6v?cgGZgS zOQ7HBCx_~KpBP)HKuK~AY^(%rqCx(wKs2#n!_aWTVY|U6H|wT<30&0m`TlRZI8uZ< z&TC|J$8PQ#ypFr2DxtJ`K}_+NMNCFl&5Akb?6~YA(okUieL4c~8n?9-3|o?t85~X; z#U8PBC?i5}r);XifJSr zKaMf85$HHJq#HOEeYcw==>$^#TBxn1HjtI2*TZLRU*{XrQs;;?tz18zmTOhn`CQSj zg@;`g-$43Qh_PZku$_I+4vL*S0uwDqR$UHP<>t*t*h;{nl9k;LhRKgAZ$^&`nGzHD#clj3Pq08ur$K4I1+T<_7+Z7d44cgUS$&qduGs-KIL%_ZsvPZnw)vM2kx(pt1~Q4ogmAR>9Kujc5{FIAw%EUvzNMTUtk<$$smn>(R%xX z7RmF|JAs00Yi~-ItYHlxyYIamn4|efLXJQ@DK_hAgQO+6fh#sssJ28RgVny^M}Aq& z=19iOJG1^YLiIs$NV^HTSx!u-h@ARn^^+_qcp+X=XACV9cj^+fj2a5ux74fA!6%jZ zh7LBmNMHKfYKY3H`pd9l1}`)o0YYNF-Xq}pKiGQ@pr+ULTX>5F6#=&j2ng5^q*~}5 zks?Tk0HKM1^xjJn0TC4erAjYS1EB>7p$e!#kPZnDLKFzSNG}2M{c!KI&pzMX`~E#= z?wxPun=`|VgOHHlTb}oM*0a`nX;}vh@=_ZzbhicQLA6Bf*RAxJPTWC)8NC`UbC2?g zjk#};mL*;yUgg4T(5WCYVHG%++9L_%QTOi?y=3KfjnvaVgj~d_Ji5y7ixU${wW=?t zw#JP)vd`qkP5D_^sxrnRaV2gn=jMw_)UfgP!u2~3sB)ddUO-2OOA^j3_4jXxOc!$Q z?u}iV8>WhtUUw^VaB-fwZ7ZqlMo^%(KK?VAH%TvJA=n)(J<~0*4~BaiZJ~WOBt z&K=HL=4k`=CLqpsp6$RY*9VyW_h&G}o=*j(BzpCwM&fGCpL7{tr>rBD2`dvJ?(HXp zShwfc}c~7=LqU8Nx+7j-O6%9hwb2L|M z;P#Y5DBRQ7264%{-{gh^qiXr=qi)+wvnYG9l|A-uf@q~bv0uAMaBxHc&FDcw8B{PvG$}DQwD_`qdauf>V1qOET z31fWPZ9=c+&7S9};C=?vyw9WcL>@iDTF1{`E*Rp(ak8{NwD*%Byzg#(7CB5TlZaXvVupUAYn^+_!u93O}8 z<`I+L_+ZGSv!{}0GTPwa0^Zv4shWF)@WOOvFxcJ_%=cv|>Zl+>FLM)S1NOf@)RRZ+ z^$(8~51v-0o11<0Kr5Z{M zkBA!>3&rDPB|7%YD4*Iv{Wx;!iNrm)M}vnQWbd@k!8#`NgxSXk?#~m{q=Wpy?ci75 zP*SyK!>$t6*19Dl(W`TAQ0>w!2V704l+=@F^UhsY5%4M*6McR$kX;%RRyj~hfYPR& zDCA#VVK+9w)?AXsbP34^!^d`lDtkuEiKzn0Q4HCTYVP8I5o!|M&&<^5^`JYn7}J$p zzlWaCLJ{Y;-As^VTV0Hoi1p+o8FsRLxQnubF%}5amn@85Ul_UhZQo-K=f&Xb$kMGZ zk!oJPIf&wocH*gMXaqe&zQOiR9M>^D8gLl;rv`fpmZK}H_>*c>?~6!J&8g#~7pJhx zJ+m6&x=tOr*Xz5JS^AbzfGDf4()Y$R0711;ufd9T9p>5b{=_0$iC|S>J}Bie(bTBJ zv3;XWO}njsyw2atVSd{agVE88;cd}H7nqWRGPY#*uOcpIa-_WE;ZA_BCYd|AerBSv 
z?#HHDKc1#0d6TBvB{^X6>tgc1Jm4@}g&nN=XU;h``Z`7@W3bKMNX9_6`Ia=F6-3U7 zTm0NMoh7ADP>|`q-Mxh-a?S&91<@|b_WT8=FG$Pm{;Z2p9I@nm2bWs8qKGCG#rD9U z+^r*Ws0X_7F+_)pS1lfa=U!_tV=?Ave01b<5m-1U{=laHrTvN&Zf*_|wmH8T&%|eX zg;;z>(}HbX|FBB=Mok*qpY&iJDh%Mu_kOS7E|RA0LmMk&Dv8gcxeLq9T+3=6ceYtp zzJMdlM=JhIosX%MOFT|;gg;EC6PSB)1>O2NB7RQ4u;$SVeI_L1G16h?n#VO}0B<>f z!Jxv6WUkW~Y4UepyX?5-EZct1k$xu$MF(yE{KwX061xOnCo3tNAr@IWQj!iGV4EUn zPPKPQlV3?#S>gFluF-4CEbQ_+>o>H~9bB7hqd!)^KK8mIw__vZuJ&+_cP%R-yp#F>PEwl;tHe6tuOO!&aTT?v-ap467DuWVVM!9|8Z$y4unUz_ zqOvd@9@~vJPkbv2+_-Tk>{ijT2{;|*cW|lzE?-WCxU@4Z#H)N_`qxjsdl@P9o7DZB z_zPW){(JI1`SjBQZ}=s>sl(!!ZfM3n_nODdbsAfD1%E2rXu79{zY2okq*lWRl3%u4~WtL~w^yGn`}ONC_0C zPdCL8S5Hm%VC>tq=wzQ;)=9SB7)X^bxMbH6#ka<1$qdm*UcDg!SlmY?N0Qq_57u~wLMGUGXiPUR7Ze_JUO zGAh8>Fr57@f~%EiJf1PSY?j8uG_a#29PHk!0&FW{{a&tFi`u89A^NWS46(fGvY*&} zf0f(?;^Rb_^ImCzEL)L_Wr^%gYb2A~0EY;y)*q6IfkMp-KI`heDSfVCysIezDX1in z*CrPe+j&X*x01f=T$or1ai^@HL?7#R0&hhxlPqZLb8vPWa?pD!+O#CCXt-QmCXu2L zD#QA%R32je!HMuVq>=}^3AWP}r7n33vg?A+8X5mM@Jv^N-DhP4i(o(+h7vSKkDD1g z?O`!@W@hQ$d<{F^vfN14Fd*H6BAFMZUA3kJz&y&)QlR(MODnA;k)KLf^e?{$y0)*` z3gl5!j|T(C5`_(TrBmZ!eoJmPtj98e$%oDdnUk5uYw;PXBAOqa0)%xfTYCOV(Q{v`p(IJibG0Jm}JD9#QvIN3t> zUfJsxwzbLckIr>B?sOgJnAloz!4EVf>l|a>hrm=jpjt~rvp{}Cf!O=gUH1ny^>Cf^ z7{pMi(bSw!T>Hp73m=v4E;hncLS84p{m$tC0_g2!*M_=-CO%Zdb=j=d0v&Yp-u#+o z&6Ne*ZS|Yq-eq0XjFm?NF7AY- zwPw1n)Fc732EOI)oo4m@y%Z1~B3FJ|t!8P^j-$4qI%I|Lg2>>>K5~QQ9dfojS+pzu z30P4mzrwy1<))q}V@$^(&d8#10Vh3hllNzud)#?c%nY%GsrqW{7a^!G+k0x_X0Cnu z79H6cDWN@S0lq1Eg&=hRF$wFd_Xso8T6*8j*rrSWI&NWi!(`~H(eaEdc4fv$e2MO| z5ipzN;gGFXo?8rq2Jl5Sob#KV@J6*tHOFL*TdxqWA7vKhZbdw@6|D?s{WW$zH;OU{OoR^8kW6}Cyr5~e3OhD^Je%G3|ZXx43|xlf`AXUKa-p0 zDqza9Kpd4W`i4sqklQ1c)W>WZF3@Se=E7u*USrv=oJ(h1-No1Tn{rF^zVuq^0o{na zI4(kH%Se$8bp&qyvFdBfbUqUn#TJ9h>vSKSIv25}RD}F)#R}P@Tg0o)IWXuu)Vi-t zrbe|4o&dIh=y;0QPj%k4Pg?0|b!lufBGS%3thEV>dw3c{DAb)KU<*PH0he-n!VTsg z<2==RGi_%e7thhwQ1DYKyhh>>>?!@!nnP!N{WNA{aaeus`a3!i-&^`{y$AjTe?$@% z6Omk*H9Gi6Hs0;9B;$~weaA=(;QbPt5`DXeLTpc5;UUh;Hx-0lSn~G= zyHl@5wK?qWsbbRttm-BQN|x_|MMCq5X;9Z9q0TPv?oMLEdalVJ{5hwc_KDaVQLOwT zsmMyAgVs_JX!IdU@pc+_hP<*~NUOUHot6Wa&h+QUp?Fy9Gjaj)HnRuHcFnQvqz)!O(@ z%jBq3u`4jY`Ie!@hbG+WT3uaRqJmO4tHcnU|dBG zWE(dCW>$av{u733=xx7fFK*7LhyUPBwFw-2jH?Viodg6kAIK5w`)1b+XNR}IMz!8d*N#UfX=Wru)p2p}S zbd1b=J+qQk-P-58%3a%6wHPwS=@K(cdUcMGnw10}rg78GW8(^m=5ez1;`|p3&#)Ub zNuNLd8DsoQYr6m7XQ#n)vc}cd#X^Uc1D>CW9y66d;}6!8n&>vAP_Cd2xW3o!;9Uvz z)i3vSktfK<8w+Ql000m(S{Ks9vn8p@5zx8)$&$YMv|~bv$y?Ei5u@)cx-%L6EzO^X zY3CE#eX~wuhTkF1C8;YwmYR?Ixy=cQ!>(K!qBWBZ@u*9b7iU$lM4SW^B;@R7eumL% zbLjH>9uKaIh5G_dS^~x#20-tFY|02O&W5=xxM>?7ZBwpLoh6PXn%UM8UaY!ZoJ(Q& znFJ^HEbr$OmONrFS z42U`Feskz9Nt82b{Psy9Hd#NNmKz+$2zh(zisC#N+L`ydR9o{C>|CHDZkxd+q_#3D zS|j~<5d^sHB1Fo(@20(zsjTQM>g#_hh6*#z3V< z0JErLspJ7eD&1zsGN>({t(G*|-tTM*0`ZsOLcSf{ZQFH{B&pGU-Q|C^a-TVtRV3eK z0la|ej1i|{>UjCpL}*)Bg?Gj%V`Y69S=RcklMa(;^F7JAL&RwT&jEo4^>3fjbaI#t zafeYR72FTTIcXuzZ%rjk*V}dTEMZ(lcg9~x?tX$>gJ&cTlJFOS#sv1KXeL1q0e%rv zul_4~Oe)r1H$*trUAcsP>7==U@lCb(lAE3tm-PV50+|Z5m3X2?CGQpS^lVC#tv-() z&$L@R?G^v^ggp8fw_9O8HxqT1XrVzN4{U2YL`S_a>Ubwl+^}WuNJB`Nr~TK?I{=>V zBMC%@LJ_ApMs;_SK`F^2G_`|0s&4X<9!n{8qSG|uzTC2g`ZHVnEdFfy^8Cq51G$tl zBYp_z6MT_) z?v%XD0Xc{hav4!!XOSLt!c=68;&^dwJhZ#g8Phl+Nl7v3Eqt>6F3^+?PHufvtG5C6q9dl_o;STXws&C4O%idIhL|x zXA3v3HMa|KhXi=XXgGmY?;_;v`|Cf8CRbtMH;PSaZ_X0S%l$8Xjta&9RD7cj0^#&#ndzBYVa#<&&2*HOh{K1RYJM4R8(rOlz}4$U zWftLl*bI{_5Rh6eS?cnPc__pDtVddPT(exOIknRdYQ~R|_y#O2A`8J0`B|VdtQq~9 zV`GSLn$-P+LDYHBmDQ-X_TpT%@_Zko?HjqP>ZP1=FLlYXC>pH0nSOx#7(FVs2=SMY zo;4D8y+u6!2=-7;SjfaS|g+Yn4Q>t@iTH%=Y+`3)riM7|s zr|%LPvPOtJguy}sWtM@$`OkW;tFf14%`&>@$HQRnS=8*vWw`uZe}PJS?3=x7t)oI8 
zHvJWAZ)hnN9>?OITE%OGI$BsJQw`->R5mgJ(;Rf_Cwiktjuq2npTQVC%{Gq`5IxqB z>pqI2$bnmdfc2x>!0s9VhFr(*yccYU=EwVR;xBhfk+4!w`P% z$NM?>SLb5`0-G<|4QIRM&wCl8YG0{m(%Kc3z8S3k)hEX-DtcMMW6Z$L;5y$)2dii~OooY{?Az^((H$Dj^cxiw(@EUi272kAXfX_fn&peYhafvjAzZIuoDRA&{wY}^UhczZtNwzK0l2|C_^n6Zd`_I)TQ--` zozqV1@0o_xuYwB5nMZ#nBLXU|pf5~qGZ`jRYaS;~UVfwUG{fX71NO>4_^SWI4vEzR ze*XcfnuF?OY@!_C?4U+Ksn;hZ`;-ps_Z<1jA(xw?IUC!t=g`LdZ_#$Mfuk;gDop)< z_=x}do{55TS^{e&S8RUr0e^e-8-O8fa{7|U&koCvS6MXx+2)tV=P5rs4nMela;JfG z>Za?^v!7u8{qZYrcBqb#?oe3Vfhqhq9_%JS5YG8vasDUg>9^1K-^u!8*q{D)vi`Z= z{=2jOw2l6|v;GJ}{|$rx+#@IdduII+JpX%U{pqOw|2n}CRl1yK0Vx1i+Rr>X&ZU7V z=(o8yZ}hk7gYOgPY_%GT2lCCNJ#{uIFBT~b{fDV!-|4flyok&HlI#ZQ&`60uD=O?q zt`Bq{(*B0$HZnJg{+5`u?RM%lhp=8vg~NG?0?tBQdxBbg7#2wd6c;K6KpIx_*o}+> z>fZB8v~CVK$1#CIoJ%BrA`fyG!25x=`8x%}Mz6dg!`hWR+UTzf0X~iTH9q#!RbO^i zS;Ye<4TJ0NI56O(k;P2!f5=eU_<{hRlMa5==v9DY8*BendqWp4@9RSs>`sJH>Mw0{ zIlX*7iFFy9T5P}irx5?AU48lKhbZl@OY)E$(~ha70Ul7%=&=!i?7^e6wa znGhxYwQCz=K0IEx8ad#lAW21v%_E}>&04++oEZ^i$5{gd3b#$eR`)Ih+v&Dbn#bPpB zlwWy?0MA_%l7rSg8(ZtTlC`%)DzF<%Cj2c#0)0qzVABPpozoZ)fR(bd$|}M23&u~> zOD;eCVpLjf$khWdm1m17!tHh!$$TG|Y_FgRXus7?I3TUMa0s8q@#yhk-OMPiRVjnr zhPH|P-#c3Q!#%@I_7C8DBm})~+Cz<&-8&YOXAAP9eg(4Pf>&cIS^|}CDXrlElEEA7 z{?gPi{I{$$mqxD#?vg4|@m9XxUnLTXZgw$;qIBYm3@1udYf=EN4SOwxy#Ok0#nkxz zc*j`cK6;W(>QZAt&tl)EEEZZ6a&rn3u*<9je}bGj!`#CKdc-~4U_L+16TZF{ zcYl$xyQ8a9@EGW8QtpyW#S!{*hYwFVSY?iAe`$jD*FV@7Ea8$)1enup=UK)o?p+|x zn;j!X>oB++l}8U~9B>E%&yF~W;qn4$G@TSONXkQK2HhfjM5(=F3}!otL~FaaHMn1yg&=R%9%k z<;`01e#m1BmP}bn!wRsD#}%vpPSU#Pd3>+v2vF(xUJ`Ojxu&UJr#VSpCfG2E%mJ)g zM$aUnC)EJwyDv4HfqDp_?oZA>lxRJzp$d3|_GyJ^0F=WR;basObF8G-)a!*I6mcVN zA*}P6??Lk?t7N0}D^x(SeTZ{uYFvCD^*(8-XiC6LRjarHmyls%FHebLl1G089V4Bq zEoJE_In#U+%!VxKp_6kXuem6^wzXdkup(b;Iqq&M;rEul01V zCJlCixwxsd=E~B0u*{;7FE#*&C5f3=1 z+jGMNEQvK;Tw;|m^!Icuq0A7j9zvFerdh;eB-M|fQ~##D8^n?GG+p>Ix50tn$_Knl zOeQ3tW9%q3sUwG%bd#3bh?j?(DNy85y4xOUk)AYJ(9bDxs84&`F(V0pq=Wfp>g+gG z_syNeN*53!E|gg8+F^H`1V}J)#3xQW*REaPJ$5wAsvpaCVA^3Nt_`$WT6^Grx^_7eWp-to z99;`#mT?VqpO9k8a%@ z>cq7^IVhVbccIAd%hmekh^PTPWeWx_ZX1%4q>m$O;X|ZBYr>q{S)Urt8oQSZ8jE<9 zH6>cY+E{TiL=L+9MZ8_w)6jddJwpO3Asd-W6BLTKCg}7tWG;U~q&u+3=}@4r^Zf|K z8V&kH4(vsR?mA~KH>F<`{ajB>U2Q75hwrQZb+UwhdoCOvXBBEiXw_feVT8oA)ZGgk!G}}ZAnkL&Ub>vPFEdAbm!feGOfbRyjO(nd$WuNb9j~`J#&p6@> z36+M*ZaW4UUPR{q(BhZ^P&dz~*X$B1{@zP7&&xrwp6Pcf4rq8(LAkw%&$ch(tS-uX zF*~euLi53$PB1|Dm<<~KEl$ckKb;M%iDhR5HrM=E>E^d-b4>hV?YT5`=q49Y6yt=i z2`CA?H64DB3>;PhsUz*%waf1EJEAfvruoYa(zWd`f*_o|ZZwOAuL0W5tDuhpe;tgU z%8CE#JJaxMUxPIbf3b%S$gYW%H`jA&0o|rF4V!wXC2$AZCRv^(8zUS$0Jl4T>JCf1 zC^tZ1>{hMY{KjX%U6_6JI=6ui`b)ieRz?bgTfpRjHux-0YwA!EKi;I_vnDQ9j9YYL zqI=vAPgu=4Dc6l%(INbXgxvgtD4er{F%mZYNzb;%}y&T?5)Ek6)k}6`u10n%xD6ZsCAPVVBW1>0IKrZRF&frE~*81ehRC{?7PH z6nNvTwWJNXM$GlQ>oX!~k6y7Fv_dqm$?h*`qtf;7M3yXoMxYDJHT5ufhReYiWu0a? 
zlL4}r`T_xa*lGxHJfrKUtIE6vGYIwwrD+44nkRoh(0d3#!*?SxqAS$E8@n$WG*L>j z7V?43Hx5_=10%lQk+C=rqm^k;@?}mhec5<|!~9y^Yt5mL<{?ofznXpKavU#dYQ>!t zoH#GCg8Mq*@GN2FRPwgN_Z8vhj_qgad&1k>EZ?mEc)RlDwciQaWvRsg5wTIPUK8Xp zdbQRox2_MGy9k>~;SN75d5cC=KZA^5vtHFYoGR59FF}c*DG{l%mXg$u)h=1~C7%{bvv6%C0R%FR_iFGZ8(v z-Z7i@oi*&?`yd3JJZm?6KhJY8W1z7A$#b5x9T?vNG-m^?%!}T@zH-L$-P(97^J0AZ zSyg$c2njdr!Q`nyBsmkmD*qrWu2oWU3bE*=uro8)Li{&eJIHkG(SQ$wqq3tsN{`7Q*Oj_)(qM+eH^uf_ zdeG{;qI)@Dbr)##bOa2VW7>c;7i1pI%bS1FWgr1bmSLPoKF%F5Gw~QVHriv^vl%g7 z)5;O^dvnsNMy^I4ZBzNs@7jJ?N+q#zv#J3nk^bhx*xfy1qwk25Wac*tKtMtCK(*IB z5*n6kCU(iiKq@g4yJ3?P2StB4X!4(#(e{Ik04c1IWGTX-x2}M!qtJ@wC&=EHHQJ6S z>ZOQB!*kBZ(livjgEv~?_qHO|&x4x(-R1hTtDh_bLKgr^SBbk|YP(G46J%J6JTm1o z&%P9tFtzADx1RXO3*7kQSk6)OY4NTS?4Sg$-Z%JeHx;Z{E?t3LBe-J~IHG7j)>Gjg=W3sJvUAj8U z{8F_296U)FUi*q2)Z~O3X++>YxHKRUf)P=j)cZvzx*X>;$#UyCK2NR(o;}`*`jCU^A~h^Pl3tAIT5I0aybz1bp8wd}@>ugRWp6>odN_EpF;oIZ?n~x8ov7qS z=vcip`_+5=gsl?F>$l2If8N|8 zQ}(=6GyB7{I09hS6gjrQ0L?9SUa=4Hgn-F@!Gvm;$HTHOYJHqb%*$skCN~grk>*BU zqRGcd_Zw$Do#4SAL{o_el#HzO(o6qmW9E-VsBW~qy^rfnO~mKOPAFPqNza&RYFwkI3A6}!eQ@L%}U$%(}chw z5FBq91H%HVxbM@^u|<%9{*n)i)oNkaPFK#x3h$^o?xu>U3*IQC&+XPfyZ~&!d_w=A z(vNafvY2c2c9mS(YBvDq+~;LI!M`n6oqYu9jWBL&u`CY#KE>Aux1Vp(9}KqK0g8?E zf03wYRAT=rQIVCgcwGp#ZQY))?XNszCqEJfXcY4xT1&sDcj8RniRroiA6Bro!z# zw%cx_BB0Ip|Kf!G*^u?WKd^~hbsHnEuXTEfMB{~oaA+rogs797$dr#P2(c>fHIR5F z#RZ)|b(mQeb0b(A>jA+oc*r)JCcS>OsnuPKV(WPJkk>HH#ORDP^3bp!Yyt}x)-oTl z;WsP_eZ6XOtE(YHw%S~CKNxC*+nPn%2wz*5Z+_TCl-+sb*jGB>O>33bciL;(Gd}H5 ziz2`8@1EE7A`#ITA_63#qql?cdtDEM064&QebXqevoEnkck46i9H2-!OnU&iR3Pku z3t8|npySZdH=7^LL_EB7`(uR7ZvvZj1+xd5pYZ^}+j}-h8M3;2~#ha7oBuC?@6vuQ9-L3wHO$|Aoe;2Xj4sQv)E|0{XUTP01-dOo+le zTy3rF=qtbKJ6scSXL&xSY-5FH^GZp*7`Bh%D4`AE_eMB_;?FGN@ce>?@CW^IFJp46 zLO5d8w)`x%wnE&Op{wXK1NQtP^~IH}Y|%)|$REOe|3&%;>E|;2vj*XJr%?n^TWyoA z3_<9%AM5eE<*zD0UCS`a$8F=HzObJO5pN!OzePMx)4z;!F;wq$u{9!-xj5oScA=j~ zVD-BoNB^DtT{pG>*>n0)NI|k5yCg><5ihLcaB$+C|^z2M9#SFM|(*hyc@N zO}4R=3}z>}fZMow*Wu8&d4dUuSk4aIc9=&8mZ&cnGO_#!zr=h;EAQ0O42T0a>}m$} zdCtgFJ-!Sojz?7MQty{!I~B_`ei4F0RV5;~$%^7>+d%d5ZYT$-IF^IFW+a)f2+s|? 
zRj4J>flUzO-s%gud+x?^QUCo;;P~U=Mg{KQ*a52Vmvme|OcE-15-NuQz9Sf6tIgO+ z?|{HpDLm;U^UPWtitSNmf*V_T1^q&t5X_>(sbS*Y-UGx%oh!4D7^iJR1+V$aW7a)^ zhXSmQHBA+LND%TS@<3`M;T_A_r_dY&$gaz##eIskc6ts|SY?mKGjv`G(Bydle1DNcMm!|RtHYM1T~$`;INky*Tz8-zjJ13T zgfOXJSbxJ4+T?;ffpzUqS^r{H0O%Yg`(gcpMnkgSlY}tdx+P%0?>lPoYGX|;&g8~9 zvIse>0P<@~!<%og`YUiKUj^ofy@RQ%d zMBbj|KQu-w6TGud zvfK&wQdrn6YdAl-ziC%w+KVTD&+=Oz%sc)5B7Sh5Mt^gStSX$A75oguIhpPWH*~1o znr_G6jWNKe2^Bvw{gJqfEE?V#ECS?uUjeqbeuwReML%^x8(blmLzQEU<`hLMv%88O z5LTZU#qtjg>f4pwDFN+#l<};3!XacVEbZ_zP0YCkz&N!82-BfnfJ$$nsuSUK>IrLO zW%(KZnxogZEnwA4|Fg#EN5v`ND{DEuXz()Jr_|-%zAPBhl~kX%(-OONdQoQtS~TYz zWh`lLw-%{`@^->)0X3FUf%*O63mGg_?G^bWv>y{a0}H@mwiUjWLwDx9^Z>`<}s zq|Y!FvCNpbL3(VIcenE+3dp2`sZ_ivi7e6h3=6bXXRaMWuPn=OC6fYOOtq@bF@d3; zDDIkfzY46eeOUx<6ek^sdRV5jn`WDD32?GEn~xpzFg$tzPd`V0eJN5qlItYK=sBs; z9L6Pr*9MrPtFOB<>=ZkSZVfnae2G`sQGR^wCQvs1E>!|>u)br}S#w-Bv@n3oJ|RCl zo45~1l|HJ9Hxzk%JK-UOE7CTF?ypyBfycfD`kAUx;wltLmUbgB$iV~A*ZUGTc$E#K zfrJ@QYJ1t^z&9dTrZh-~O{Wr3R*4G(;_#5CCa#en1j%N1+?gMU(T9P6NRKfcppPY{ zuZ3Pn5xdzfEoFAtx(W^|(I6W}vI~|HTnq~{U_(i^kBT9MjD2-(RYDH26J{#GM~1?P zy1Cu)Enxl|e*ri+0|pe22VE$8Sn8#H`KuvwV6Ub0%H5R8qq|BJVF>Xns&6xIyi{FvUVJbPA~eVInw+M0ZSA<{i~us znEU}&3*3haOq%rJJuH_-nuf}V5ye##$cCJ1-^z=_P7ec+cU$&b1;Rx-4#^uOn#l|S z<2NpOGM7{UDy)4=5k0j)Y>9Czo0EZa+Kv6+R9M!55}r{%`jj7NnN&o%eHt5>W^n%npwUu~zH|hKMtfypyWPQ_mw_UMOcr5O&8jwtukY@+>6zE45!w=6e1bR z5u_(C9zg3q_06p;@zOr-4J%+}G5hWShm4Xe+oo>p7Ms~jyK_H8410$k2qVE^Fz0UR zJ7(?xz5J8N6GWzg__|hYeFKwMQ<0n{<0$`Rc)W-2QTbUot#OsI(t&jUo}}EO$npRmys@e)6JZipKxe1>1Hz z31Hsy!wBSSW~rXkKsc;)hxiJ}&r=0V(N$*+_4Y5!4b-|32_Xd0qf8|x*Ub4%XPtia zdpus>m*sYCXR^@6s`EL2Xw@g;Wp1u}u7U^JnZ#RCzA9rW2QTqY>18Z}1{{3%zP8LO zZQuT|cJ<#xApcn(?c{y6v`LL2eAuUaE8fjnW|1I2=y9xRajM-y&$-oZEovHi@ePR% zyHM`h$WBH(!mz}PM`)5a2B-`Rz;QfrmPc~dm;^sR>Bd)y8pQLNpRb*NDkE)W*eJ8K z`2|1)=ccmq%%p)9@SX)GpiHLE=whyJ30<|!#vB*ThDUfrfhKuB?{gx}pdW5{B{2D0tWd1Io`nS{& z;g8qo5O*_RUVqnZq&T^O6M!|QOv)NdN%$;gm-UV)nfuM_{y=Ik6)sV$f*=G zZP3ZD1>`B2bm-ssHy@(pE)z0RAia}^Em_-_9VXsAH3i35!8$z zn#_0uSA1lbgKGI7)D|rp2fOh1X&RpWKLKC=bMgEW4PPK{E1CyO(`ckc>nB%XBoLQ< zrW|)eeS@J3sKXOQ+&p(rP}C{qJPkKz(kU*n5TKqVEvNgob?P9Yc)A_mF@vVudyO6} zSU_3BvW#WWzfjP_R?ix*Umh&yFc#= zpCpRWi3Xmhv8*8Nt`pyzCtvoJI-8cQe@`DvVI)7rR2i-1MU>-TQdu5^TiAs>@_yYX z0-MvHcL2PZR)?E8Go6>2@2CL&fsUHdZJH?^E+Su=Ih`aaRXK`B6q19TbF;BHy?KzZ^9Ad>w`bQ%f>D~4x+ z1t*L@l_U@;+!mlF%IgWB`4vNQ5V;_6yF+DRIDsqP;laSd+N-xAWQQktNPG%xp}KDO z;9*&-0K$qYr!yyZAwP5(pf9A*sMx!~C2)C*rVY^k2wKdJ^?9eHzG0e%_-n1FKT;Vv zc{a@52D~@a5(LmU*%sID+~&5KsQQKIQ;4g5=1H46g@lzjm#s#ChCARvLoref8-25t zO6KWi^9ws`cU;=<4A=X>`0PxPN?vvV4bE`5uXquU!4d+jr>1FWye24RFRrQ#RU)C= z5(^_?C7V}QNItw)fO(|*>ZF~|Xi*VZ;Hy}m9EVY(Q(hq2%5QSw1mZc*!ZT}WbM0Ol>+Nqga(z>kK4Thw_KJ#O+oAR020CCm|9i;T=9EqdRKvz>9{Z2pu?=_?Oq7SK||bY~)-P|N2p_!X%r{X6FN` z(akI00z4YtgzSENIN;n{AWv(>eP8%IuEo(Jq`>ML5|^Y4xgTC4xc$$ zJ?gDlW)x@?h029Um{YWqSw|QfgELI(PNM#Xgb|cp{1lkmEWTzqOlCKg zu(v?Smvu`oc)STu5LD9_jCy>f=FmBWJpStJ&^|x6L7@_@@xx&OyL8Z7XCUdhxn*hD zd${p0K%Ql7aXV{u`TxVh(aT=}h?e(%*E%+B+)6|sOGn4*j;{7jn?G2~jhM{rZE^DY ztRG80bVkjQ2LkQhA&%m45^A_@zdL%oY|i@7E(XaukkG4eLbX3-VTdNEF+Imh^iOKX zeOX3M+T5Mi)A5LCf-PbjXEGgKbyIHtiQo+p^K~6WmQUT+?8ckXAM|1YJ$8z|Sea+7 zK4~pEwH0W#-kA?q)G?mUfi5+$ z?%||f?PM9ADEVGlb+AbPM!E`AOqkRetrBR68=SaxAU67&yFf5T`%o+qH7nSyc$k z_iFD_rWepPyZ*OuCbax_^5?t-&@<}l>T;)Rp&lzv%&G~Mp)PYeY;cI1L*MI+Boh#Uh#2Os=rSz6_5_@$#l$|wcn%y#!S<-&*FD%MA%RqaCUCi@-mz@{dGg*J6&;V7X zfr4|qu>bT%7chT}+5Dmo83*Uo`ftotuf~ex2FK)0Zm&?GGjzS4fRSH~Akqn7u=8C? 
zrc-qFVnkKilXLlt1jA6xy`t9EA~W`&PZa8Hkb`e_`ogr`S9upp%h23d9KrluuepQn znyfw0wzxyP4%bJmtiuAj-kafqt*Tw_d{)~a;z zJ?$QAL!b$tJJolKy5UxxTIEIv6y)~YjJFSn+LfbS+aLK!UpP$ME;0b37!+%sqb0Me zoMn4a^zP5aW`%_m%t>Hs%v+bekC~dnmYruamDGJW$C4zlo|{E1=;|D??Ec#MEZc>< z)FQ3?tP2{|PI~J{Xed>@KW#x&kY5nezlyd>h0e%NCY2~RSRPX{m`0s>-O^tSF{d57 z!s;{U*W%Za>MQhG3&>N8M|PXCoZ`d7X$B?v0sapX-pU;Ooy}F1{-F z$MdsYXu)YlchsRp%v4woHW6^_Olc;C9jRK2q3}H*uQwC*xsM8@wjJ96LdM^#?b;Jm z2CO!Yqu;A-9ot*)qJ7OIolqaWzkN*nD&vog!WyQ1N^WQz|6oBtB!mTH$BcTRc0XZ7 zwkufs@8_3!hemo*EFu@jn z_iH@cq_#<7UpaejA-{Ce{jjjyAh-5-BtymU;?u~E3ivh9`m(D`BNvc_3+ zm&l{L#phSBuUYq$MJJ3V5<5}}=5pW&Bie1Ls_1cxIAQ<%A!%Jx=|#5QW`=pvW{20| zxCy?nafg*#3y=w%hJB(2>1h6$s2Mq4#g26CoT-A^7l*<_jJNtBB_og-)ai(Ts|QOl|e~0Z)~na(Dfg=mUFRzrXCV z!-!J82jno7Rj8kK=n~G?R;UBoFkp5DnX>8<4(N+Ox=I6ZNgwZHJ2}(5e!liaWR!_klDq9YiUGJl+*BShOt{&I)gP z8%AMtYRq(~CWa8hDL$VUeQwW}gumbPY~x1wCt0kwr9B&_c-+WSLJ#qa)a(I<5T~zL zmx?cMT1k11Lxq@c!47+3xu>n(TTL?&C^LVO&_}-`ddjzT#FMX28yqxU0#*mkj$|;A zO@NX^o8x1$(Ghh&cJdK@mo-F=Czvd!T7=dAmaLi-qw`i4seZq>58m+1SU8X3k*${T zMTR2cbeWX5qCD9vsSVck+glp1CtMGLI>JWxTB@G&+DDofjVO(6K`k%>SZ*WVCgodk zzd__Q^QIT>X#r}OvAyShs)+_7_|(c)n&p)0QxQ;Hbw2po>3jrX4Fou9r2Nnav!Nyb zciheaam!4zeV?Fvi+1~lzrlu1-Kcl$QgiEL!^fI@_#H9w;^Ne$RCUnNSJ`sFn#$s zZTerD=h_1TMcUvY;22|87WH?(j-)7eq-Zro`~8lJuTXb@QHj>DZ^M1TD_LLtNJ&4b zZd3u1jG#X8M?FOMdkm7b<6*#d$=>Y-Cq53hv51djfdyJyE! zPB?aahbAK)jibQexNKGhu^g|{EgXl2MQ`Wgtq^tVWEa!?+#Ig{_g0qUr|$$t6!b@E z()U-+wCatnt-ZMdrue5oUhOU05|@pClNdyih$CKtljW}~yj;2+hy*Bd%& z&7OFHkkGU`^t7FY8FIY))tk$Va$5=5&y?kCQh5@AV<1z+vZ`MzR0KaY{!v*Wp06IF zEGMBEh=RjPhKd!0<9*hi6?>Gs+8tok6cFU~do&A61krXuOyv_t48g_f!Xv!HtHfwP z)Ata96kN5IIoTMfpLNmby5|P zHFGXnJeW`WAoYXeCDhy{Zl{HPLnD?7g}&?+no0%gL(9-qv*Y_x^ZRKMP(Guj>!w zj*2Y|zKO$`s#*sQ#N{^p)E*EuQtTLA;nCy4Tv*x;(vG5t_uo5cXh$TMpDRT_{SAn` z6#GS^k1c;L4;H^62^wakx#sQnJJzDQm;bD$WFTO!SD2`MLGe3czOD=ZxrHn685luV zi_H~n-iV3?{euYGFwMumR=nI_k~l0ehl@EU43@5GvGfnM4Pbu@Sm4h!U|m(I1a~r% z$S0dV7aK^5o==ZF#Z2E8A`qSmV3f0oa4&QU(a{0;oWk?RHF`=X$C48WCsMoH=AFc~ zf`pBmMVA$z#w6QdUYH}Th)e*q5{3pyACxj@;=d@_o~?AKw5#-5OB~Jm{^V6QfqldI z=khVQo=DN+c}L4m+r&rg)?Lx&EeHt>$!~p7sx+fm<89UV34jn?&OdLK`=cmU2J@>D zGqUuM=$8af>cK2i!FLQ z=ws=vpX*6QWkKcT2i0vp=i*V<`%?zcy^LJ|C;_Of2H^dkt_6$3g4WxNiSt=DInXRB zD{ppC(0m-MXb0_{0S5C~$H;1vtfL~>di^Wp!+)EPkp65EA$G1|N=55ac8=qBJQQs> z$5;>h`-2VbStYv#)J7^4PU@du$=IO>5M4^DZn*woB<0B~>35$*o1cGbO8K`^jg;y+ zXG!bEXx4K4eQwJHJm}|wId*q3c^$pzHk+W=^M|9>N<<-AUX6#HQ{>NshEM(&D*t9v zWC7Ur=n)v!(*M^>`AkIh%Xi70$^PCR>uv&X`8jhi!vDk~DK7>vu(1djF8L2}4kZ4! zH_Wq{sGp@YidkrS=&1h_!zXF%&!5OabtlzE5m`_)<(hHvFH`@MtN7oavqAUU=&5+3 zw=eb&bDRJBYySN|Zd8D~k*3j*;veJjzx%SI3xUDrq(TKeLfQYL*RLc1-zqqiP56&U z`QNYgKYxFhN&ef|h)>^|@&D-cpEQB*&Yi07?!Wtp{}xjJ@ig}^;Nk&K>FVnLc;J8d zCm0zR+f8_L`lo^0|NOZB_5S|98~^`q{C|yz|2|&+*Q5WpSNuC9(dI{tCj>PEvId;J Qe*r%-5(?rKq96SJKVe@tY5)KL literal 0 HcmV?d00001 diff --git a/java_demo.md b/java_demo.md new file mode 100644 index 0000000000..5cfaf9ba60 --- /dev/null +++ b/java_demo.md @@ -0,0 +1,112 @@ + +* [Java Android Demo](#java-android-demo) + * [编译](#编译) + * [准备 demo 需要的其他文件](#准备-demo-需要的其他文件) + * [脚本方法](#脚本方法) + * [手动拷贝方法](#手动拷贝方法) + * [把 .so 动态库和 .jar 拷贝进安卓demo程序:](#把-so-动态库和-jar-拷贝进安卓demo程序) + * [把demo使用到的模型文件拷贝进安卓程序:](#把demo使用到的模型文件拷贝进安卓程序) + * [运行 Android 程序结果](#运行-android-程序结果) + + + + + +# Java Android Demo +本节中,Java demo 完整代码位于 [demo/java](https://github.com/PaddlePaddle/Paddle-Lite/tree/develop/lite/demo/java) 。 + +要编译和跑起Android demo 程序 PaddlePredictor,你需要准备: + +1. 一台能运行安卓程序的安卓手机 +2. 
## Prepare the other files the demo needs

Besides its code, the demo needs the JNI .so library (`libpaddle_lite_jni.so` from the section above), the Java .jar package (`PaddlePredictor.jar` above), and the model files, all placed into the Android project. We provide an automated script and a manual copy procedure; choose whichever you prefer:

### Script method

Enter `build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/demo/java/android`, where we provide a script, `prepare_demo.bash`. It takes one argument: the name of the architecture folder that the .so you want to copy belongs to.

For example, run

```
bash prepare_demo.bash arm8
```

The script downloads and unpacks the model files, copies the .jar into the demo, and copies the generated .so into `PaddlePredictor/app/src/main/jniLibs/<architecture folder>`; in this example, arm8 is the architecture folder. Note: a demo built this way runs correctly on armv8 phones. For it to also run on other phone architectures (such as armv7), you need to add those architectures as well.

### Manual copy method

Next we describe the manual copy. If you used the script, you can skip the manual steps below.

### Copy the .so library and .jar into the Android demo

1. Load PaddlePredictor into Android Studio.
2. Copy `libpaddle_lite_jni.so` into `PaddlePredictor/app/src/main/jniLibs/<architecture folder>`; for example, the arm8 folder must contain that .so file.
3. Copy `PaddlePredictor.jar` into `PaddlePredictor/app/libs`.

### Copy the model files used by the demo into the Android app

Download our five model files and unpack them into `PaddlePredictor/app/src/main/assets`.
The model files to copy and their download URLs:

```
inception_v4_simple_opt.nb http://paddle-inference-dist.bj.bcebos.com/inception_v4_simple_opt.nb.tar.gz
lite_naive_model_opt.nb http://paddle-inference-dist.bj.bcebos.com/lite_naive_model_opt.nb.tar.gz
mobilenet_v1_opt.nb http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1_opt.nb.tar.gz
mobilenet_v2_relu_opt.nb http://paddle-inference-dist.bj.bcebos.com/mobilenet_v2_relu_opt.nb.tar.gz
resnet50_opt.nb http://paddle-inference-dist.bj.bcebos.com/resnet50_opt.nb.tar.gz
```

After downloading, the assets folder must contain the five unpacked model folders listed above; the demo does not need to keep the original .tar.gz archives.

Note: input models must be stored in naive buffer format. You can convert a fluid model to naive buffer format with the [**Model Optimize Tool**](./model_optimize_tool).

## Running the Android demo

With the preparation above done, you can build, install, and run the Android demo. When you run the PaddlePredictor app, it takes roughly 10 seconds, then prints something like:

```
lite_naive_model output: 50.213173, -28.872887
expected: 50.2132, -28.8729

inception_v4_simple test:true
time: xxx ms

resnet50 test:true
time: xxx ms

mobilenet_v1 test:true
time: xxx ms

mobilenet_v2 test:true
time: xxx ms
```

The demo runs our five models. For the first model it prints the actual first two output values, with the expected values on the next line; the error between them should be below 0.001. For the remaining four models, `test:true` means the model's output passed the checks the demo applies to it, and `time` is how long that test took.
diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt deleted file mode 100644 index 937781293a..0000000000 --- a/lite/CMakeLists.txt +++ /dev/null @@ -1,159 +0,0 @@ -include(lite) - -message(WARNING "Lite enabled!") -message(STATUS "LIGHT_FRAMEWORK:\t${LITE_WITH_LIGHT_WEIGHT_FRAMEWORK}") -message(STATUS "LITE_WITH_CUDA:\t${LITE_WITH_CUDA}") -message(STATUS "LITE_WITH_X86:\t${LITE_WITH_X86}") -message(STATUS "LITE_WITH_ARM:\t${LITE_WITH_ARM}") -message(STATUS "LITE_WITH_NPU:\t${LITE_WITH_NPU}") -message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}") -message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}") - -set(LITE_MODEL_DIR
"${THIRD_PARTY_PATH}/install") -set(LITE_ON_MOBILE ${LITE_WITH_LIGHT_WEIGHT_FRAMEWORK}) - -add_subdirectory(utils) -add_subdirectory(operators) -add_subdirectory(kernels) -add_subdirectory(core) -add_subdirectory(model_parser) -add_subdirectory(api) -add_subdirectory(fluid) -add_subdirectory(backends) - -if (NOT LITE_ON_TINY_PUBLISH) - add_subdirectory(tests) - add_subdirectory(tools) -endif() -if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND NOT LITE_ON_TINY_PUBLISH) - add_subdirectory(gen_code) -endif() - -if (WITH_TESTING) - lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "lite_naive_model.tar.gz") - if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) - lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v1.tar.gz") - lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v2_relu.tar.gz") - lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "resnet50.tar.gz") - lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "inception_v4_simple.tar.gz") - lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "MobileNetV1_quant.tar.gz") - endif() - if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) - lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "GoogleNet_inference.tar.gz") - lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v1.tar.gz") - lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v2_relu.tar.gz") - lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "resnet50.tar.gz") - lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "inception_v4_simple.tar.gz") - endif() -endif() - -if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) - # for publish - set(INFER_LITE_PUBLISH_ROOT "${CMAKE_BINARY_DIR}/inference_lite_lib.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}") - if (LITE_WITH_OPENCL) - set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.opencl") - endif(LITE_WITH_OPENCL) - if (LITE_WITH_NPU) - set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.npu") - endif(LITE_WITH_NPU) - message(STATUS "publish inference lib to ${INFER_LITE_PUBLISH_ROOT}") - - # The final target for publish lite lib - add_custom_target(publish_inference) - if (NOT LITE_ON_TINY_PUBLISH) - # add cxx lib - add_custom_target(publish_inference_cxx_lib ${TARGET} - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin" - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/include" - COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" - COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_full_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" - COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" - #COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/model_optimize_tool" "${INFER_LITE_PUBLISH_ROOT}/bin" - COMMAND cp "${CMAKE_BINARY_DIR}/lite/gen_code/paddle_code_generator" "${INFER_LITE_PUBLISH_ROOT}/bin" - COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/test_model_bin" "${INFER_LITE_PUBLISH_ROOT}/bin" - ) - if(NOT IOS) - #add_dependencies(publish_inference_cxx_lib model_optimize_tool) - add_dependencies(publish_inference_cxx_lib paddle_code_generator) - add_dependencies(publish_inference_cxx_lib bundle_full_api) - add_dependencies(publish_inference_cxx_lib bundle_light_api) - add_dependencies(publish_inference_cxx_lib test_model_bin) - add_dependencies(publish_inference publish_inference_cxx_lib) - add_custom_command(TARGET publish_inference_cxx_lib POST_BUILD - COMMAND ${CMAKE_STRIP} "--strip-debug" 
${INFER_LITE_PUBLISH_ROOT}/cxx/lib/*.a) - endif() - else() - if (IOS OR (ARM_TARGET_OS STREQUAL "armlinux")) - add_custom_target(tiny_publish_lib ${TARGET} - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/lib" - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/include" - COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/include" - COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/lib" - ) - add_dependencies(tiny_publish_lib bundle_light_api) - add_dependencies(publish_inference tiny_publish_lib) - endif() - endif() - - - if (LITE_WITH_JAVA) - # add java lib - add_custom_target(publish_inference_java_lib ${TARGET} - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/java/so" - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/java/jar" - COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/android/jni/native/libpaddle_lite_jni.so" "${INFER_LITE_PUBLISH_ROOT}/java/so" - COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/android/jni/PaddlePredictor.jar" "${INFER_LITE_PUBLISH_ROOT}/java/jar" - COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/api/android/jni/src" "${INFER_LITE_PUBLISH_ROOT}/java" - ) - add_dependencies(publish_inference_java_lib paddle_lite_jni PaddlePredictor) - add_dependencies(publish_inference publish_inference_java_lib) - add_custom_command(TARGET publish_inference_java_lib POST_BUILD - COMMAND ${CMAKE_STRIP} "-s" ${INFER_LITE_PUBLISH_ROOT}/java/so/libpaddle_lite_jni.so) - endif() - - if ((ARM_TARGET_OS STREQUAL "android") AND (NOT LITE_WITH_OPENCL) AND - ((ARM_TARGET_ARCH_ABI STREQUAL armv7) OR (ARM_TARGET_ARCH_ABI STREQUAL armv8))) - if (NOT LITE_ON_TINY_PUBLISH) - # copy - add_custom_target(publish_inference_android_cxx_demos ${TARGET} - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party" - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/include" - COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/gflags" "${INFER_LITE_PUBLISH_ROOT}/third_party" - COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/Makefile.def" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" - COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/README.md" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" - COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_full" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" - COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_full/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_full/Makefile" - COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_light" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" - COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_light/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_light/Makefile" - ) - add_dependencies(publish_inference_android_cxx_demos logging gflags) - add_dependencies(publish_inference_cxx_lib publish_inference_android_cxx_demos) - endif() - - if (LITE_WITH_JAVA) - # copy java mobile_light demo/lib - add_custom_target(publish_inference_android_java_demo ${TARGET} - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/java" - COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/java/android" "${INFER_LITE_PUBLISH_ROOT}/demo/java" - COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/java/README.md" "${INFER_LITE_PUBLISH_ROOT}/demo/java" - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/java/android/PaddlePredictor/app/libs" - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/java/android/PaddlePredictor/app/src/main/jniLibs/arm7" - COMMAND mkdir -p 
"${INFER_LITE_PUBLISH_ROOT}/demo/java/android/PaddlePredictor/app/src/main/jniLibs/arm8" - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/java/android/PaddlePredictor/app/src/main/jniLibs/arm64-v8a" - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/java/android/PaddlePredictor/app/src/main/jniLibs/armeabi-v7a" - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/java/android/PaddlePredictor/app/src/main/jniLibs/x86" - ) - add_dependencies(publish_inference_java_lib publish_inference_android_java_demo) - endif() - endif() - - if (LITE_WITH_OPENCL) - add_custom_target(publish_inference_opencl ${TARGET} - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/opencl" - COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/backends/opencl/cl_kernel" "${INFER_LITE_PUBLISH_ROOT}/opencl" - ) - add_dependencies(publish_inference_cxx_lib publish_inference_opencl) - endif() -endif() diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt deleted file mode 100644 index 7767458b37..0000000000 --- a/lite/api/CMakeLists.txt +++ /dev/null @@ -1,239 +0,0 @@ -if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) - lite_cc_library(place SRCS paddle_place.cc DEPS logging) -else() - lite_cc_library(place SRCS paddle_place.cc DEPS glog) -endif(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) - -if (WITH_TESTING) - lite_cc_library(lite_api_test_helper SRCS lite_api_test_helper.cc - DEPS scope optimizer target_wrapper_host model_parser program - ${ops} ${host_kernels} - CUDA_DEPS ${cuda_kernels} - X86_DEPS ${x86_kernels}) -endif() -if(LITE_WITH_FPGA) - set(light_api_deps ${light_api_deps} ${fpga_deps}) - set(cxx_api_deps ${cxx_api_deps} ${fpga_deps}) -endif() - -message(STATUS "get ops ${ops}") -message(STATUS "get X86 kernels ${x86_kernels}") -message(STATUS "get Host kernels ${host_kernels}") -message(STATUS "get ARM kernels ${arm_kernels}") -message(STATUS "get NPU kernels ${npu_kernels}") -message(STATUS "get FPGA kernels ${fpga_kernels}") - -# for full api -if (NOT LITE_ON_TINY_PUBLISH) - set(cxx_api_deps - scope optimizer target_wrapper_host model_parser program) - lite_cc_library(cxx_api - SRCS cxx_api.cc - DEPS ${cxx_api_deps} ${ops} ${host_kernels} program - X86_DEPS ${x86_kernels} - ARM_DEPS ${arm_kernels} - NPU_DEPS ${npu_kernels} ${npu_bridges} npu_pass - CL_DEPS ${opencl_kenrels} - FPGA_DEPS ${fpga_kenrels}) -endif() - -# for light api -set(light_api_deps - scope target_wrapper_host model_parser program) -if(LITE_WITH_CUDA) - set(light_api_deps ${light_api_deps} target_wrapper_cuda) -endif() -lite_cc_library(light_api SRCS light_api.cc - DEPS scope target_wrapper_host model_parser - ${light_api_deps} ${ops} ${host_kernels} program - CUDA_DEPS ${cuda_kernels} - X86_DEPS ${x86_kernels} - ARM_DEPS ${arm_kernels} - NPU_DEPS ${npu_kernels} ${npu_bridges} npu_pass - CL_DEPS ${opencl_kenrels} - FPGA_DEPS ${fpga_kenrels}) - -include(ExternalProject) -set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING - "A path setting inference demo download directories.") - -if(WITH_TESTING) - lite_cc_test(test_cxx_api SRCS cxx_api_test.cc - DEPS cxx_api mir_passes lite_api_test_helper - ${ops} ${host_kernels} - X86_DEPS ${x86_kernels} - ARM_DEPS ${arm_kernels} - NPU_DEPS ${npu_kernels} - CL_DEPS ${opencl_kernels} - FPGA_DEPS ${fpga_kernels} - EXCLUDE_COMPILE_DEPS "ON" - ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model - --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) - add_dependencies(test_cxx_api extern_lite_download_lite_naive_model_tar_gz) - if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) - lite_cc_test(test_googlenet SRCS 
test_googlenet_lite.cc - DEPS cxx_api mir_passes lite_api_test_helper - ${ops} ${host_kernels} ${x86_kernels} - ARGS --model_dir=${LITE_MODEL_DIR}/googlenet) - add_dependencies(test_googlenet extern_lite_download_GoogleNet_inference_tar_gz) - lite_cc_test(test_mobilenetv1_lite_x86 SRCS test_mobilenetv1_lite_x86.cc - DEPS cxx_api mir_passes lite_api_test_helper - ${ops} ${host_kernels} ${x86_kernels} - ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v1) - add_dependencies(test_mobilenetv1_lite_x86 extern_lite_download_mobilenet_v1_tar_gz) - lite_cc_test(test_mobilenetv2_lite_x86 SRCS test_mobilenetv2_lite_x86.cc - DEPS cxx_api mir_passes lite_api_test_helper - ${ops} ${host_kernels} ${x86_kernels} - ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v2_relu) - add_dependencies(test_mobilenetv2_lite_x86 extern_lite_download_mobilenet_v2_relu_tar_gz) - lite_cc_test(test_inceptionv4_lite_x86 SRCS test_inceptionv4_lite_x86.cc - DEPS cxx_api mir_passes lite_api_test_helper - ${ops} ${host_kernels} ${x86_kernels} - ARGS --model_dir=${LITE_MODEL_DIR}/inception_v4_simple) - add_dependencies(test_inceptionv4_lite_x86 extern_lite_download_inception_v4_simple_tar_gz) - endif() -endif() - -if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING) - set(lite_model_test_DEPS cxx_api mir_passes ${ops} ${host_kernels} ${arm_kernels} ${npu_kernels} ${fpga_kernels}) - - lite_cc_test(test_mobilenetv1_int8 SRCS mobilenetv1_int8_test.cc - DEPS ${lite_model_test_DEPS} - CL_DEPS ${opencl_kernels} - ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl - --model_dir=${LITE_MODEL_DIR}/MobilenetV1_quant SERIAL) - add_dependencies(test_mobilenetv1_int8 extern_lite_download_MobileNetV1_quant_tar_gz) - - lite_cc_test(test_mobilenetv1 SRCS mobilenetv1_test.cc - DEPS ${lite_model_test_DEPS} - CL_DEPS ${opencl_kernels} - NPU_DEPS ${npu_kernels} ${npu_bridges} - ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl - --model_dir=${LITE_MODEL_DIR}/mobilenet_v1 SERIAL) - add_dependencies(test_mobilenetv1 extern_lite_download_mobilenet_v1_tar_gz) - set(LINK_FLAGS "-Wl,--version-script ${PADDLE_SOURCE_DIR}/lite/core/lite.map") - set_target_properties(test_mobilenetv1 PROPERTIES LINK_FLAGS "${LINK_FLAGS}") - - lite_cc_test(test_mobilenetv2 SRCS mobilenetv2_test.cc - DEPS ${lite_model_test_DEPS} - CL_DEPS ${opencl_kernels} - ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl - --model_dir=${LITE_MODEL_DIR}/mobilenet_v2_relu SERIAL) - add_dependencies(test_mobilenetv2 extern_lite_download_mobilenet_v2_relu_tar_gz) - set_target_properties(test_mobilenetv2 PROPERTIES LINK_FLAGS "${LINK_FLAGS}") - - lite_cc_test(test_resnet50 SRCS resnet50_test.cc - DEPS ${lite_model_test_DEPS} paddle_api_light - CL_DEPS ${opencl_kernels} - FPGA_DEPS ${fpga_kernels} - ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl - --model_dir=${LITE_MODEL_DIR}/resnet50 SERIAL) - add_dependencies(test_resnet50 extern_lite_download_resnet50_tar_gz) - - lite_cc_test(test_resnet50_fpga SRCS resnet50_test_fpga.cc - DEPS ${lite_model_test_DEPS} - CL_DEPS ${opencl_kernels} - FPGA_DEPS ${fpga_kernels}) - - lite_cc_test(test_inceptionv4 SRCS inceptionv4_test.cc - DEPS ${lite_model_test_DEPS} - CL_DEPS ${opencl_kernels} - ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl - --model_dir=${LITE_MODEL_DIR}/inception_v4 SERIAL) - add_dependencies(test_inceptionv4 extern_lite_download_inception_v4_simple_tar_gz) - # lite_cc_test(test_ocr_attention SRCS ocr_attention_test.cc - # DEPS ${lite_model_test_DEPS}) - - # lite_cc_test(model_run_test_image SRCS 
model_run_test_image.cc - # DEPS ${lite_model_test_DEPS} - # CL_DEPS ${opencl_kernels} - # FPGA_DEPS ${fpga_kernels}) -endif() - -# These tests needs CLI arguments, and is not supported in ARM CI. -# TODO(Superjomn) support latter. -lite_cc_test(test_light_api SRCS light_api_test.cc - DEPS light_api program mir_passes - CL_DEPS ${opencl_kernels} - FPGA_DEPS ${fpga_kernels} - ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) - -lite_cc_test(test_apis SRCS apis_test.cc - DEPS cxx_api light_api ${ops} - CL_DEPS ${opencl_kernels} - X86_DEPS ${x86_kernels} - FPGA_DEPS ${fpga_kernels} - ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model - --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) - -lite_cc_library(paddle_api SRCS paddle_api.cc DEPS op_params tensor) - -#----------------------------------------------------------------------------------------------------- -# The final inference library for both CxxConfig and MobileConfig. -if (LITE_ON_TINY_PUBLISH) - lite_cc_library(paddle_api_light SRCS light_api_impl.cc DEPS light_api paddle_api stream) -else() - lite_cc_library(paddle_api_light SRCS light_api_impl.cc DEPS light_api paddle_api) -endif() -if (NOT LITE_ON_TINY_PUBLISH) - lite_cc_library(paddle_api_full SRCS cxx_api_impl.cc DEPS cxx_api paddle_api_light - ${ops} - ARM_DEPS ${arm_kernels} - NPU_DEPS ${npu_kernels} - CL_DEPS ${opencl_kernels} - FPGA_DEPS ${fpga_kernels}) - # The final inference library for just MobileConfig. - bundle_static_library(paddle_api_full paddle_api_full_bundled bundle_full_api) -endif() -bundle_static_library(paddle_api_light paddle_api_light_bundled bundle_light_api) -#----------------------------------------------------------------------------------------------------- - -if (LITE_WITH_JAVA AND LITE_WITH_ARM) - add_subdirectory(android) -endif() - -if (LITE_ON_TINY_PUBLISH) - return() -endif() - -if (LITE_ON_MODEL_OPTIMIZE_TOOL) - message(STATUS "Compiling model_optimize_tool") - lite_cc_binary(model_optimize_tool SRCS model_optimize_tool.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc - DEPS gflags kernel op optimizer mir_passes utils) - add_dependencies(model_optimize_tool op_list_h kernel_list_h all_kernel_faked_cc) -endif(LITE_ON_MODEL_OPTIMIZE_TOOL) - -lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle_api_light - ${ops} - ARM_DEPS ${arm_kernels} - NPU_DEPS ${npu_kernels} - CL_DEPS ${opencl_kernels} - X86_DEPS ${x86_kernels} - FPGA_DEPS ${fpga_kernels} - ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model SERIAL) -if (WITH_TESTING) - add_dependencies(test_paddle_api extern_lite_download_lite_naive_model_tar_gz) -endif() - -# Some bins -if(NOT IOS) - lite_cc_binary(test_model_bin SRCS model_test.cc DEPS paddle_api_full paddle_api_light gflags utils - ${ops} - ARM_DEPS ${arm_kernels} - NPU_DEPS ${npu_kernels} - CL_DEPS ${opencl_kernels} - FPGA_DEPS ${fpga_kernels} - X86_DEPS ${x86_kernels}) - lite_cc_binary(benchmark_bin SRCS benchmark.cc DEPS paddle_api_full paddle_api_light gflags utils - ${ops} - ARM_DEPS ${arm_kernels} - NPU_DEPS ${npu_kernels} - CL_DEPS ${opencl_kernels} - FPGA_DEPS ${fpga_kernels} - X86_DEPS ${x86_kernels}) -endif() - -#lite_cc_binary(cxx_api_bin SRCS cxx_api_bin.cc - #X86_DEPS operator - #DEPS light_api model_parser target_wrapper_host mir_passes - #ARM_DEPS ${arm_kernels}) NPU_DEPS ${npu_kernels}) diff --git a/lite/api/_paddle_use_kernels.h b/lite/api/_paddle_use_kernels.h deleted file mode 100644 index 75756736f4..0000000000 --- a/lite/api/_paddle_use_kernels.h +++ /dev/null 
@@ -1,209 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -/* - * ATTENTION this header file can only include in .cc file. - */ - -#pragma once -#include "paddle_lite_factory_helper.h" // NOLINT -#ifndef LITE_WITH_FPGA -USE_LITE_KERNEL(feed, kHost, kAny, kAny, def); -USE_LITE_KERNEL(fetch, kHost, kAny, kAny, def); -USE_LITE_KERNEL(flatten, kHost, kAny, kAny, def); -USE_LITE_KERNEL(flatten2, kHost, kAny, kAny, def); -#else -USE_LITE_KERNEL(feed, kFPGA, kFP16, kNHWC, def); -USE_LITE_KERNEL(fetch, kFPGA, kFP16, kNHWC, def); -#endif - -// host kernels -USE_LITE_KERNEL(reshape, kHost, kAny, kAny, def); -USE_LITE_KERNEL(reshape2, kHost, kAny, kAny, def); -USE_LITE_KERNEL(multiclass_nms, kHost, kFloat, kNCHW, def); - -#ifdef LITE_WITH_ARM -USE_LITE_KERNEL(fc, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(matmul, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(scale, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(softmax, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(lrn, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(decode_bboxes, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(box_coder, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(conv2d, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(elementwise_add, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(elementwise_sub, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(elementwise_mul, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(elementwise_max, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(elementwise_div, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(fusion_elementwise_div_activation, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(fusion_elementwise_add_activation, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(fusion_elementwise_mul_activation, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(fusion_elementwise_max_activation, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(split, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(dropout, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(concat, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(pool2d, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(relu, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(relu6, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(transpose, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(transpose2, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(batch_norm, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(power, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(shuffle_channel, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(yolo_box, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(argmax, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(axpy, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(leaky_relu, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(relu_clipped, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(prelu, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(sigmoid, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(tanh, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(swish, kARM, kFloat, kNCHW, def); 
-USE_LITE_KERNEL(log, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(exp, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(conv2d_transpose, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(pad2d, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(prior_box, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(density_prior_box, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(negative, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(crop, kARM, kFloat, kNCHW, def); - -USE_LITE_KERNEL(norm, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(sequence_softmax, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(im2sequence, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(bilinear_interp, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(nearest_interp, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(logical_xor, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(logical_and, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(less_than, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(top_k, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(increment, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(write_to_array, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(read_from_array, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(reduce_max, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(sequence_expand, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(sequence_pool, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(shape, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(fill_constant, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(cast, kARM, kFloat, kNCHW, def) -USE_LITE_KERNEL(slice, kARM, kFloat, kNCHW, def) -USE_LITE_KERNEL(affine_channel, kARM, kFloat, kNCHW, def) -USE_LITE_KERNEL(anchor_generator, kARM, kFloat, kNCHW, def) -USE_LITE_KERNEL(generate_proposals, kARM, kFloat, kNCHW, def) -USE_LITE_KERNEL(squeeze, kARM, kFloat, kNCHW, def) // for x2paddle -USE_LITE_KERNEL(squeeze2, kARM, kFloat, kNCHW, def) // for x2paddle -USE_LITE_KERNEL(expand, kARM, kFloat, kNCHW, def) // for x2paddle -USE_LITE_KERNEL(roi_align, kARM, kFloat, kNCHW, def) -USE_LITE_KERNEL(box_clip, kARM, kFloat, kNCHW, def) -USE_LITE_KERNEL(reduce_mean, kARM, kFloat, kNCHW, def) -USE_LITE_KERNEL(stack, kARM, kFloat, kNCHW, def) -USE_LITE_KERNEL(assign_value, kARM, kFloat, kNCHW, def) -USE_LITE_KERNEL(hard_sigmoid, kARM, kFloat, kNCHW, def) - -USE_LITE_KERNEL(calib, kARM, kInt8, kNCHW, fp32_to_int8); -USE_LITE_KERNEL(calib, kARM, kInt8, kNCHW, int8_to_fp32); -USE_LITE_KERNEL(calib_once, kARM, kInt8, kNCHW, fp32_to_int8); -USE_LITE_KERNEL(calib_once, kARM, kInt8, kNCHW, int8_to_fp32); -USE_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, int8_out); -USE_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, fp32_out); -USE_LITE_KERNEL(fc, kARM, kInt8, kNCHW, int8out); -USE_LITE_KERNEL(fc, kARM, kInt8, kNCHW, fp32out); -USE_LITE_KERNEL(gru_unit, kARM, kFloat, kNCHW, def) -USE_LITE_KERNEL(gru, kARM, kFloat, kNCHW, def) -USE_LITE_KERNEL(beam_search_decode, kARM, kFloat, kNCHW, def) -USE_LITE_KERNEL(beam_search, kARM, kFloat, kNCHW, def) -USE_LITE_KERNEL(while, kARM, kFloat, kNCHW, def) -USE_LITE_KERNEL(lod_reset, kARM, kFloat, kNCHW, def) -USE_LITE_KERNEL(lookup_table, kARM, kFloat, kNCHW, def) -USE_LITE_KERNEL(is_empty, kARM, kFloat, kNCHW, def) -USE_LITE_KERNEL(assign, kARM, kFloat, kNCHW, def); -#endif - -#ifdef LITE_WITH_X86 -// NOTE all the X86 kernels are disabled temporarily for kernel are changed. 
-// USE_LITE_KERNEL(relu, kX86, kFloat, kNCHW, def); -// USE_LITE_KERNEL(mul, kX86, kFloat, kNCHW, def); -// USE_LITE_KERNEL(fc, kX86, kFloat, kNCHW, def); -USE_LITE_KERNEL(scale, kX86, kFloat, kNCHW, def); -USE_LITE_KERNEL(slice, kX86, kFloat, kNCHW, def); -// USE_LITE_KERNEL(fill_constant, kX86, kFloat, kNCHW, def); -// USE_LITE_KERNEL(square, kX86, kFloat, kNCHW, def); -// USE_LITE_KERNEL(elementwise_sub, kX86, kFloat, kNCHW, def); -// USE_LITE_KERNEL(elementwise_add, kX86, kFloat, kNCHW, def); -// USE_LITE_KERNEL(softmax, kX86, kFloat, kNCHW, def); -// USE_LITE_KERNEL(dropout, kX86, kFloat, kNCHW, def); -// USE_LITE_KERNEL(concat, kX86, kFloat, kNCHW, def); -// USE_LITE_KERNEL(conv2d, kX86, kFloat, kNCHW, def); -// USE_LITE_KERNEL(depthwise_conv2d, kX86, kFloat, kNCHW, def); -// USE_LITE_KERNEL(pool2d, kX86, kFloat, kNCHW, def); -// USE_LITE_KERNEL(batch_norm, kX86, kFloat, kNCHW, def); -#endif - -#ifdef LITE_WITH_CUDA -USE_LITE_KERNEL(mul, kCUDA, kFloat, kNCHW, def); -USE_LITE_KERNEL(io_copy, kCUDA, kAny, kAny, host_to_device); -USE_LITE_KERNEL(io_copy, kCUDA, kAny, kAny, device_to_host); -USE_LITE_KERNEL(io_copy_once, kCUDA, kAny, kAny, host_to_device); -USE_LITE_KERNEL(io_copy_once, kCUDA, kAny, kAny, device_to_host); -USE_LITE_KERNEL(conv2d, kCUDA, kFloat, kNCHW, def); -USE_LITE_KERNEL(leaky_relu, kCUDA, kFloat, kNCHW, def); -USE_LITE_KERNEL(nearest_interp, kCUDA, kFloat, kNCHW, def); -USE_LITE_KERNEL(yolo_box, kCUDA, kFloat, kNCHW, def); -USE_LITE_KERNEL(concat, kCUDA, kFloat, kNCHW, def); -#endif - -#ifdef LITE_WITH_OPENCL -USE_LITE_KERNEL(io_copy, kOpenCL, kAny, kAny, host_to_device); -USE_LITE_KERNEL(io_copy, kOpenCL, kAny, kAny, device_to_host); -USE_LITE_KERNEL(io_copy_once, kOpenCL, kAny, kAny, host_to_device); -USE_LITE_KERNEL(io_copy_once, kOpenCL, kAny, kAny, device_to_host); - -USE_LITE_KERNEL(fc, kOpenCL, kFloat, kNCHW, def); -USE_LITE_KERNEL(mul, kOpenCL, kFloat, kNCHW, def); -USE_LITE_KERNEL(elementwise_add, kOpenCL, kFloat, kNCHW, def); -USE_LITE_KERNEL(fusion_elementwise_add_activation, kOpenCL, kFloat, kNCHW, def); -USE_LITE_KERNEL(pool2d, kOpenCL, kFloat, kNCHW, def); -USE_LITE_KERNEL(relu, kOpenCL, kFloat, kNCHW, def); -USE_LITE_KERNEL(depthwise_conv2d, kOpenCL, kFloat, kNCHW, def); -USE_LITE_KERNEL(conv2d, kOpenCL, kFloat, kNCHW, def); -#endif - -#ifdef LITE_WITH_NPU -USE_LITE_KERNEL(graph_op, kNPU, kFloat, kNCHW, def); -#endif -#ifdef LITE_WITH_FPGA -USE_LITE_KERNEL(relu, kFPGA, kFP16, kNHWC, def); -USE_LITE_KERNEL(conv2d, kFPGA, kFP16, kNHWC, def); -USE_LITE_KERNEL(elementwise_add, kFPGA, kFP16, kNHWC, def); -USE_LITE_KERNEL(fusion_elementwise_add_activation, kFPGA, kFP16, kNHWC, def); -USE_LITE_KERNEL(fc, kFPGA, kFP16, kNHWC, def); -USE_LITE_KERNEL(pool2d, kFPGA, kFP16, kNHWC, def); -USE_LITE_KERNEL(scale, kFPGA, kFP16, kNHWC, def); -USE_LITE_KERNEL(softmax, kFPGA, kFP16, kNHWC, def); -USE_LITE_KERNEL(io_copy, kFPGA, kAny, kAny, host_to_device); -USE_LITE_KERNEL(io_copy, kFPGA, kAny, kAny, device_to_host); -USE_LITE_KERNEL(io_copy_once, kFPGA, kAny, kAny, host_to_device_once); -USE_LITE_KERNEL(io_copy_once, kFPGA, kAny, kAny, device_to_host_once); -USE_LITE_KERNEL(calib, kFPGA, kFP16, kNHWC, fp32_to_fp16_fpga); -USE_LITE_KERNEL(calib, kFPGA, kFP16, kNHWC, fp16_to_fp32_fpga); -USE_LITE_KERNEL(calib_once, kFPGA, kFP16, kNHWC, fp32_to_fp16_fpga); -USE_LITE_KERNEL(calib_once, kFPGA, kFP16, kNHWC, fp16_to_fp32_fpga); -USE_LITE_KERNEL(layout, kFPGA, kAny, kNHWC, hwc_to_chw_fpga_fp16); -USE_LITE_KERNEL(layout, kFPGA, kAny, kNHWC, chw_to_hwc_fpga_fp16); 
-USE_LITE_KERNEL(layout_once, kFPGA, kAny, kNHWC, hwc_to_chw_fpga_fp16); -USE_LITE_KERNEL(layout_once, kFPGA, kAny, kNHWC, chw_to_hwc_fpga_fp16); -#endif diff --git a/lite/api/_paddle_use_ops.h b/lite/api/_paddle_use_ops.h deleted file mode 100644 index 890c57c4aa..0000000000 --- a/lite/api/_paddle_use_ops.h +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -// ATTENTION This can only include in a .cc file. - -#include "paddle_lite_factory_helper.h" // NOLINT - -USE_LITE_OP(mul); -USE_LITE_OP(matmul); -USE_LITE_OP(fc); -USE_LITE_OP(relu); -USE_LITE_OP(relu6); -USE_LITE_OP(scale); -USE_LITE_OP(feed); -USE_LITE_OP(lrn); -USE_LITE_OP(decode_bboxes); -USE_LITE_OP(box_coder); -USE_LITE_OP(fetch); -USE_LITE_OP(io_copy); -USE_LITE_OP(io_copy_once); -USE_LITE_OP(elementwise_add) -USE_LITE_OP(elementwise_sub) -USE_LITE_OP(elementwise_mul) -USE_LITE_OP(elementwise_max) -USE_LITE_OP(elementwise_div) -USE_LITE_OP(fusion_elementwise_add_activation) -USE_LITE_OP(fusion_elementwise_mul_activation) -USE_LITE_OP(fusion_elementwise_max_activation) -USE_LITE_OP(fusion_elementwise_div_activation) -USE_LITE_OP(square) -USE_LITE_OP(softmax) -USE_LITE_OP(dropout) -USE_LITE_OP(concat) -USE_LITE_OP(conv2d) -USE_LITE_OP(depthwise_conv2d) -USE_LITE_OP(pool2d) -USE_LITE_OP(batch_norm) -USE_LITE_OP(fusion_elementwise_sub_activation) -USE_LITE_OP(transpose) -USE_LITE_OP(transpose2) -USE_LITE_OP(arg_max) -USE_LITE_OP(axpy) -USE_LITE_OP(leaky_relu) -USE_LITE_OP(relu_clipped) -USE_LITE_OP(prelu) -USE_LITE_OP(sigmoid) -USE_LITE_OP(tanh) -USE_LITE_OP(swish) -USE_LITE_OP(log) -USE_LITE_OP(exp) -USE_LITE_OP(conv2d_transpose) -USE_LITE_OP(negative) -USE_LITE_OP(pad2d) -USE_LITE_OP(power) -USE_LITE_OP(shuffle_channel) -USE_LITE_OP(yolo_box) -USE_LITE_OP(bilinear_interp) -USE_LITE_OP(nearest_interp) -USE_LITE_OP(reduce_mean) -USE_LITE_OP(stack) - -USE_LITE_OP(assign); -USE_LITE_OP(crop) -USE_LITE_OP(prior_box) -USE_LITE_OP(density_prior_box) -USE_LITE_OP(reshape) -USE_LITE_OP(reshape2) -USE_LITE_OP(flatten) -USE_LITE_OP(flatten2) -USE_LITE_OP(split) -USE_LITE_OP(fake_quantize_moving_average_abs_max); -USE_LITE_OP(fake_dequantize_max_abs); -USE_LITE_OP(fake_quantize_range_abs_max); -USE_LITE_OP(calib); -USE_LITE_OP(calib_once); -USE_LITE_OP(norm); -USE_LITE_OP(layout); -USE_LITE_OP(layout_once); -USE_LITE_OP(im2sequence); -USE_LITE_OP(sequence_softmax); -USE_LITE_OP(logical_xor); -USE_LITE_OP(logical_and); -USE_LITE_OP(less_than); -USE_LITE_OP(top_k); -USE_LITE_OP(increment); -USE_LITE_OP(write_to_array); -USE_LITE_OP(read_from_array); -USE_LITE_OP(gru_unit) -USE_LITE_OP(gru) -USE_LITE_OP(beam_search_decode) -USE_LITE_OP(beam_search) -USE_LITE_OP(fill_constant) -USE_LITE_OP(while) -USE_LITE_OP(lod_reset) -USE_LITE_OP(lookup_table) -USE_LITE_OP(multiclass_nms) -USE_LITE_OP(graph_op) -USE_LITE_OP(sequence_expand) -USE_LITE_OP(sequence_pool) -USE_LITE_OP(reduce_max) -USE_LITE_OP(is_empty) -USE_LITE_OP(shape) 
-USE_LITE_OP(slice) -USE_LITE_OP(cast) -USE_LITE_OP(affine_channel) -USE_LITE_OP(anchor_generator) -USE_LITE_OP(generate_proposals) -USE_LITE_OP(squeeze) // for x2paddle -USE_LITE_OP(squeeze2) // for x2paddle -USE_LITE_OP(expand) // for x2paddle -USE_LITE_OP(roi_align) -USE_LITE_OP(box_clip) -USE_LITE_OP(assign_value) -USE_LITE_OP(hard_sigmoid) diff --git a/lite/api/android/.gitignore b/lite/api/android/.gitignore deleted file mode 100644 index a1d6334395..0000000000 --- a/lite/api/android/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -/bin/ -.classpath diff --git a/lite/api/android/CMakeLists.txt b/lite/api/android/CMakeLists.txt deleted file mode 100644 index 7f31f7e947..0000000000 --- a/lite/api/android/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -if ((NOT LITE_WITH_JAVA) OR (NOT LITE_WITH_ARM)) - return() -endif() - -add_subdirectory(jni) diff --git a/lite/api/android/jni/.gitignore b/lite/api/android/jni/.gitignore deleted file mode 100644 index 1299d2738c..0000000000 --- a/lite/api/android/jni/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -/PaddleListTest.class -/PaddleLite.class -/bin/ diff --git a/lite/api/android/jni/CMakeLists.txt b/lite/api/android/jni/CMakeLists.txt deleted file mode 100644 index b2f5671a7b..0000000000 --- a/lite/api/android/jni/CMakeLists.txt +++ /dev/null @@ -1,52 +0,0 @@ -if ((NOT LITE_WITH_ARM) OR (NOT LITE_WITH_JAVA)) - return() -endif() - -include(UseJava) -find_package(Java REQUIRED) - -# We are only interested in finding jni.h: we do not care about extended JVM -# functionality or the AWT library. -set(JAVA_AWT_LIBRARY NotNeeded) -set(JAVA_JVM_LIBRARY NotNeeded) -set(JAVA_INCLUDE_PATH2 NotNeeded) -set(JAVA_AWT_INCLUDE_PATH NotNeeded) -find_package(JNI REQUIRED) - -# Generate PaddlePredictor.jar -include_directories(${JNI_INCLUDE_DIRS}) -add_jar(PaddlePredictor - src/com/baidu/paddle/lite/ConfigBase.java - src/com/baidu/paddle/lite/CxxConfig.java - src/com/baidu/paddle/lite/MobileConfig.java - src/com/baidu/paddle/lite/PaddleLiteInitializer.java - src/com/baidu/paddle/lite/PaddlePredictor.java - src/com/baidu/paddle/lite/PowerMode.java - src/com/baidu/paddle/lite/Place.java - src/com/baidu/paddle/lite/Tensor.java) -get_target_property(_jarFile PaddlePredictor JAR_FILE) -get_target_property(_classDir PaddlePredictor CLASSDIR) -set(_stubDir "${CMAKE_CURRENT_BINARY_DIR}") - -# Generate native headers -add_custom_target( - paddle_lite_jni_header ALL - COMMAND ${Java_JAVAH_EXECUTABLE} -verbose - -classpath ${_classDir} - -o "${CMAKE_BINARY_DIR}/lite/api/android/jni/native/paddle_lite_jni.h" - -jni - com.baidu.paddle.lite.PaddlePredictor - COMMAND ${Java_JAVAH_EXECUTABLE} -verbose - -classpath ${_classDir} - -o "${CMAKE_BINARY_DIR}/lite/api/android/jni/native/tensor_jni.h" - -jni - com.baidu.paddle.lite.Tensor - COMMAND ${Java_JAVAH_EXECUTABLE} -verbose - -classpath ${_classDir} - -o "${CMAKE_BINARY_DIR}/lite/api/android/jni/native/paddle_init_jni.h" - -jni - com.baidu.paddle.lite.PaddleLiteInitializer - DEPENDS PaddlePredictor -) - -add_subdirectory(native) diff --git a/lite/api/android/jni/native/CMakeLists.txt b/lite/api/android/jni/native/CMakeLists.txt deleted file mode 100644 index afe051a437..0000000000 --- a/lite/api/android/jni/native/CMakeLists.txt +++ /dev/null @@ -1,32 +0,0 @@ -# Generate paddle_lite_jni.so - -if (LITE_ON_TINY_PUBLISH) - set(CMAKE_CXX_FLAGS_RELEASE "-Os -DNDEBUG") - set(CMAKE_C_FLAGS_RELEASE "-Os -DNDEBUG") - set(lib_DEPS light_api paddle_api paddle_api_light) -else() - set(lib_DEPS light_api cxx_api paddle_api_full paddle_api 
paddle_api_light) -endif() - -include_directories(${JNI_INCLUDE_DIRS} ${_classDir} ${_stubDir}) -if (NOT LITE_ON_TINY_PUBLISH) - lite_cc_library(paddle_lite_jni MODULE - SRCS paddle_lite_jni.cc tensor_jni.cc - DEPS ${lib_DEPS} - ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels}) - # Unlike static library, module library has to link target to be able to work - # as a single .so lib. - target_link_libraries(paddle_lite_jni ${lib_DEPS} ${arm_kernels} ${npu_kernels}) -else() - add_library(paddle_lite_jni SHARED "") - target_sources(paddle_lite_jni PUBLIC ${__lite_cc_files} paddle_lite_jni.cc tensor_jni.cc) - add_dependencies(paddle_lite_jni op_list_h kernel_list_h) -endif() - -if (APPLE) - # MacOS only accepts JNI lib ends with .jnilib or .dylib - set_target_properties(paddle_lite_jni PROPERTIES SUFFIX ".jnilib") -elseif (WIN32) - # Windows only accepts JNI lib ends with .dll - set_target_properties(paddle_lite_jni PROPERTIES SUFFIX ".dll") -endif (APPLE) diff --git a/lite/api/android/jni/native/convert_util_jni.h b/lite/api/android/jni/native/convert_util_jni.h deleted file mode 100644 index ae987c330d..0000000000 --- a/lite/api/android/jni/native/convert_util_jni.h +++ /dev/null @@ -1,197 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include - -#include "lite/api/light_api.h" -#include "lite/api/paddle_api.h" -#include "lite/api/paddle_place.h" - -namespace paddle { -namespace lite_api { - -inline std::string jstring_to_cpp_string(JNIEnv *env, jstring jstr) { - // In java, a unicode char will be encoded using 2 bytes (utf16). - // so jstring will contain characters utf16. std::string in c++ is - // essentially a string of bytes, not characters, so if we want to - // pass jstring from JNI to c++, we have convert utf16 to bytes. 
- if (!jstr) { - return ""; - } - const jclass stringClass = env->GetObjectClass(jstr); - const jmethodID getBytes = - env->GetMethodID(stringClass, "getBytes", "(Ljava/lang/String;)[B"); - const jbyteArray stringJbytes = (jbyteArray)env->CallObjectMethod( - jstr, getBytes, env->NewStringUTF("UTF-8")); - - size_t length = (size_t)env->GetArrayLength(stringJbytes); - jbyte *pBytes = env->GetByteArrayElements(stringJbytes, NULL); - - std::string ret = std::string(reinterpret_cast(pBytes), length); - env->ReleaseByteArrayElements(stringJbytes, pBytes, JNI_ABORT); - - env->DeleteLocalRef(stringJbytes); - env->DeleteLocalRef(stringClass); - return ret; -} - -inline jfloatArray cpp_array_to_jfloatarray(JNIEnv *env, - const float *buf, - int64_t len) { - jfloatArray result = env->NewFloatArray(len); - env->SetFloatArrayRegion(result, 0, len, buf); - return result; -} - -inline jintArray cpp_array_to_jintarray(JNIEnv *env, - const int *buf, - int64_t len) { - jintArray result = env->NewIntArray(len); - env->SetIntArrayRegion(result, 0, len, buf); - return result; -} - -inline jbyteArray cpp_array_to_jbytearray(JNIEnv *env, - const int8_t *buf, - int64_t len) { - jbyteArray result = env->NewByteArray(len); - env->SetByteArrayRegion(result, 0, len, buf); - return result; -} - -inline jlongArray int64_vector_to_jlongarray(JNIEnv *env, - const std::vector &vec) { - jlongArray result = env->NewLongArray(vec.size()); - jlong *buf = new jlong[vec.size()]; - for (size_t i = 0; i < vec.size(); ++i) { - buf[i] = (jlong)vec[i]; - } - env->SetLongArrayRegion(result, 0, vec.size(), buf); - delete[] buf; - return result; -} - -inline std::vector jlongarray_to_int64_vector(JNIEnv *env, - jlongArray dims) { - int dim_size = env->GetArrayLength(dims); - jlong *dim_nums = env->GetLongArrayElements(dims, nullptr); - std::vector dim_vec(dim_nums, dim_nums + dim_size); - env->ReleaseLongArrayElements(dims, dim_nums, 0); - return dim_vec; -} - -/** - * Converts Java com.baidu.paddle.lite.Place to c++ paddle::lite_api::Place. 
- */ -inline Place jplace_to_cpp_place(JNIEnv *env, jobject java_place) { - jclass place_jclazz = env->GetObjectClass(java_place); - - jmethodID target_method = - env->GetMethodID(place_jclazz, "getTargetInt", "()I"); - jmethodID precision_method = - env->GetMethodID(place_jclazz, "getPrecisionInt", "()I"); - jmethodID data_layout_method = - env->GetMethodID(place_jclazz, "getDataLayoutInt", "()I"); - jmethodID device_method = env->GetMethodID(place_jclazz, "getDevice", "()I"); - - int target = env->CallIntMethod(java_place, target_method); - int precision = env->CallIntMethod(java_place, precision_method); - int data_layout = env->CallIntMethod(java_place, data_layout_method); - int device = env->CallIntMethod(java_place, device_method); - - return Place(static_cast(target), - static_cast(precision), - static_cast(data_layout), - device); -} - -inline CxxConfig jcxxconfig_to_cpp_cxxconfig(JNIEnv *env, jobject jcxxconfig) { - jclass cxxconfig_jclazz = env->GetObjectClass(jcxxconfig); - - jmethodID model_dir_method = - env->GetMethodID(cxxconfig_jclazz, "getModelDir", "()Ljava/lang/String;"); - jmethodID preferred_place_method = env->GetMethodID( - cxxconfig_jclazz, "getPreferredPlace", "()Lcom/baidu/paddle/lite/Place;"); - jmethodID valid_places_method = env->GetMethodID( - cxxconfig_jclazz, "getValidPlaces", "()[Lcom/baidu/paddle/lite/Place;"); - - CxxConfig config; - - jstring java_model_dir = - (jstring)env->CallObjectMethod(jcxxconfig, model_dir_method); - if (java_model_dir != nullptr) { - std::string cpp_model_dir = jstring_to_cpp_string(env, java_model_dir); - config.set_model_dir(cpp_model_dir); - } - - jobject java_preferred_place = - env->CallObjectMethod(jcxxconfig, preferred_place_method); - if (java_preferred_place != nullptr) { - Place cpp_preferred_place = jplace_to_cpp_place(env, java_preferred_place); - config.set_preferred_place(cpp_preferred_place); - } - - jobject object_valid_places = - env->CallObjectMethod(jcxxconfig, valid_places_method); - jobjectArray *java_valid_places = - reinterpret_cast(&object_valid_places); - if (java_valid_places != nullptr) { - int valid_place_count = env->GetArrayLength(*java_valid_places); - std::vector cpp_valid_places; - for (int i = 0; i < valid_place_count; ++i) { - jobject jplace = env->GetObjectArrayElement(*java_valid_places, i); - cpp_valid_places.push_back(jplace_to_cpp_place(env, jplace)); - } - config.set_valid_places(cpp_valid_places); - } - - return config; -} - -inline MobileConfig jmobileconfig_to_cpp_mobileconfig(JNIEnv *env, - jobject jmobileconfig) { - jclass mobileconfig_jclazz = env->GetObjectClass(jmobileconfig); - - MobileConfig config; - - // set model dir - jmethodID model_dir_method = env->GetMethodID( - mobileconfig_jclazz, "getModelDir", "()Ljava/lang/String;"); - jstring java_model_dir = - (jstring)env->CallObjectMethod(jmobileconfig, model_dir_method); - if (java_model_dir != nullptr) { - std::string cpp_model_dir = jstring_to_cpp_string(env, java_model_dir); - config.set_model_dir(cpp_model_dir); - } - - // set threads - jmethodID threads_method = - env->GetMethodID(mobileconfig_jclazz, "getThreads", "()I"); - int threads = env->CallIntMethod(jmobileconfig, threads_method); - config.set_threads(threads); - - // set power mode - jmethodID power_mode_method = - env->GetMethodID(mobileconfig_jclazz, "getPowerModeInt", "()I"); - int power_mode = env->CallIntMethod(jmobileconfig, power_mode_method); - config.set_power_mode(static_cast(power_mode)); - - return config; -} - -} // namespace lite_api -} // namespace 
paddle diff --git a/lite/api/android/jni/native/paddle_lite_jni.cc b/lite/api/android/jni/native/paddle_lite_jni.cc deleted file mode 100644 index aa4ece6818..0000000000 --- a/lite/api/android/jni/native/paddle_lite_jni.cc +++ /dev/null @@ -1,164 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "lite/api/android/jni/native/paddle_lite_jni.h" - -#include <memory> -#include <string> -#include <utility> -#include <vector> - -#include "lite/api/android/jni/native/convert_util_jni.h" -#include "lite/api/light_api.h" -#include "lite/api/paddle_api.h" - -#ifdef __cplusplus -extern "C" { -#endif - -namespace paddle { -namespace lite_api { - -inline static std::shared_ptr<PaddlePredictor> *getPaddlePredictorPointer( - JNIEnv *env, jobject jpaddle_predictor) { - jclass jclazz = env->GetObjectClass(jpaddle_predictor); - jfieldID jfield = env->GetFieldID(jclazz, "cppPaddlePredictorPointer", "J"); - jlong java_pointer = env->GetLongField(jpaddle_predictor, jfield); - std::shared_ptr<PaddlePredictor> *ptr = - reinterpret_cast<std::shared_ptr<PaddlePredictor> *>(java_pointer); - return ptr; -} - -JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_lite_PaddlePredictor_run( - JNIEnv *env, jobject jpaddle_predictor) { - std::shared_ptr<PaddlePredictor> *predictor = - getPaddlePredictorPointer(env, jpaddle_predictor); - if (predictor == nullptr || (*predictor == nullptr)) { - return JNI_FALSE; - } - (*predictor)->Run(); - return JNI_TRUE; -} - -JNIEXPORT jboolean JNICALL -Java_com_baidu_paddle_lite_PaddlePredictor_saveOptimizedModel( - JNIEnv *env, jobject jpaddle_predictor, jstring model_dir) { - std::shared_ptr<PaddlePredictor> *predictor = - getPaddlePredictorPointer(env, jpaddle_predictor); - if (predictor == nullptr || (*predictor == nullptr)) { - return JNI_FALSE; - } - (*predictor)->SaveOptimizedModel(jstring_to_cpp_string(env, model_dir)); - return JNI_TRUE; -} - -JNIEXPORT jlong JNICALL -Java_com_baidu_paddle_lite_PaddlePredictor_getInputCppTensorPointer( - JNIEnv *env, jobject jpaddle_predictor, jint offset) { - std::shared_ptr<PaddlePredictor> *predictor = - getPaddlePredictorPointer(env, jpaddle_predictor); - if (predictor == nullptr || (*predictor == nullptr)) { - return 0; - } - std::unique_ptr<Tensor> tensor = - (*predictor)->GetInput(static_cast<int>(offset)); - std::unique_ptr<Tensor> *cpp_tensor_pointer = - new std::unique_ptr<Tensor>(std::move(tensor)); - return reinterpret_cast<jlong>(cpp_tensor_pointer); -} - -JNIEXPORT jlong JNICALL -Java_com_baidu_paddle_lite_PaddlePredictor_getOutputCppTensorPointer( - JNIEnv *env, jobject jpaddle_predictor, jint offset) { - std::shared_ptr<PaddlePredictor> *predictor = - getPaddlePredictorPointer(env, jpaddle_predictor); - if (predictor == nullptr || (*predictor == nullptr)) { - return 0; - } - std::unique_ptr<const Tensor> tensor = - (*predictor)->GetOutput(static_cast<int>(offset)); - std::unique_ptr<const Tensor> *cpp_tensor_pointer = - new std::unique_ptr<const Tensor>(std::move(tensor)); - return reinterpret_cast<jlong>(cpp_tensor_pointer); -} - -JNIEXPORT jlong JNICALL -Java_com_baidu_paddle_lite_PaddlePredictor_getCppTensorPointerByName( - JNIEnv *env, jobject jpaddle_predictor, jstring name) { - std::string cpp_name = jstring_to_cpp_string(env, name); - std::shared_ptr<PaddlePredictor> *predictor = - getPaddlePredictorPointer(env, jpaddle_predictor); - if (predictor == nullptr || (*predictor == nullptr)) { - return 0; - } - std::unique_ptr<const Tensor> tensor = (*predictor)->GetTensor(cpp_name); - std::unique_ptr<const Tensor> *cpp_tensor_pointer = - new std::unique_ptr<const Tensor>(std::move(tensor)); - return reinterpret_cast<jlong>(cpp_tensor_pointer); -} - -JNIEXPORT jlong JNICALL -Java_com_baidu_paddle_lite_PaddlePredictor_newCppPaddlePredictor__Lcom_baidu_\ paddle_lite_CxxConfig_2(JNIEnv *env, - jobject jpaddle_predictor, - jobject jcxxconfig) { -#ifndef LITE_ON_TINY_PUBLISH - CxxConfig config = jcxxconfig_to_cpp_cxxconfig(env, jcxxconfig); - std::shared_ptr<PaddlePredictor> predictor = - paddle::lite_api::CreatePaddlePredictor(config); - if (predictor == nullptr) { - return 0; - } - std::shared_ptr<PaddlePredictor> *predictor_pointer = - new std::shared_ptr<PaddlePredictor>(predictor); - return reinterpret_cast<jlong>(predictor_pointer); -#else - return 0; -#endif -} - -JNIEXPORT jlong JNICALL -Java_com_baidu_paddle_lite_PaddlePredictor_newCppPaddlePredictor__Lcom_baidu_\ paddle_lite_MobileConfig_2(JNIEnv *env, - jobject jpaddle_predictor, - jobject jmobileconfig) { - MobileConfig config = jmobileconfig_to_cpp_mobileconfig(env, jmobileconfig); - std::shared_ptr<PaddlePredictor> predictor = - paddle::lite_api::CreatePaddlePredictor(config); - if (predictor == nullptr) { - return 0; - } - std::shared_ptr<PaddlePredictor> *predictor_pointer = - new std::shared_ptr<PaddlePredictor>(predictor); - return reinterpret_cast<jlong>(predictor_pointer); -} - -JNIEXPORT jboolean JNICALL -Java_com_baidu_paddle_lite_PaddlePredictor_deleteCppPaddlePredictor( - JNIEnv *env, jobject jpaddle_predictor, jlong java_pointer) { - if (java_pointer == 0) { - return JNI_FALSE; - } - std::shared_ptr<PaddlePredictor> *ptr = - reinterpret_cast<std::shared_ptr<PaddlePredictor> *>(java_pointer); - ptr->reset(); - delete ptr; - return JNI_TRUE; -} - -} // namespace lite_api -} // namespace paddle - -#ifdef __cplusplus -} -#endif diff --git a/lite/api/android/jni/native/paddle_lite_jni.h b/lite/api/android/jni/native/paddle_lite_jni.h deleted file mode 100644 index 913e9a4c3a..0000000000 --- a/lite/api/android/jni/native/paddle_lite_jni.h +++ /dev/null @@ -1,113 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License.
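All of the entry points in paddle_lite_jni.cc above share one ownership convention: the C++ smart pointer itself is heap-allocated, and its address travels to Java as a jlong, so the predictor stays alive until deleteCppPaddlePredictor frees the holder. Below is a minimal standalone sketch of that handle round-trip; PaddlePredictor here is a stand-in struct, not the real class, and the JNI environment is elided so the sketch compiles on its own:

#include <cstdint>
#include <memory>

struct PaddlePredictor { void Run() {} };  // stand-in for the real predictor

int main() {
  // "new": heap-allocate the shared_ptr itself; its address becomes the jlong
  // stored in the Java object's cppPaddlePredictorPointer field.
  auto *holder =
      new std::shared_ptr<PaddlePredictor>(std::make_shared<PaddlePredictor>());
  int64_t java_handle = reinterpret_cast<int64_t>(holder);

  // "run": recover the shared_ptr from the handle and use it.
  auto *ptr = reinterpret_cast<std::shared_ptr<PaddlePredictor> *>(java_handle);
  if (ptr && *ptr) (*ptr)->Run();

  // "delete": release the shared_ptr; the predictor dies with its last owner.
  ptr->reset();
  delete ptr;
  return 0;
}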
- -#pragma once -/* DO NOT EDIT THIS FILE - it is machine generated */ -#include -/* Header for class com_baidu_paddle_lite_PaddlePredictor */ -#include "lite/api/paddle_lite_factory_helper.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#ifndef LITE_ON_TINY_PUBLISH -#include "lite/api/paddle_use_passes.h" -#endif -#ifdef __cplusplus -extern "C" { -#endif - -namespace paddle { -namespace lite_api { - -/* - * Class: com_baidu_paddle_lite_PaddlePredictor - * Method: run - * Signature: ()Z - */ -JNIEXPORT jboolean JNICALL -Java_com_baidu_paddle_lite_PaddlePredictor_run(JNIEnv *, jobject); - -/* - * Class: com_baidu_paddle_lite_PaddlePredictor - * Method: saveOptimizedModel - * Signature: (Ljava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL -Java_com_baidu_paddle_lite_PaddlePredictor_saveOptimizedModel(JNIEnv *, - jobject, - jstring); - -/* - * Class: com_baidu_paddle_lite_PaddlePredictor - * Method: getInputCppTensorPointer - * Signature: (I)J - */ -JNIEXPORT jlong JNICALL -Java_com_baidu_paddle_lite_PaddlePredictor_getInputCppTensorPointer(JNIEnv *, - jobject, - jint); - -/* - * Class: com_baidu_paddle_lite_PaddlePredictor - * Method: getOutputCppTensorPointer - * Signature: (I)J - */ -JNIEXPORT jlong JNICALL -Java_com_baidu_paddle_lite_PaddlePredictor_getOutputCppTensorPointer(JNIEnv *, - jobject, - jint); - -/* - * Class: com_baidu_paddle_lite_PaddlePredictor - * Method: getCppTensorPointerByName - * Signature: (Ljava/lang/String;)J - */ -JNIEXPORT jlong JNICALL -Java_com_baidu_paddle_lite_PaddlePredictor_getCppTensorPointerByName(JNIEnv *, - jobject, - jstring); - -/* - * Class: com_baidu_paddle_lite_PaddlePredictor - * Method: newCppPaddlePredictor - * Signature: (Lcom/baidu/paddle/lite/CxxConfig;)J - */ -JNIEXPORT jlong JNICALL -Java_com_baidu_paddle_lite_PaddlePredictor_newCppPaddlePredictor__Lcom_baidu_\ -paddle_lite_CxxConfig_2(JNIEnv *, jobject, jobject); - -/* - * Class: com_baidu_paddle_lite_PaddlePredictor - * Method: newCppPaddlePredictor - * Signature: (Lcom/baidu/paddle/lite/MobileConfig;)J - */ -JNIEXPORT jlong JNICALL -Java_com_baidu_paddle_lite_PaddlePredictor_newCppPaddlePredictor__Lcom_baidu_\ -paddle_lite_MobileConfig_2(JNIEnv *, jobject, jobject); - -/* - * Class: com_baidu_paddle_lite_PaddlePredictor - * Method: deleteCppPaddlePredictor - * Signature: (J)Z - */ -JNIEXPORT jboolean JNICALL -Java_com_baidu_paddle_lite_PaddlePredictor_deleteCppPaddlePredictor(JNIEnv *, - jobject, - jlong); - -} // namespace lite_api -} // namespace paddle - -#ifdef __cplusplus -} -#endif diff --git a/lite/api/android/jni/native/tensor_jni.cc b/lite/api/android/jni/native/tensor_jni.cc deleted file mode 100644 index 59cafa1939..0000000000 --- a/lite/api/android/jni/native/tensor_jni.cc +++ /dev/null @@ -1,168 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "lite/api/android/jni/native/tensor_jni.h" - -#include -#include - -#include "lite/api/android/jni/native/convert_util_jni.h" - -#ifdef __cplusplus -extern "C" { -#endif - -namespace paddle { -namespace lite_api { - -inline static int64_t product(const std::vector &vec) { - if (vec.empty()) { - return 0; - } - int64_t result = 1; - for (int64_t d : vec) { - result *= d; - } - return result; -} - -inline static bool is_const_tensor(JNIEnv *env, jobject jtensor) { - jclass jclazz = env->GetObjectClass(jtensor); - jfieldID jfield = env->GetFieldID(jclazz, "readOnly", "Z"); - jboolean read_only = env->GetBooleanField(jtensor, jfield); - return static_cast(read_only); -} - -inline static std::unique_ptr *get_writable_tensor_pointer( - JNIEnv *env, jobject jtensor) { - jclass jclazz = env->GetObjectClass(jtensor); - jfieldID jfield = env->GetFieldID(jclazz, "cppTensorPointer", "J"); - jlong java_pointer = env->GetLongField(jtensor, jfield); - std::unique_ptr *ptr = - reinterpret_cast *>(java_pointer); - return ptr; -} - -inline static std::unique_ptr *get_read_only_tensor_pointer( - JNIEnv *env, jobject jtensor) { - jclass jclazz = env->GetObjectClass(jtensor); - jfieldID jfield = env->GetFieldID(jclazz, "cppTensorPointer", "J"); - jlong java_pointer = env->GetLongField(jtensor, jfield); - std::unique_ptr *ptr = - reinterpret_cast *>(java_pointer); - return ptr; -} - -JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_lite_Tensor_nativeResize( - JNIEnv *env, jobject jtensor, jlongArray dims) { - std::unique_ptr *tensor = get_writable_tensor_pointer(env, jtensor); - if (tensor == nullptr || (*tensor == nullptr)) { - return JNI_FALSE; - } - std::vector shape = jlongarray_to_int64_vector(env, dims); - (*tensor)->Resize(shape); - return JNI_TRUE; -} - -JNIEXPORT jlongArray JNICALL -Java_com_baidu_paddle_lite_Tensor_shape(JNIEnv *env, jobject jtensor) { - if (is_const_tensor(env, jtensor)) { - std::unique_ptr *tensor = - get_read_only_tensor_pointer(env, jtensor); - std::vector shape = (*tensor)->shape(); - return int64_vector_to_jlongarray(env, shape); - } else { - std::unique_ptr *tensor = get_writable_tensor_pointer(env, jtensor); - std::vector shape = (*tensor)->shape(); - return int64_vector_to_jlongarray(env, shape); - } -} - -JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_lite_Tensor_nativeSetData___3F( - JNIEnv *env, jobject jtensor, jfloatArray buf) { - std::unique_ptr *tensor = get_writable_tensor_pointer(env, jtensor); - if (tensor == nullptr || (*tensor == nullptr)) { - return JNI_FALSE; - } - int64_t buf_size = (int64_t)env->GetArrayLength(buf); - if (buf_size != product((*tensor)->shape())) { - return JNI_FALSE; - } - - float *input = (*tensor)->mutable_data(); - env->GetFloatArrayRegion(buf, 0, buf_size, input); - return JNI_TRUE; -} - -JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_lite_Tensor_nativeSetData___3B( - JNIEnv *env, jobject jtensor, jbyteArray buf) { - std::unique_ptr *tensor = get_writable_tensor_pointer(env, jtensor); - if (tensor == nullptr || (*tensor == nullptr)) { - return JNI_FALSE; - } - int64_t buf_size = (int64_t)env->GetArrayLength(buf); - if (buf_size != product((*tensor)->shape())) { - return JNI_FALSE; - } - - int8_t *input = (*tensor)->mutable_data(); - env->GetByteArrayRegion(buf, 0, buf_size, input); - return JNI_TRUE; -} - -JNIEXPORT jfloatArray JNICALL -Java_com_baidu_paddle_lite_Tensor_getFloatData(JNIEnv *env, jobject jtensor) { - if (is_const_tensor(env, jtensor)) { - std::unique_ptr *tensor = - get_read_only_tensor_pointer(env, 
jtensor); - return cpp_array_to_jfloatarray( - env, (*tensor)->data(), product((*tensor)->shape())); - } else { - std::unique_ptr *tensor = get_writable_tensor_pointer(env, jtensor); - return cpp_array_to_jfloatarray( - env, (*tensor)->data(), product((*tensor)->shape())); - } -} - -JNIEXPORT jbyteArray JNICALL -Java_com_baidu_paddle_lite_Tensor_getByteData(JNIEnv *env, jobject jtensor) { - if (is_const_tensor(env, jtensor)) { - std::unique_ptr *tensor = - get_read_only_tensor_pointer(env, jtensor); - return cpp_array_to_jbytearray( - env, (*tensor)->data(), product((*tensor)->shape())); - } else { - std::unique_ptr *tensor = get_writable_tensor_pointer(env, jtensor); - return cpp_array_to_jbytearray( - env, (*tensor)->data(), product((*tensor)->shape())); - } -} - -JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_lite_Tensor_deleteCppTensor( - JNIEnv *env, jobject jtensor, jlong java_pointer) { - if (java_pointer == 0) { - return JNI_FALSE; - } - std::unique_ptr *ptr = - reinterpret_cast *>(java_pointer); - ptr->reset(); - delete ptr; - return JNI_TRUE; -} - -} // namespace lite_api -} // namespace paddle - -#ifdef __cplusplus -} -#endif diff --git a/lite/api/android/jni/native/tensor_jni.h b/lite/api/android/jni/native/tensor_jni.h deleted file mode 100644 index 34c35b6a76..0000000000 --- a/lite/api/android/jni/native/tensor_jni.h +++ /dev/null @@ -1,90 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
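tensor_jni.cc above stores a single jlong per Java Tensor but reads it back in two ways, keyed off the Java readOnly field: writable handles (model inputs) are reinterpreted as std::unique_ptr<Tensor> *, read-only handles (outputs and named tensors) as std::unique_ptr<const Tensor> *. A self-contained sketch of that const-ness dispatch, with a stand-in Tensor type rather than the real one:

#include <cstdint>
#include <memory>

struct Tensor { float value = 0.f; };  // stand-in for paddle::lite_api::Tensor

// Reinterpret the same handle as const or non-const depending on the flag,
// mirroring get_read_only_tensor_pointer / get_writable_tensor_pointer.
float read_or_write(int64_t handle, bool read_only) {
  if (read_only) {
    auto *t = reinterpret_cast<std::unique_ptr<const Tensor> *>(handle);
    return (*t)->value;  // const access only
  }
  auto *t = reinterpret_cast<std::unique_ptr<Tensor> *>(handle);
  (*t)->value = 1.f;     // mutation is allowed on writable handles
  return (*t)->value;
}

int main() {
  auto *holder = new std::unique_ptr<Tensor>(new Tensor);
  float v = read_or_write(reinterpret_cast<int64_t>(holder),
                          /*read_only=*/false);
  holder->reset();  // same teardown order as deleteCppTensor
  delete holder;
  return v == 1.f ? 0 : 1;
}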
- -/* DO NOT EDIT THIS FILE - it is machine generated */ -#include -/* Header for class com_baidu_paddle_lite_Tensor */ - -#ifndef PADDLE_FLUID_LITE_API_ANDROID_JNI_NATIVE_TENSOR_JNI_H_ -#define PADDLE_FLUID_LITE_API_ANDROID_JNI_NATIVE_TENSOR_JNI_H_ -#ifdef __cplusplus -extern "C" { -#endif - -namespace paddle { -namespace lite_api { - -/* - * Class: com_baidu_paddle_lite_Tensor - * Method: shape - * Signature: ()[J - */ -JNIEXPORT jlongArray JNICALL Java_com_baidu_paddle_lite_Tensor_shape(JNIEnv *, - jobject); - -/* - * Class: com_baidu_paddle_lite_Tensor - * Method: getFloatData - * Signature: ()[F - */ -JNIEXPORT jfloatArray JNICALL -Java_com_baidu_paddle_lite_Tensor_getFloatData(JNIEnv *, jobject); - -/* - * Class: com_baidu_paddle_lite_Tensor - * Method: getByteData - * Signature: ()[B - */ -JNIEXPORT jbyteArray JNICALL -Java_com_baidu_paddle_lite_Tensor_getByteData(JNIEnv *, jobject); - -/* - * Class: com_baidu_paddle_lite_Tensor - * Method: nativeResize - * Signature: ([J)Z - */ -JNIEXPORT jboolean JNICALL -Java_com_baidu_paddle_lite_Tensor_nativeResize(JNIEnv *, jobject, jlongArray); - -/* - * Class: com_baidu_paddle_lite_Tensor - * Method: nativeSetData - * Signature: ([F)Z - */ -JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_lite_Tensor_nativeSetData___3F( - JNIEnv *, jobject, jfloatArray); - -/* - * Class: com_baidu_paddle_lite_Tensor - * Method: nativeSetData - * Signature: ([B)Z - */ -JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_lite_Tensor_nativeSetData___3B( - JNIEnv *, jobject, jbyteArray); - -/* - * Class: com_baidu_paddle_lite_Tensor - * Method: deleteCppTensor - * Signature: (J)Z - */ -JNIEXPORT jboolean JNICALL -Java_com_baidu_paddle_lite_Tensor_deleteCppTensor(JNIEnv *, jobject, jlong); - -} // namespace lite_api -} // namespace paddle - -#ifdef __cplusplus -} -#endif -#endif // PADDLE_FLUID_LITE_API_ANDROID_JNI_NATIVE_TENSOR_JNI_H_ diff --git a/lite/api/android/jni/src/com/baidu/paddle/lite/.gitignore b/lite/api/android/jni/src/com/baidu/paddle/lite/.gitignore deleted file mode 100644 index 870ec275e8..0000000000 --- a/lite/api/android/jni/src/com/baidu/paddle/lite/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -/PaddleLite.class -/PaddleLiteTest.class diff --git a/lite/api/android/jni/src/com/baidu/paddle/lite/ConfigBase.java b/lite/api/android/jni/src/com/baidu/paddle/lite/ConfigBase.java deleted file mode 100644 index 51115b3016..0000000000 --- a/lite/api/android/jni/src/com/baidu/paddle/lite/ConfigBase.java +++ /dev/null @@ -1,31 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -package com.baidu.paddle.lite; - -/** - * Base class for all configurations. 
- */ -public class ConfigBase { - - protected String modelDir; - - public String getModelDir() { - return modelDir; - } - - public void setModelDir(String modelDir) { - this.modelDir = modelDir; - } - -} diff --git a/lite/api/android/jni/src/com/baidu/paddle/lite/CxxConfig.java b/lite/api/android/jni/src/com/baidu/paddle/lite/CxxConfig.java deleted file mode 100644 index 906293c92f..0000000000 --- a/lite/api/android/jni/src/com/baidu/paddle/lite/CxxConfig.java +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -package com.baidu.paddle.lite; - -/** - * CxxConfig is the configuration for the Full feature predictor. - */ -public class CxxConfig extends ConfigBase { - - protected Place preferredPlace; - protected Place[] validPlaces; - - public Place getPreferredPlace() { - return preferredPlace; - } - - public void setPreferredPlace(Place preferredPlace) { - this.preferredPlace = preferredPlace; - } - - public Place[] getValidPlaces() { - return validPlaces; - } - - public void setValidPlaces(Place[] validPlaces) { - this.validPlaces = validPlaces; - } -} diff --git a/lite/api/android/jni/src/com/baidu/paddle/lite/MobileConfig.java b/lite/api/android/jni/src/com/baidu/paddle/lite/MobileConfig.java deleted file mode 100644 index 5c71db0c92..0000000000 --- a/lite/api/android/jni/src/com/baidu/paddle/lite/MobileConfig.java +++ /dev/null @@ -1,69 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -package com.baidu.paddle.lite; - -/** - * MobileConfig is the config for the light weight predictor, it will skip IR - * optimization or other unnecessary stages. - */ -public class MobileConfig extends ConfigBase { - - /** - * Set power mode. - * - * @return - */ - public void setPowerMode(PowerMode powerMode) { - this.powerMode = powerMode; - } - - /** - * Returns power mode. - * - * @return power mode - */ - public PowerMode getPowerMode() { - return powerMode; - } - - /** - * Set threads num. - * - * @return - */ - public void setThreads(int threads) { - this.threads = threads; - } - - /** - * Returns threads num. - * - * @return threads num - */ - public int getThreads() { - return threads; - } - - /** - * Returns power mode as enum int value. 
- * - * @return power mode as enum int value - */ - public int getPowerModeInt() { - return powerMode.value(); - } - - private PowerMode powerMode = PowerMode.LITE_POWER_HIGH; - private int threads = 1; -} diff --git a/lite/api/android/jni/src/com/baidu/paddle/lite/PaddleLiteInitializer.java b/lite/api/android/jni/src/com/baidu/paddle/lite/PaddleLiteInitializer.java deleted file mode 100644 index 876d7cebd4..0000000000 --- a/lite/api/android/jni/src/com/baidu/paddle/lite/PaddleLiteInitializer.java +++ /dev/null @@ -1,23 +0,0 @@ -package com.baidu.paddle.lite; - -/** - * Initializer for PaddleLite. The initialization methods are called by package - * classes only. Public users don't have to call them. Public users can get - * PaddleLite information constants such as JNI lib name in this class. - */ -public class PaddleLiteInitializer { - - /** name of C++ JNI lib */ - public final static String JNI_LIB_NAME = "paddle_lite_jni"; - - /** - * loads the C++ JNI lib. We only call it in our package, so it shouldn't be - * visible to public users. - * - * @return true if initialize successfully. - */ - protected static boolean init() { - System.loadLibrary(JNI_LIB_NAME); - return true; - } -} diff --git a/lite/api/android/jni/src/com/baidu/paddle/lite/PaddlePredictor.java b/lite/api/android/jni/src/com/baidu/paddle/lite/PaddlePredictor.java deleted file mode 100644 index d022fd7d61..0000000000 --- a/lite/api/android/jni/src/com/baidu/paddle/lite/PaddlePredictor.java +++ /dev/null @@ -1,192 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -package com.baidu.paddle.lite; - -/** Java Native Interface (JNI) class for Paddle Lite APIs */ -public class PaddlePredictor { - - /** - * Java doesn't have pointer. To maintain the life cycle of underneath C++ - * PaddlePredictor object, we use a long value to maintain it. - */ - private long cppPaddlePredictorPointer; - - /** - * Constructor of a PaddlePredictor. - * - * @param config the input configuration. - */ - public PaddlePredictor(ConfigBase config) { - init(config); - } - - /** - * Creates a PaddlePredictor object. - * - * @param config the input configuration. - * @return the PaddlePredictor object, or null if failed to create it. - */ - public static PaddlePredictor createPaddlePredictor(ConfigBase config) { - PaddlePredictor predictor = new PaddlePredictor(config); - return predictor.cppPaddlePredictorPointer == 0L ? null : predictor; - } - - /** - * Get offset-th input tensor. - * - * @param offset - * @return the tensor or null if failed to get it. - */ - public Tensor getInput(int offset) { - long cppTensorPointer = getInputCppTensorPointer(offset); - return cppTensorPointer == 0 ? null : new Tensor(cppTensorPointer, /* readOnly = */ false, this); - } - - /** - * Get offset-th output tensor. - * - * @param offset - * @return the tensor or null if failed to get it. 
- */ - public Tensor getOutput(int offset) { - long cppTensorPointer = getOutputCppTensorPointer(offset); - return cppTensorPointer == 0 ? null : new Tensor(cppTensorPointer, /* readOnly = */ true, this); - } - - /** - * Get a tensor by name. - * - * @param name the name of the tensor. - * @return the tensor or null if failed to get it. - */ - public Tensor getTensor(String name) { - long cppTensorPointer = getCppTensorPointerByName(name); - return cppTensorPointer == 0 ? null : new Tensor(cppTensorPointer, /* readOnly = */ true, this); - } - - /** - * Run the PaddlePredictor. - * - * @return true if run successfully. - */ - public native boolean run(); - - /** - * Saves the optimized model. It is available only for {@link CxxConfig} - * - * @param modelDir the path to save the optimized model - * @return true if save successfully. Otherwise returns false. - */ - public native boolean saveOptimizedModel(String modelDir); - - /** - * Deletes C++ PaddlePredictor pointer when Java PaddlePredictor object is - * destroyed - */ - @Override - protected void finalize() throws Throwable { - clear(); - super.finalize(); - } - - /** - * Create a C++ PaddlePredictor object based on configuration - * - * @param config the input configuration - * @return true if create successfully - */ - protected boolean init(ConfigBase config) { - if (config instanceof CxxConfig) { - cppPaddlePredictorPointer = newCppPaddlePredictor((CxxConfig) config); - } else if (config instanceof MobileConfig) { - cppPaddlePredictorPointer = newCppPaddlePredictor((MobileConfig) config); - } else { - throw new IllegalArgumentException("Not supported PaddleLite Config type"); - } - return cppPaddlePredictorPointer != 0L; - } - - /** - * Deletes C++ PaddlePredictor pointer - * - * @return true if deletion success - */ - protected boolean clear() { - boolean result = false; - if (cppPaddlePredictorPointer != 0L) { - result = deleteCppPaddlePredictor(cppPaddlePredictorPointer); - cppPaddlePredictorPointer = 0L; - } - return result; - } - - /** - * Gets offset-th input tensor pointer at C++ side. - * - * @param offset - * @return a long value which is reinterpret_cast of the C++ pointer. - */ - private native long getInputCppTensorPointer(int offset); - - /** - * Gets offset-th output tensor pointer at C++ side. - * - * @param offset - * @return a long value which is reinterpret_cast of the C++ pointer. - */ - private native long getOutputCppTensorPointer(int offset); - - /** - * Gets tensor pointer at C++ side by name. - * - * @param name the name of the tensor. - * @return a long value which is reinterpret_cast of the C++ pointer. - */ - private native long getCppTensorPointerByName(String name); - - /** - * Creates a new C++ PaddlePredcitor object using CxxConfig, returns the - * reinterpret_cast value of the C++ pointer which points to C++ - * PaddlePredictor. - * - * @param config - * @return a long value which is reinterpret_cast of the C++ pointer. - */ - private native long newCppPaddlePredictor(CxxConfig config); - - /** - * Creates a new C++ PaddlePredcitor object using Mobile, returns the - * reinterpret_cast value of the C++ pointer which points to C++ - * PaddlePredictor. - * - * @param config - * @return a long value which is reinterpret_cast of the C++ pointer. - */ - private native long newCppPaddlePredictor(MobileConfig config); - - /** - * Delete C++ PaddlePredictor object pointed by the input pointer, which is - * presented by a long value. 
- * - * @param nativePointer a long value which is reinterpret_cast of the C++ - * pointer. - * @return true if deletion success. - */ - private native boolean deleteCppPaddlePredictor(long nativePointer); - - /* Initializes at the beginning */ - static { - PaddleLiteInitializer.init(); - } -} diff --git a/lite/api/android/jni/src/com/baidu/paddle/lite/Place.java b/lite/api/android/jni/src/com/baidu/paddle/lite/Place.java deleted file mode 100644 index 98777f3111..0000000000 --- a/lite/api/android/jni/src/com/baidu/paddle/lite/Place.java +++ /dev/null @@ -1,148 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -package com.baidu.paddle.lite; - -/** - * Place specifies the execution context of a Kernel or input/output for a - * kernel. It is used to make the analysis of the MIR more clear and accurate. - */ -public class Place { - - /** Place hardware target type. */ - public enum TargetType { - UNKNOWN(0), HOST(1), X86(2), CUDA(3), ARM(4), OPEN_CL(5), FPGA(7), NPU(8), ANY(6); - - public final int value; - - private TargetType(int value) { - this.value = value; - } - } - - /** Place precision type */ - public enum PrecisionType { - UNKNOWN(0), FLOAT(1), INT8(2), FP16(5), INT32(3), ANY(4), BOOL(6); - - public final int value; - - private PrecisionType(int value) { - this.value = value; - } - } - - /** Place data layout type */ - public enum DataLayoutType { - UNKNOWN(0), NCHW(1), NHWC(3), ANY(2); - - public final int value; - - private DataLayoutType(int value) { - this.value = value; - } - } - - private TargetType target; - private PrecisionType precision; - private DataLayoutType layout; - private int device; - - public Place() { - target = TargetType.UNKNOWN; - precision = PrecisionType.UNKNOWN; - layout = DataLayoutType.UNKNOWN; - device = 0; - } - - public Place(TargetType target) { - this(target, PrecisionType.FLOAT); - } - - public Place(TargetType target, PrecisionType precision) { - this(target, precision, DataLayoutType.NCHW); - } - - public Place(TargetType target, PrecisionType precision, DataLayoutType layout) { - this(target, precision, layout, 0); - } - - public Place(TargetType target, PrecisionType precision, DataLayoutType layout, int device) { - this.target = target; - this.precision = precision; - this.layout = layout; - this.device = device; - } - - public boolean isValid() { - return target != TargetType.UNKNOWN && precision != PrecisionType.UNKNOWN && layout != DataLayoutType.UNKNOWN; - } - - public TargetType getTarget() { - return target; - } - - public void setTarget(TargetType target) { - this.target = target; - } - - public PrecisionType getPrecision() { - return precision; - } - - public void setPrecision(PrecisionType precision) { - this.precision = precision; - } - - public DataLayoutType getLayout() { - return layout; - } - - public void setLayout(DataLayoutType layout) { - this.layout = layout; - } - - public int getDevice() { - return device; - } - - public void setDevice(int device) { - this.device = 
device; - } - - /** - * Returns hardware target as enum int value. - * - * @return hardware target as enum int value - */ - public int getTargetInt() { - return target.value; - } - - /** - * Returns precision target as enum int value. - * - * @return precision as enum int value - */ - public int getPrecisionInt() { - return precision.value; - } - - /** - * Returns data layout as enum int value. - * - * @return data layout as enum int value - */ - public int getDataLayoutInt() { - return layout.value; - } -} diff --git a/lite/api/android/jni/src/com/baidu/paddle/lite/PowerMode.java b/lite/api/android/jni/src/com/baidu/paddle/lite/PowerMode.java deleted file mode 100644 index 36bd568406..0000000000 --- a/lite/api/android/jni/src/com/baidu/paddle/lite/PowerMode.java +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -package com.baidu.paddle.lite; - -/** - * PowerMode is the cpu running power mode for the light weight predictor. - */ -public enum PowerMode { - LITE_POWER_HIGH(0), - LITE_POWER_LOW(1), - LITE_POWER_FULL(2), - LITE_POWER_NO_BIND(3), - LITE_POWER_RAND_HIGH(4), - LITE_POWER_RAND_LOW(5); - - private PowerMode(int value) { - this.value = value; - } - - public int value() { - return this.value; - } - - private final int value; -} diff --git a/lite/api/android/jni/src/com/baidu/paddle/lite/Tensor.java b/lite/api/android/jni/src/com/baidu/paddle/lite/Tensor.java deleted file mode 100644 index ac78800bd2..0000000000 --- a/lite/api/android/jni/src/com/baidu/paddle/lite/Tensor.java +++ /dev/null @@ -1,141 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -package com.baidu.paddle.lite; - -/** - * Tensor class provides the Java APIs that users can get or set the shape or - * the data of a Tensor. - */ -public class Tensor { - - /** - * Java doesn't have pointer. To maintain the life cycle of underneath C++ - * PaddlePredictor object, we use a long value to maintain it. - */ - private long cppTensorPointer; - - /** - * Is this tensor read-only. This field is also used at C++ side to know whether - * we should interpret the C++ tensor pointer to "Tensor" pointer or "const - * Tensor" pointer. - */ - private boolean readOnly; - - /** - * Due to different memory management of Java and C++, at C++, if a user - * destroys PaddlePredictor object, the tensor's memory will be released and a - * pointer operating on the released tensor will cause unknown behavior. 
At C++ - * side, that's users' responsibility to manage memory well. But for our Java - * code, we have to prevent this case. We make this {@link Tensor} keep a - * reference to {@link PaddlePredictor} to prevent the {@link PaddlePredictor} - * object be collected by JVM before {@Tensor}. - */ - private PaddlePredictor predictor; - - /** - * Accessed by package only to prevent public users to create it wrongly. A - * Tensor can be created by {@link com.baidu.paddle.lite.PaddlePredictor} only - */ - protected Tensor(long cppTensorPointer, boolean readOnly, PaddlePredictor predictor) { - this.cppTensorPointer = cppTensorPointer; - this.readOnly = readOnly; - this.predictor = predictor; - } - - /** Deletes C++ Tensor pointer when Java Tensor object is destroyed */ - protected void finalize() throws Throwable { - if (cppTensorPointer != 0L) { - deleteCppTensor(cppTensorPointer); - cppTensorPointer = 0L; - } - super.finalize(); - } - - /** - * @return whether this Tensor is read-only. - */ - public boolean isReadOnly() { - return readOnly; - } - - /** - * Resizes the tensor shape. - * - * @param dims long array of shape. - * @return true if resize successfully. - */ - public boolean resize(long[] dims) { - if (readOnly) { - return false; - } - return nativeResize(dims); - } - - /** - * Set the tensor float data. - * - * @param buf the float array buffer which will be copied into tensor. - * @return true if set data successfully. - */ - public boolean setData(float[] buf) { - if (readOnly) { - return false; - } - return nativeSetData(buf); - } - - /** - * Set the tensor byte data. - * - * @param buf the byte array buffer which will be copied into tensor. - * @return true if set data successfully. - */ - public boolean setData(byte[] buf) { - if (readOnly) { - return false; - } - return nativeSetData(buf); - } - - /** - * @return shape of the tensor as long array. - */ - public native long[] shape(); - - /** - * @return the tensor data as float array. - */ - public native float[] getFloatData(); - - /** - * @return the tensor data as byte array. - */ - public native byte[] getByteData(); - - private native boolean nativeResize(long[] dims); - - private native boolean nativeSetData(float[] buf); - - private native boolean nativeSetData(byte[] buf); - - /** - * Delete C++ Tenor object pointed by the input pointer, which is presented by a - * long value. - * - * @param nativePointer a long value which is reinterpret_cast of the C++ - * pointer. - * @return true if deletion success. - */ - private native boolean deleteCppTensor(long nativePointer); -} \ No newline at end of file diff --git a/lite/api/android/jni/test/com/baidu/paddle/lite/PaddlePredictorTest.java b/lite/api/android/jni/test/com/baidu/paddle/lite/PaddlePredictorTest.java deleted file mode 100644 index 0af11efd28..0000000000 --- a/lite/api/android/jni/test/com/baidu/paddle/lite/PaddlePredictorTest.java +++ /dev/null @@ -1,54 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -package com.baidu.paddle.lite; - -import org.junit.jupiter.api.Test; - -import static org.junit.Assert.assertEquals; - -/** - * Deprecated test. Now we use Android demo's Instrument test. - * - * @TODO make this test as Java Unit test. Then we don't have to launch Android - * demo to test. - */ -class PaddlePredictorTest { - - @Test - public void run_defaultModel() { - MobileConfig config = new MobileConfig(); - config.setModelDir(""); - PaddlePredictor predictor = PaddlePredictor.createPaddlePredictor(config); - - float[] inputBuffer = new float[10000]; - for (int i = 0; i < 10000; ++i) { - inputBuffer[i] = i; - } - long[] dims = { 100, 100 }; - - Tensor input = predictor.getInput(0); - input.resize(dims); - input.setData(inputBuffer); - - predictor.run(); - - Tensor output = predictor.getOutput(0); - float[] outputBuffer = output.getFloatData(); - - assertEquals(outputBuffer.length, 50000); - assertEquals(outputBuffer[0], 50.2132f, 1e-3f); - assertEquals(outputBuffer[1], -28.8729f, 1e-3f); - } - -} diff --git a/lite/api/apis_test.cc b/lite/api/apis_test.cc deleted file mode 100644 index 3dc0224084..0000000000 --- a/lite/api/apis_test.cc +++ /dev/null @@ -1,118 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -/* - * We test multiple apis here. - */ -#include -#include -#include -#include -#include "lite/api/cxx_api.h" -#include "lite/api/light_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/core/mir/pass_registry.h" - -DEFINE_string(model_dir, "", ""); -DEFINE_string(optimized_model, "", ""); - -namespace paddle { -namespace lite { - -void SetConstInput(lite::Tensor* x) { - x->Resize(DDim(std::vector({100, 100}))); - auto* data = x->mutable_data(); - for (int i = 0; i < 100 * 100; i++) { - data[i] = i; - } -} - -bool CompareTensors(const std::string& name, - const Predictor& cxx_api, - const LightPredictor& light_api) { - const auto* a = cxx_api.GetTensor(name); - const auto* b = light_api.GetTensor(name); - return TensorCompareWith(*a, *b); -} - -TEST(CXXApi_LightApi, optim_model) { - lite::Predictor cxx_api; - std::vector valid_places({ - Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kX86), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}, // Both works on X86 and ARM - }); - // On ARM devices, the preferred X86 target not works, but it can still - // select ARM kernels. 
- cxx_api.Build(FLAGS_model_dir, - "", - "", - Place{TARGET(kX86), PRECISION(kFloat)}, - valid_places); - cxx_api.SaveModel(FLAGS_optimized_model); -} - -TEST(CXXApi_LightApi, save_and_load_model) { - lite::Predictor cxx_api; - lite::LightPredictor light_api(FLAGS_optimized_model); - - // CXXAPi - { - std::vector valid_places({ - Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kX86), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}, // Both works on X86 and ARM - }); - // On ARM devices, the preferred X86 target not works, but it can still - // select ARM kernels. - cxx_api.Build(FLAGS_model_dir, - "", - "", - Place{TARGET(kX86), PRECISION(kFloat)}, - valid_places); - - auto* x = cxx_api.GetInput(0); - SetConstInput(x); - - cxx_api.Run(); - - LOG(INFO) << "Save optimized model to " << FLAGS_optimized_model; - cxx_api.SaveModel(FLAGS_optimized_model); - } - - // LightApi - { - auto* x = light_api.GetInput(0); - SetConstInput(x); - - light_api.Run(); - } - - const auto* cxx_out = cxx_api.GetOutput(0); - const auto* light_out = light_api.GetOutput(0); - ASSERT_TRUE(TensorCompareWith(*cxx_out, *light_out)); - - std::vector tensors_with_order({ - "a", "fc_0.w_0", "scale_0.tmp_0", - }); - - for (const auto& tensor_name : tensors_with_order) { - ASSERT_TRUE(CompareTensors(tensor_name, cxx_api, light_api)); - } -} - -} // namespace lite -} // namespace paddle diff --git a/lite/api/benchmark.cc b/lite/api/benchmark.cc deleted file mode 100644 index ca7bfe7fe6..0000000000 --- a/lite/api/benchmark.cc +++ /dev/null @@ -1,190 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
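apis_test.cc above hinges on TensorCompareWith: the full CXX predictor and the light predictor must produce element-for-element equal outputs for the same constant input. TensorCompareWith is the library's own helper; the standalone AllClose below is only an illustrative stand-in for that idea, with an assumed absolute tolerance (the sample values echo the expectations in PaddlePredictorTest above):

#include <cmath>
#include <cstddef>
#include <vector>

// Illustrative stand-in for TensorCompareWith: two runs agree when every
// element matches within a small absolute tolerance.
static bool AllClose(const std::vector<float> &a, const std::vector<float> &b,
                     float atol = 1e-5f) {
  if (a.size() != b.size()) return false;
  for (std::size_t i = 0; i < a.size(); ++i) {
    if (std::fabs(a[i] - b[i]) > atol) return false;
  }
  return true;
}

int main() {
  std::vector<float> cxx_out{50.2132f, -28.8729f};
  std::vector<float> light_out{50.2132f, -28.8729f};
  return AllClose(cxx_out, light_out) ? 0 : 1;
}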
- -#include <gflags/gflags.h> -#include <cstdio> -#include <cstdlib> -#include <string> -#include <vector> -#include "lite/api/paddle_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/api/test_helper.h" -#include "lite/core/device_info.h" -#include "lite/utils/cp_logging.h" -#include "lite/utils/string.h" - -DEFINE_string(input_shape, - "1,3,224,224", - "input shapes, separated by colon and comma"); -DEFINE_string(result_filename, "", "save test result"); -DEFINE_bool(run_model_optimize, - false, - "apply model_optimize_tool to model, use optimized model to test"); - -namespace paddle { -namespace lite_api { - -void OutputOptModel(const std::string& load_model_dir, - const std::string& save_optimized_model_dir, - const std::vector<std::vector<int64_t>>& input_shapes) { - lite_api::CxxConfig config; - config.set_model_dir(load_model_dir); - config.set_preferred_place(Place{TARGET(kX86), PRECISION(kFloat)}); - config.set_valid_places({ - Place{TARGET(kX86), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}, - }); - auto predictor = lite_api::CreatePaddlePredictor(config); - - int ret = system( - paddle::lite::string_format("rm -rf %s", save_optimized_model_dir.c_str()) - .c_str()); - if (ret == 0) { - LOG(INFO) << "delete old optimized model " << save_optimized_model_dir; - } - predictor->SaveOptimizedModel(save_optimized_model_dir, - LiteModelType::kNaiveBuffer); - LOG(INFO) << "Load model from " << load_model_dir; - LOG(INFO) << "Save optimized model to " << save_optimized_model_dir; -} - -#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK -void Run(const std::vector<std::vector<int64_t>>& input_shapes, - const std::string& model_dir, - const int repeat, - const int thread_num, - const int warmup_times, - const std::string model_name) { - lite_api::MobileConfig config; - config.set_threads(thread_num); - if (thread_num == 1) { - config.set_power_mode(LITE_POWER_HIGH); - } else { - config.set_power_mode(LITE_POWER_NO_BIND); - } - config.set_model_dir(model_dir); - - auto predictor = lite_api::CreatePaddlePredictor(config); - - for (int j = 0; j < input_shapes.size(); ++j) { - auto input_tensor = predictor->GetInput(j); - input_tensor->Resize(input_shapes[j]); - auto input_data = input_tensor->mutable_data<float>(); - int input_num = 1; - for (int i = 0; i < input_shapes[j].size(); ++i) { - input_num *= input_shapes[j][i]; - } - for (int i = 0; i < input_num; ++i) { - input_data[i] = 1.f; - } - } - - for (int i = 0; i < warmup_times; ++i) { - predictor->Run(); - } - - auto start = lite::GetCurrentUS(); - for (int i = 0; i < repeat; ++i) { - predictor->Run(); - } - auto end = lite::GetCurrentUS(); - - std::FILE* pf = std::fopen(FLAGS_result_filename.c_str(), "a"); - if (nullptr == pf) { - LOG(INFO) << "create result file error"; - exit(0); - } - fprintf(pf, - "-- %-18s avg = %5.4f ms\n", - model_name.c_str(), - (end - start) / repeat / 1000.0); - std::fclose(pf); -} -#endif - -} // namespace lite_api -} // namespace paddle - -int main(int argc, char** argv) { - gflags::ParseCommandLineFlags(&argc, &argv, true); - if (FLAGS_model_dir == "" || FLAGS_result_filename == "") { - LOG(INFO) << "usage: " - << "--model_dir /path/to/your/model --result_filename " - "/path/to/resultfile"; - exit(0); - } - - std::size_t found = FLAGS_model_dir.find_last_of("/"); - std::string model_name = FLAGS_model_dir.substr(found + 1); - std::string save_optimized_model_dir = FLAGS_model_dir + "opt2"; - - auto split_string = - [](const std::string& str_in) -> std::vector<std::string> { - std::vector<std::string> str_out; - std::string tmp_str = str_in; - while (!tmp_str.empty()) { - size_t next_offset = tmp_str.find(":"); - str_out.push_back(tmp_str.substr(0, next_offset)); - if (next_offset == std::string::npos) { - break; - } else { - tmp_str = tmp_str.substr(next_offset + 1); - } - } - return str_out; - }; - - auto get_shape = [](const std::string& str_shape) -> std::vector<int64_t> { - std::vector<int64_t> shape; - std::string tmp_str = str_shape; - while (!tmp_str.empty()) { - int dim = atoi(tmp_str.data()); - shape.push_back(dim); - size_t next_offset = tmp_str.find(","); - if (next_offset == std::string::npos) { - break; - } else { - tmp_str = tmp_str.substr(next_offset + 1); - } - } - return shape; - }; - - std::vector<std::string> str_input_shapes = split_string(FLAGS_input_shape); - std::vector<std::vector<int64_t>> input_shapes; - for (int i = 0; i < str_input_shapes.size(); ++i) { - input_shapes.push_back(get_shape(str_input_shapes[i])); - } - - // Output optimized model - if (FLAGS_run_model_optimize) { - paddle::lite_api::OutputOptModel( - FLAGS_model_dir, save_optimized_model_dir, input_shapes); - } - -#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK - // Run inference using optimized model - std::string run_model_dir = - FLAGS_run_model_optimize ? save_optimized_model_dir : FLAGS_model_dir; - paddle::lite_api::Run(input_shapes, - run_model_dir, - FLAGS_repeats, - FLAGS_threads, - FLAGS_warmup, - model_name); -#endif - return 0; -} diff --git a/lite/api/cxx_api.cc b/lite/api/cxx_api.cc deleted file mode 100644 index eeba686301..0000000000 --- a/lite/api/cxx_api.cc +++ /dev/null @@ -1,177 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License.
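The --input_shape flag consumed by benchmark.cc above packs several input shapes into one string: ':' separates the shapes of different inputs, ',' separates the dims within one shape, which is exactly what the split_string and get_shape lambdas in main implement. The hypothetical ParseShapes helper below condenses the same two-level parse into one self-contained function; the name and structure are illustrative, not part of the tool:

#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <string>
#include <vector>

// Hypothetical condensed form of benchmark.cc's split_string + get_shape:
// ':' separates inputs, ',' separates dims, e.g. "1,3,224,224:1,128".
static std::vector<std::vector<int64_t>> ParseShapes(std::string flag) {
  std::vector<std::vector<int64_t>> shapes;
  while (!flag.empty()) {
    size_t colon = flag.find(':');
    std::string one = flag.substr(0, colon);
    std::vector<int64_t> shape;
    while (!one.empty()) {
      shape.push_back(std::atoll(one.c_str()));  // reads the leading digits
      size_t comma = one.find(',');
      if (comma == std::string::npos) break;
      one = one.substr(comma + 1);
    }
    shapes.push_back(shape);
    if (colon == std::string::npos) break;
    flag = flag.substr(colon + 1);
  }
  return shapes;
}

int main() {
  auto shapes = ParseShapes("1,3,224,224:1,128");
  std::cout << shapes.size() << " inputs; first input rank "
            << shapes[0].size() << "\n";  // "2 inputs; first input rank 4"
}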
- -#include "lite/api/cxx_api.h" -#include -#include -#include -#include -#include "lite/utils/io.h" -#ifdef LITE_WITH_NPU -#include "lite/backends/npu/npu_helper.h" -#endif - -namespace paddle { -namespace lite { - -void Predictor::SaveModel(const std::string &dir, - lite_api::LiteModelType model_type) { - if (!program_) { - GenRuntimeProgram(); - } - program_->SaveOpInfosToProgram(&program_desc_); - program_->UpdateVarsOfProgram(&program_desc_); - switch (model_type) { - case lite_api::LiteModelType::kProtobuf: - SaveModelPb(dir, *program_->exec_scope(), program_desc_, true); - break; - case lite_api::LiteModelType::kNaiveBuffer: - SaveModelNaive(dir, *program_->exec_scope(), program_desc_); - break; - default: - LOG(FATAL) << "Unknown model type"; - } -#ifdef LITE_WITH_NPU - for (auto name : npu::DeviceInfo::Global().AllClientNames()) { - // the npu offline model is saved in current dir - // so just copy to dst dir - CHECK_EQ( - system(string_format("cp -r %s %s", name.c_str(), dir.c_str()).c_str()), - 0) - << "Failed copy NPU model to " << dir; - } -#endif -} - -lite::Tensor *Predictor::GetInput(size_t offset) { - auto *_feed_list = exec_scope_->FindVar("feed"); - CHECK(_feed_list) << "no feed variable in exec_scope"; - auto *feed_list = _feed_list->GetMutable>(); - if (offset >= feed_list->size()) { - feed_list->resize(offset + 1); - } - return &feed_list->at(offset); -} - -const lite::Tensor *Predictor::GetOutput(size_t offset) const { - auto *_fetch_list = exec_scope_->FindVar("fetch"); - CHECK(_fetch_list) << "no fatch variable in exec_scope"; - auto &fetch_list = *_fetch_list->GetMutable>(); - CHECK_LT(offset, fetch_list.size()) << "offset " << offset << " overflow"; - return &fetch_list.at(offset); -} - -const std::vector *Predictor::GetOutputs() const { - auto *_fetch_list = exec_scope_->FindVar("fetch"); - CHECK(_fetch_list) << "no fatch variable in exec_scope"; - auto &fetch_list = *_fetch_list->GetMutable>(); - return &fetch_list; -} - -const cpp::ProgramDesc &Predictor::program_desc() const { - return program_desc_; -} -const RuntimeProgram &Predictor::runtime_program() const { return *program_; } - -void Predictor::Build(const lite_api::CxxConfig &config, - const std::vector &valid_places, - const std::vector &passes, - lite_api::LiteModelType model_type) { - const std::string &model_path = config.model_dir(); - const std::string &model_file = config.model_file(); - const std::string ¶m_file = config.param_file(); - const Place prefer_place = config.preferred_place(); - const bool model_from_memory = config.model_from_memory(); - LOG(INFO) << "load from memory " << model_from_memory; - - Build(model_path, - model_file, - param_file, - prefer_place, - valid_places, - passes, - model_type, - model_from_memory); -} -void Predictor::Build(const std::string &model_path, - const std::string &model_file, - const std::string ¶m_file, - const Place &prefer_place, - const std::vector &valid_places, - const std::vector &passes, - lite_api::LiteModelType model_type, - bool model_from_memory) { - switch (model_type) { - case lite_api::LiteModelType::kProtobuf: { - bool combined_param = false; - if (!model_file.empty() && !param_file.empty()) { - combined_param = true; - } - LoadModelPb(model_path, - model_file, - param_file, - scope_.get(), - &program_desc_, - combined_param, - model_from_memory); - } break; - case lite_api::LiteModelType::kNaiveBuffer: - CHECK(!model_path.empty()) - << "NaiveBuffer backend only supported combined param"; - LoadModelNaive(model_path, scope_.get(), 
&program_desc_); - break; - default: - LOG(FATAL) << "Unknown model type"; - } - Build(program_desc_, prefer_place, valid_places, passes); -} - -void Predictor::Build(const cpp::ProgramDesc &desc, - const Place &prefer_place, - const std::vector &valid_places, - const std::vector &passes) { - program_desc_ = desc; - Program program(desc, scope_, valid_places); - optimizer_.KernelPickPreferPlace(prefer_place); - core::KernelPickFactor factor; - factor.ConsiderTarget(); - factor.ConsiderPrecision(); - optimizer_.Run(std::move(program), valid_places, factor, passes); - exec_scope_ = optimizer_.exec_scope(); -} - -void Predictor::GenRuntimeProgram() { - program_ = optimizer_.GenRuntimeProgram(); - CHECK_EQ(exec_scope_, program_->exec_scope()); - program_generated_ = true; -} - -const lite::Tensor *Predictor::GetTensor(const std::string &name) const { - auto *var = exec_scope_->FindVar(name); - return &var->Get(); -} - -#ifdef LITE_WITH_TRAIN -void Predictor::FeedVars(const std::vector &tensors) { - auto var = scope_->FindVar("feed"); - auto &feed_list = *(var->GetMutable>()); - feed_list.resize(tensors.size()); - - for (size_t i = 0; i < tensors.size(); ++i) - feed_list[i].ShareDataWith(tensors[i]); -} -#endif - -} // namespace lite -} // namespace paddle diff --git a/lite/api/cxx_api.h b/lite/api/cxx_api.h deleted file mode 100644 index 2506ae47b0..0000000000 --- a/lite/api/cxx_api.h +++ /dev/null @@ -1,173 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include -#include -#include "lite/api/paddle_api.h" -#include "lite/core/op_lite.h" -#include "lite/core/optimizer.h" -#include "lite/core/program.h" -#include "lite/core/types.h" -#include "lite/model_parser/model_parser.h" - -namespace paddle { -namespace lite { - -/* - * Predictor for inference, input a model, it will optimize and execute it. - */ -class LITE_API Predictor { - public: - // Create an empty predictor. - Predictor() { scope_ = std::make_shared(); } - // Create a predictor with the weight variable scope set. - explicit Predictor(const std::shared_ptr& root_scope) - : scope_(root_scope) {} - - // Build from a model, with places set for hardware config. - void Build( - const lite_api::CxxConfig& config, - const std::vector& valid_places, - const std::vector& passes = {}, - lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf); - - void Build( - const std::string& model_path, - const std::string& model_file_path, - const std::string& param_file_path, - const Place& prefer_place, - const std::vector& valid_places, - const std::vector& passes = {}, - lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf, - bool memory_from_memory = false); - - void Build(const cpp::ProgramDesc& desc, - const Place& prefer_place, - const std::vector& valid_places, - const std::vector& passes = {}); - - void GenRuntimeProgram(); - - // Run the predictor for a single batch of data. 
- void Run() { - if (!program_generated_) { - GenRuntimeProgram(); - } - program_->Run(); - LOG(INFO) << "running"; - } - - // Get offset-th col of feed inputs. - lite::Tensor* GetInput(size_t offset); - - // Get offset-th col of fetch results. - const lite::Tensor* GetOutput(size_t offset) const; - const std::vector* GetOutputs() const; - - const cpp::ProgramDesc& program_desc() const; - const lite::Tensor* GetTensor(const std::string& name) const; - const RuntimeProgram& runtime_program() const; - - // This method is disabled in mobile, for unnecessary dependencies required. - void SaveModel( - const std::string& dir, - lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf); - -#ifdef LITE_WITH_TRAIN - void Run(const std::vector& tensors) { - FeedVars(tensors); - program_->Run(); - } - - void FeedVars(const std::vector& tensors); -#endif - - private: - Optimizer optimizer_; - cpp::ProgramDesc program_desc_; - std::shared_ptr scope_; - const Scope* exec_scope_; - std::unique_ptr program_; - bool program_generated_{false}; -}; - -/* - * An executor for training. - * - * Usage: - * - * CXXTrainer trainer(...); - * trainer.RunStartupProgram(...); - * auto exe = BuildMainProgramExecutor(...); - * - * for (auto& epoch : epoches) { - * auto* tensor0 = exe.GetInput(...); - * // fill data for tensor0 - * exe.Run(); - * } -#ifdef LITE_WITH_X86 -class LITE_API CXXTrainer { - public: - CXXTrainer(const std::shared_ptr& root_scope, - const Place& preferred_place, - const std::vector& valid_places) - : scope_(root_scope), - preferred_place_(preferred_place), - valid_places_(valid_places), - main_program_executor_(Predictor(scope_)) {} - - // Build the RuntimeProgram cache for the main program. The cache will run - // multiple times for the epoches. - // NOTE Just support to execute the 0-th block currently. - Predictor& BuildMainProgramExecutor(const framework::proto::ProgramDesc& desc, - int block_id = 0) { - main_program_executor_.Build(desc, preferred_place_, valid_places_); - return main_program_executor_; - } - -#ifdef LITE_WITH_TRAIN - Predictor& BuildMainProgramExecutor(framework::ProgramDesc& desc) { // NOLINT - return BuildMainProgramExecutor(*desc.Proto()); - } - - void RunStartupProgram(framework::ProgramDesc& desc) { // NOLINT - RunStartupProgram(*desc.Proto()); - } -#endif - - // Run the startup program. It just executes once, no cache needed. - void RunStartupProgram(const framework::proto::ProgramDesc& desc, - int block_id = 0) { - Predictor exe(scope_); - exe.Build(desc, preferred_place_, valid_places_); - exe.Run(); - } - - private: - std::shared_ptr scope_; - - Place preferred_place_; - std::vector valid_places_; - - // The training program. - Predictor main_program_executor_; -}; -#endif -*/ - -} // namespace lite -} // namespace paddle diff --git a/lite/api/cxx_api_bin.cc b/lite/api/cxx_api_bin.cc deleted file mode 100644 index 000e94307c..0000000000 --- a/lite/api/cxx_api_bin.cc +++ /dev/null @@ -1,129 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
diff --git a/lite/api/cxx_api_impl.cc b/lite/api/cxx_api_impl.cc
deleted file mode 100644
index b8c92a8f96..0000000000
--- a/lite/api/cxx_api_impl.cc
+++ /dev/null
@@ -1,90 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/api/cxx_api.h"
-#include "lite/api/paddle_api.h"
-
-namespace paddle {
-namespace lite {
-
-class CxxPaddleApiImpl : public lite_api::PaddlePredictor {
- public:
-  CxxPaddleApiImpl();
-
-  /// Create a new predictor from a config.
-  void Init(const lite_api::CxxConfig &config);
-
-  std::unique_ptr<lite_api::Tensor> GetInput(int i) override;
-
-  std::unique_ptr<const lite_api::Tensor> GetOutput(int i) const override;
-
-  void Run() override;
-
-  std::unique_ptr<const lite_api::Tensor> GetTensor(
-      const std::string &name) const override;
-
-  void SaveOptimizedModel(const std::string &model_dir,
-                          lite_api::LiteModelType model_type =
-                              lite_api::LiteModelType::kProtobuf) override;
-
- private:
-  Predictor raw_predictor_;
-};
-
-CxxPaddleApiImpl::CxxPaddleApiImpl() {}
-
-void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
-  auto places = config.valid_places();
-  places.emplace_back(TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny));
-  raw_predictor_.Build(config, places);
-}
-
-std::unique_ptr<lite_api::Tensor> CxxPaddleApiImpl::GetInput(int i) {
-  auto *x = raw_predictor_.GetInput(i);
-  return std::unique_ptr<lite_api::Tensor>(new lite_api::Tensor(x));
-}
-
-std::unique_ptr<const lite_api::Tensor> CxxPaddleApiImpl::GetOutput(
-    int i) const {
-  const auto *x = raw_predictor_.GetOutput(i);
-  return std::unique_ptr<const lite_api::Tensor>(new lite_api::Tensor(x));
-}
-
-void CxxPaddleApiImpl::Run() { raw_predictor_.Run(); }
-
-std::unique_ptr<const lite_api::Tensor> CxxPaddleApiImpl::GetTensor(
-    const std::string &name) const {
-  auto *x = raw_predictor_.GetTensor(name);
-  return std::unique_ptr<const lite_api::Tensor>(new lite_api::Tensor(x));
-}
-
-void CxxPaddleApiImpl::SaveOptimizedModel(const std::string &model_dir,
-                                          lite_api::LiteModelType model_type) {
-  raw_predictor_.SaveModel(model_dir, model_type);
-}
-
-}  // namespace lite
-
-namespace lite_api {
-
-template <>
-std::shared_ptr<PaddlePredictor> CreatePaddlePredictor(
-    const CxxConfig &config) {
-  auto x = std::make_shared<lite::CxxPaddleApiImpl>();
-  x->Init(config);
-  return x;
-}
-
-}  // namespace lite_api
-}  // namespace paddle
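The template specialization above is the entry point behind the public C++ API. A hedged sketch of the caller side, with MakePredictor, the directory, and the place list as placeholders:

    #include <memory>
    #include <string>
    #include "lite/api/paddle_api.h"

    std::shared_ptr<paddle::lite_api::PaddlePredictor> MakePredictor(
        const std::string& model_dir) {
      using namespace paddle::lite_api;  // NOLINT
      CxxConfig config;
      config.set_model_dir(model_dir);
      config.set_preferred_place(Place{TARGET(kARM), PRECISION(kFloat)});
      config.set_valid_places({Place{TARGET(kARM), PRECISION(kFloat)}});
      // Dispatches to CxxPaddleApiImpl::Init above, which appends a kHost
      // fallback place before building the raw Predictor.
      return CreatePaddlePredictor<CxxConfig>(config);
    }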
diff --git a/lite/api/cxx_api_test.cc b/lite/api/cxx_api_test.cc
deleted file mode 100644
index c562b9f080..0000000000
--- a/lite/api/cxx_api_test.cc
+++ /dev/null
@@ -1,157 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/api/cxx_api.h" -#include -#include -#include -#include "lite/api/lite_api_test_helper.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/core/op_registry.h" -#include "lite/core/tensor.h" - -// For training. -DEFINE_string(startup_program_path, "", ""); -DEFINE_string(main_program_path, "", ""); - -namespace paddle { -namespace lite { - -#ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK -TEST(CXXApi, test) { - const lite::Tensor* out = RunHvyModel(); - LOG(INFO) << out << " memory size " << out->data_size(); - for (int i = 0; i < 10; i++) { - LOG(INFO) << "out " << out->data()[i]; - } - LOG(INFO) << "dims " << out->dims(); - // LOG(INFO) << "out " << *out; -} - -TEST(CXXApi, save_model) { - lite::Predictor predictor; - std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kX86), PRECISION(kFloat)}}); - predictor.Build(FLAGS_model_dir, - "", - "", - Place{TARGET(kCUDA), PRECISION(kFloat)}, - valid_places); - - LOG(INFO) << "Save optimized model to " << FLAGS_optimized_model; - predictor.SaveModel(FLAGS_optimized_model, - lite_api::LiteModelType::kProtobuf); - predictor.SaveModel(FLAGS_optimized_model + ".naive", - lite_api::LiteModelType::kNaiveBuffer); -} - -/*TEST(CXXTrainer, train) { - Place prefer_place({TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)}); - std::vector valid_places({prefer_place}); - auto scope = std::make_shared(); - - CXXTrainer trainer(scope, prefer_place, valid_places); - - std::string main_program_pb, startup_program_pb; - ReadBinaryFile(FLAGS_main_program_path, &main_program_pb); - ReadBinaryFile(FLAGS_startup_program_path, &startup_program_pb); - framework::proto::ProgramDesc main_program_desc, startup_program_desc; - main_program_desc.ParseFromString(main_program_pb); - startup_program_desc.ParseFromString(startup_program_pb); - - // LOG(INFO) << main_program_desc.DebugString(); - - for (const auto& op : main_program_desc.blocks(0).ops()) { - LOG(INFO) << "get op " << op.type(); - } - - return; - - trainer.RunStartupProgram(startup_program_desc); - auto& exe = trainer.BuildMainProgramExecutor(main_program_desc); - auto* tensor0 = exe.GetInput(0); - tensor0->Resize(std::vector({100, 100})); - auto* data0 = tensor0->mutable_data(); - data0[0] = 0; - - exe.Run(); -}*/ -#endif // LITE_WITH_LIGHT_WEIGHT_FRAMEWORK - -#ifdef LITE_WITH_ARM -TEST(CXXApi, save_model) { - lite::Predictor predictor; - std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}}); - predictor.Build(FLAGS_model_dir, - "", - "", - Place{TARGET(kARM), PRECISION(kFloat)}, - valid_places); - - LOG(INFO) << "Save optimized model to " << FLAGS_optimized_model; - predictor.SaveModel(FLAGS_optimized_model); - predictor.SaveModel(FLAGS_optimized_model + ".naive", - lite_api::LiteModelType::kNaiveBuffer); -} - -TEST(CXXApi, load_model_naive) { - lite::Predictor predictor; - std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}}); - 
predictor.Build(FLAGS_optimized_model + ".naive", - "", - "", - Place{TARGET(kARM), PRECISION(kFloat)}, - valid_places, - {}, - lite_api::LiteModelType::kNaiveBuffer); - - auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(std::vector({1, 100})); - auto* data = input_tensor->mutable_data(); - for (int i = 0; i < 100; i++) { - data[i] = 1; - } - - predictor.Run(); - - std::vector result({0.4350058, - -0.6048313, - -0.29346266, - 0.40377066, - -0.13400325, - 0.37114543, - -0.3407839, - 0.14574292, - 0.4104212, - 0.8938774}); - - auto* output_tensor = predictor.GetOutput(0); - auto output_shape = output_tensor->dims().Vectorize(); - ASSERT_EQ(output_shape.size(), 2); - ASSERT_EQ(output_shape[0], 1); - ASSERT_EQ(output_shape[1], 500); - - int step = 50; - for (int i = 0; i < result.size(); i += step) { - EXPECT_NEAR(output_tensor->data()[i], result[i], 1e-6); - } -} -#endif - -} // namespace lite -} // namespace paddle diff --git a/lite/api/detection_model_test.cc b/lite/api/detection_model_test.cc deleted file mode 100644 index 2d79653baa..0000000000 --- a/lite/api/detection_model_test.cc +++ /dev/null @@ -1,137 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include -#include -#include -#include "lite/api/paddle_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/api/test_helper.h" -#include "lite/core/op_registry.h" - -DEFINE_bool(is_run_model_optimize, - false, - "apply model_optimize_tool to model, use optimized model to test"); - -namespace paddle { -namespace lite_api { - -void OutputOptModel(const std::string& load_model_dir, - const std::string& save_optimized_model_dir) { - lite_api::CxxConfig config; - config.set_model_dir(load_model_dir); - config.set_preferred_place(Place{TARGET(kX86), PRECISION(kFloat)}); - config.set_valid_places({ - Place{TARGET(kX86), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}, - }); - auto predictor = lite_api::CreatePaddlePredictor(config); - - int ret = system( - paddle::lite::string_format("rm -rf %s", save_optimized_model_dir.c_str()) - .c_str()); - if (ret == 0) { - LOG(INFO) << "delete old optimized model " << save_optimized_model_dir; - } - predictor->SaveOptimizedModel(save_optimized_model_dir, - LiteModelType::kNaiveBuffer); - LOG(INFO) << "Load model from " << load_model_dir; - LOG(INFO) << "Save optimized model to " << save_optimized_model_dir; -} - -#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK -void Run(const std::string& model_dir, - const int repeat, - const int warmup_times, - const int thread_num) { - // set config and create predictor - lite_api::MobileConfig config; - config.set_model_dir(model_dir); - config.set_threads(thread_num); - if (thread_num == 1) { - config.set_power_mode(LITE_POWER_HIGH); - } else { - config.set_power_mode(LITE_POWER_NO_BIND); - } - - auto predictor = lite_api::CreatePaddlePredictor(config); - - // set input - auto input_image = predictor->GetInput(0); - input_image->Resize({1, 3, 300, 300}); - auto input_image_data = input_image->mutable_data(); - std::ifstream read_file("/data/local/tmp/pjc/ssd_img.txt"); - if (!read_file.is_open()) { - LOG(INFO) << "read image file fail"; - return; - } - auto input_shape = input_image->shape(); - int64_t input_image_size = 1; - for (auto t : input_shape) { - input_image_size *= t; - } - for (int i = 0; i < input_image_size; i++) { - read_file >> input_image_data[i]; - } - - // warmup and run - for (int i = 0; i < warmup_times; ++i) { - predictor->Run(); - } - - auto start = lite::GetCurrentUS(); - for (int i = 0; i < repeat; ++i) { - predictor->Run(); - } - - // show result - auto end = lite::GetCurrentUS(); - LOG(INFO) << "================== Speed Report ==================="; - LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads - << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats - << ", spend " << (end - start) / FLAGS_repeats / 1000.0 - << " ms in average."; - - auto out = predictor->GetOutput(0); - auto out_data = out->data(); - LOG(INFO) << "output shape:"; - auto out_shape = out->shape(); - for (auto t : out_shape) { - LOG(INFO) << t; - } - LOG(INFO) << "output data:"; - int output_len = 20; - for (int i = 0; i < output_len; i++) { - LOG(INFO) << out_data[i]; - } -} -#endif - -} // namespace lite_api -} // namespace paddle - -TEST(Faster_RCNN, test_arm) { - std::string save_optimized_model_dir; - if (FLAGS_is_run_model_optimize) { - save_optimized_model_dir = FLAGS_model_dir + "opt"; - paddle::lite_api::OutputOptModel(FLAGS_model_dir, save_optimized_model_dir); - } - std::string run_model_dir = - FLAGS_is_run_model_optimize ? 
save_optimized_model_dir : FLAGS_model_dir; - paddle::lite_api::Run( - run_model_dir, FLAGS_repeats, FLAGS_threads, FLAGS_warmup); -} diff --git a/lite/api/efficientnet_b0_test.cc b/lite/api/efficientnet_b0_test.cc deleted file mode 100644 index fa16a6be81..0000000000 --- a/lite/api/efficientnet_b0_test.cc +++ /dev/null @@ -1,102 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include "lite/api/cxx_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/api/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { - -void TestModel(const std::vector &valid_places, - const Place &preferred_place) { - DeviceInfo::Init(); - DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads); - lite::Predictor predictor; - - predictor.Build(FLAGS_model_dir, "", "", preferred_place, valid_places); - - auto *input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim(std::vector({1, 3, 224, 224}))); - auto *data = input_tensor->mutable_data(); - auto item_size = input_tensor->dims().production(); - for (int i = 0; i < item_size; i++) { - data[i] = 1; - } - - for (int i = 0; i < FLAGS_warmup; ++i) { - predictor.Run(); - } - - auto start = GetCurrentUS(); - for (int i = 0; i < FLAGS_repeats; ++i) { - predictor.Run(); - } - - LOG(INFO) << "================== Speed Report ==================="; - LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads - << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats - << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 - << " ms in average."; - - std::vector> results; - // i = 1 - results.emplace_back(std::vector( - {-0.6746618, -0.7119305, -0.053502668, -0.6767762, -0.07488631, - -1.1109267, 0.63711894, 0.5979086, -0.20651843, -0.49293622, - -0.7404337, -0.25586239, 2.244521, 0.8738271, 0.7193805, - -0.21894705, -0.90460795, 0.07160086, 0.54588217, 0.020132724})); - auto *out = predictor.GetOutput(0); - ASSERT_EQ(out->dims().size(), 2); - ASSERT_EQ(out->dims()[0], 1); - ASSERT_EQ(out->dims()[1], 1000); - - int step = 50; - for (int i = 0; i < results.size(); ++i) { - for (int j = 0; j < results[i].size(); ++j) { - EXPECT_NEAR(out->data()[j * step + (out->dims()[1] * i)], - results[i][j], - 2e-4); - } - } -} - -TEST(EfficientNetB0, test_arm) { - std::vector valid_places({ - Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}, - // Place{TARGET(kOpenCL), PRECISION(kFloat)}, - }); - - TestModel(valid_places, Place({TARGET(kARM), PRECISION(kFloat)})); -} - -TEST(EfficientNetB0, test_opencl) { - std::vector valid_places({ - Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}, - Place{TARGET(kOpenCL), PRECISION(kFloat)}, - }); - - TestModel(valid_places, Place({TARGET(kOpenCL), PRECISION(kFloat)})); -} - -} // 
namespace lite -} // namespace paddle diff --git a/lite/api/inceptionv4_test.cc b/lite/api/inceptionv4_test.cc deleted file mode 100644 index ae772dbba5..0000000000 --- a/lite/api/inceptionv4_test.cc +++ /dev/null @@ -1,94 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include "lite/api/cxx_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/api/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { - -#ifdef LITE_WITH_ARM -TEST(InceptionV4, test) { - DeviceInfo::Init(); - DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads); - lite::Predictor predictor; - std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}}); - - predictor.Build(FLAGS_model_dir, - "", - "", - Place{TARGET(kARM), PRECISION(kFloat)}, - valid_places); - - auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim(std::vector({1, 3, 224, 224}))); - auto* data = input_tensor->mutable_data(); - auto item_size = input_tensor->dims().production(); - for (int i = 0; i < item_size; i++) { - data[i] = 1; - } - - for (int i = 0; i < FLAGS_warmup; ++i) { - predictor.Run(); - } - - auto start = GetCurrentUS(); - for (int i = 0; i < FLAGS_repeats; ++i) { - predictor.Run(); - } - - LOG(INFO) << "================== Speed Report ==================="; - LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads - << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats - << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 - << " ms in average."; - - // std::vector results({0.00078033, 0.00083865, 0.00060029, 0.00057083, - // 0.00070094, 0.00080584, 0.00044525, 0.00074907, - // 0.00059774, 0.00063654}); - // - std::vector> results; - // i = 1 - results.emplace_back(std::vector( - {0.0011684548, 0.0010390386, 0.0011301535, 0.0010133048, - 0.0010259597, 0.0010982729, 0.00093195855, 0.0009141837, - 0.00096620916, 0.00089982944, 0.0010064574, 0.0010474789, - 0.0009782845, 0.0009230255, 0.0010548076, 0.0010974824, - 0.0010612885, 0.00089107914, 0.0010112736, 0.00097655767})); - auto* out = predictor.GetOutput(0); - ASSERT_EQ(out->dims().size(), 2); - ASSERT_EQ(out->dims()[0], 1); - ASSERT_EQ(out->dims()[1], 1000); - - int step = 50; - for (int i = 0; i < results.size(); ++i) { - for (int j = 0; j < results[i].size(); ++j) { - EXPECT_NEAR(out->data()[j * step + (out->dims()[1] * i)], - results[i][j], - 1e-6); - } - } -} -#endif - -} // namespace lite -} // namespace paddle diff --git a/lite/api/light_api.cc b/lite/api/light_api.cc deleted file mode 100644 index 98b79e58aa..0000000000 --- a/lite/api/light_api.cc +++ /dev/null @@ -1,95 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/api/light_api.h"
-
-namespace paddle {
-namespace lite {
-
-void LightPredictor::Build(const std::string& model_dir,
-                           const std::string& model_buffer,
-                           const std::string& param_buffer,
-                           lite_api::LiteModelType model_type,
-                           bool model_from_memory) {
-  cpp::ProgramDesc desc;
-  switch (model_type) {
-#ifndef LITE_ON_TINY_PUBLISH
-    case lite_api::LiteModelType::kProtobuf:
-      LoadModelPb(model_dir, "", "", scope_.get(), &desc);
-      break;
-#endif
-    case lite_api::LiteModelType::kNaiveBuffer: {
-      if (model_from_memory) {
-        LoadModelNaiveFromMemory(
-            model_buffer, param_buffer, scope_.get(), &desc);
-      } else {
-        LoadModelNaive(model_dir, scope_.get(), &desc);
-      }
-      break;
-    }
-    default:
-      LOG(FATAL) << "Unknown model type";
-  }
-  BuildRuntimeProgram(desc);
-}
-
-Tensor* LightPredictor::GetInput(size_t offset) {
-  auto* _feed_list = program_->exec_scope()->FindVar("feed");
-  CHECK(_feed_list) << "no feed variable in exec_scope";
-  auto* feed_list = _feed_list->GetMutable<std::vector<Tensor>>();
-  if (offset >= feed_list->size()) {
-    feed_list->resize(offset + 1);
-  }
-  return &feed_list->at(offset);
-}
-
-const Tensor* LightPredictor::GetOutput(size_t offset) {
-  auto* _fetch_list = program_->exec_scope()->FindVar("fetch");
-  CHECK(_fetch_list) << "no fetch variable in exec_scope";
-  auto& fetch_list = *_fetch_list->GetMutable<std::vector<Tensor>>();
-  CHECK_LT(offset, fetch_list.size()) << "offset " << offset << " overflow";
-  return &fetch_list.at(offset);
-}
-
-void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) {
-  std::vector<Instruction> insts;
-  // 1. Create op first
-  Program program(prog, scope_, {});
-
-  // 2. Create Instructs
-
-  // Create the kernels of the target places, and filter out the specific
-  // kernel with the target alias.
-  for (auto& op : program.ops()) {
-    auto kernel_type = op->op_info()->GetAttr<std::string>(kKernelTypeAttr);
-    std::string op_type, alias;
-    Place place;
-    KernelBase::ParseKernelType(kernel_type, &op_type, &alias, &place);
-    auto kernels = op->CreateKernels({place});
-    // filter out a kernel
-    auto it = std::find_if(
-        kernels.begin(), kernels.end(), [&](std::unique_ptr<KernelBase>& it) {
-          return it->alias() == alias;
-        });
-    CHECK(it != kernels.end());
-    (*it)->SetContext(ContextScheduler::Global().NewContext((*it)->target()));
-    insts.emplace_back(op, std::move(*it));
-  }
-  program_.reset(new RuntimeProgram(std::move(insts)));
-  CHECK(program.exec_scope());
-  program_->set_exec_scope(program.exec_scope());
-}
-
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/api/light_api.h b/lite/api/light_api.h
deleted file mode 100644
index 2415401744..0000000000
--- a/lite/api/light_api.h
+++ /dev/null
@@ -1,80 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-/*
- * This file implements a light-weight API which can run on mobile. We limit the
- * dependencies and the runtime computation complexity.
- */
-#pragma once
-
-#include <map>
-#include <memory>
-#include <string>
-#include <vector>
-#include "lite/api/paddle_api.h"
-#include "lite/core/context.h"
-#include "lite/core/program.h"
-#include "lite/core/tensor.h"
-#include "lite/core/types.h"
-#include "lite/model_parser/model_parser.h"
-
-namespace paddle {
-namespace lite {
-
-/*
- * The light weight predictor, mainly for mobile. It loads an optimized model,
- * and will not depend on the MIR or perform latter optimization.
- */
-class LITE_API LightPredictor {
- public:
-  LightPredictor(
-      const std::string& model_dir,
-      const std::string& model_buffer = "",
-      const std::string& param_buffer = "",
-      bool model_from_memory = false,
-      lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf) {
-    scope_ = std::make_shared<Scope>();
-    Build(model_dir, model_buffer, param_buffer, model_type, model_from_memory);
-  }
-
-  void Run() { program_->Run(); }
-
-  // Get offset-th col of feed inputs.
-  Tensor* GetInput(size_t offset);
-
-  // Get offset-th col of fetch outputs.
-  const Tensor* GetOutput(size_t offset);
-
-  const lite::Tensor* GetTensor(const std::string& name) const {
-    auto* var = program_->exec_scope()->FindVar(name);
-    return &var->Get<lite::Tensor>();
-  }
-
- private:
-  void Build(
-      const std::string& model_dir,
-      const std::string& model_buffer,
-      const std::string& param_buffer,
-      lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf,
-      bool model_from_memory = false);
-
-  void BuildRuntimeProgram(const cpp::ProgramDesc& prog);
-
- private:
-  std::shared_ptr<Scope> scope_;
-  std::unique_ptr<RuntimeProgram> program_;
-};
-
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/api/light_api_impl.cc b/lite/api/light_api_impl.cc
deleted file mode 100644
index 6075f1a36f..0000000000
--- a/lite/api/light_api_impl.cc
+++ /dev/null
@@ -1,79 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
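Since LightPredictor skips the optimizer entirely, it only makes sense on a model that was already optimized offline. A minimal sketch of driving it directly, assuming a naive-buffer model directory; RunLight and the shapes are placeholders:

    #include <algorithm>
    #include <string>
    #include <vector>
    #include "lite/api/light_api.h"

    void RunLight(const std::string& optimized_model_dir) {
      // No MIR passes run here; the program is executed as stored.
      paddle::lite::LightPredictor predictor(
          optimized_model_dir, "", "", false,
          paddle::lite_api::LiteModelType::kNaiveBuffer);
      auto* input = predictor.GetInput(0);
      input->Resize(paddle::lite::DDim(std::vector<int64_t>({1, 100})));
      float* data = input->mutable_data<float>();
      std::fill(data, data + 100, 1.f);
      predictor.Run();
      const auto* out = predictor.GetOutput(0);
      (void)out;
    }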
- -#include "lite/api/light_api.h" -#include "lite/api/paddle_api.h" -#include "lite/model_parser/model_parser.h" - -namespace paddle { -namespace lite_api { - -class LightPredictorImpl : public PaddlePredictor { - public: - LightPredictorImpl() = default; - - std::unique_ptr GetInput(int i) override; - - std::unique_ptr GetOutput(int i) const override; - - void Run() override; - - std::unique_ptr GetTensor( - const std::string& name) const override; - - void Init(const MobileConfig& config); - - private: - std::unique_ptr raw_predictor_; -}; - -void LightPredictorImpl::Init(const MobileConfig& config) { -// LightPredictor Only support NaiveBuffer backend in publish lib -#ifdef LITE_WITH_ARM - lite::DeviceInfo::Init(); - lite::DeviceInfo::Global().SetRunMode(config.power_mode(), config.threads()); -#endif - raw_predictor_.reset(new lite::LightPredictor(config.model_dir(), - config.model_buffer(), - config.param_buffer(), - config.model_from_memory(), - LiteModelType::kNaiveBuffer)); -} - -std::unique_ptr LightPredictorImpl::GetInput(int i) { - return std::unique_ptr(new Tensor(raw_predictor_->GetInput(i))); -} - -std::unique_ptr LightPredictorImpl::GetOutput(int i) const { - return std::unique_ptr(new Tensor(raw_predictor_->GetOutput(i))); -} - -void LightPredictorImpl::Run() { raw_predictor_->Run(); } - -std::unique_ptr LightPredictorImpl::GetTensor( - const std::string& name) const { - return std::unique_ptr( - new Tensor(raw_predictor_->GetTensor(name))); -} - -template <> -std::shared_ptr CreatePaddlePredictor( - const MobileConfig& config) { - auto x = std::make_shared(); - x->Init(config); - return x; -} - -} // namespace lite_api -} // namespace paddle diff --git a/lite/api/light_api_test.cc b/lite/api/light_api_test.cc deleted file mode 100644 index 8e2fc420bc..0000000000 --- a/lite/api/light_api_test.cc +++ /dev/null @@ -1,90 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/api/light_api.h" -#include -#include -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" - -DEFINE_string(optimized_model, "", ""); - -namespace paddle { -namespace lite { - -TEST(LightAPI, load) { - if (FLAGS_optimized_model.empty()) { - FLAGS_optimized_model = "lite_naive_model"; - } - LightPredictor predictor(FLAGS_optimized_model, "", ""); - auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim(std::vector({100, 100}))); - auto* data = input_tensor->mutable_data(); - for (int i = 0; i < 100 * 100; i++) { - data[i] = i; - } - - predictor.Run(); - - const auto* output = predictor.GetOutput(0); - const float* raw_output = output->data(); - - for (int i = 0; i < 10; i++) { - LOG(INFO) << "out " << raw_output[i]; - } -} - -TEST(LightAPI, loadNaiveBuffer) { - if (FLAGS_optimized_model.empty()) { - FLAGS_optimized_model = "lite_naive_model"; - } - - auto model_path = std::string(FLAGS_optimized_model) + "/__model__.nb"; - auto params_path = std::string(FLAGS_optimized_model) + "/param.nb"; - std::string model_buffer = lite::ReadFile(model_path); - size_t size_model = model_buffer.length(); - std::string params_buffer = lite::ReadFile(params_path); - size_t size_params = params_buffer.length(); - LOG(INFO) << "sizeModel: " << size_model; - LOG(INFO) << "sizeParams: " << size_params; - - lite_api::MobileConfig config; - config.set_model_buffer( - model_buffer.c_str(), size_model, params_buffer.c_str(), size_params); - LightPredictor predictor(config.model_dir(), - config.model_buffer(), - config.param_buffer(), - config.model_from_memory(), - lite_api::LiteModelType::kNaiveBuffer); - - auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim(std::vector({100, 100}))); - auto* data = input_tensor->mutable_data(); - for (int i = 0; i < 100 * 100; i++) { - data[i] = i; - } - - predictor.Run(); - - const auto* output = predictor.GetOutput(0); - const float* raw_output = output->data(); - - for (int i = 0; i < 10; i++) { - LOG(INFO) << "out " << raw_output[i]; - } -} - -} // namespace lite -} // namespace paddle diff --git a/lite/api/lite_api_test_helper.cc b/lite/api/lite_api_test_helper.cc deleted file mode 100644 index cd576998d3..0000000000 --- a/lite/api/lite_api_test_helper.cc +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/api/lite_api_test_helper.h" -#include - -DEFINE_string(model_dir, "", ""); -DEFINE_string(optimized_model, "", ""); - -namespace paddle { -namespace lite { - -const lite::Tensor* RunHvyModel() { - lite::Predictor predictor; -#ifndef LITE_WITH_CUDA - std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kX86), PRECISION(kFloat)}}); -#else - std::vector valid_places({ - Place{TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)}, - Place{TARGET(kCUDA), PRECISION(kFloat), DATALAYOUT(kNCHW)}, - Place{TARGET(kCUDA), PRECISION(kAny), DATALAYOUT(kNCHW)}, - Place{TARGET(kHost), PRECISION(kAny), DATALAYOUT(kNCHW)}, - Place{TARGET(kCUDA), PRECISION(kAny), DATALAYOUT(kAny)}, - Place{TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny)}, - }); -#endif - - predictor.Build(FLAGS_model_dir, - "", - "", - Place{TARGET(kX86), PRECISION(kFloat)}, // origin cuda - valid_places); - - auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim(std::vector({100, 100}))); - auto* data = input_tensor->mutable_data(); - for (int i = 0; i < 100 * 100; i++) { - data[i] = i; - } - - // LOG(INFO) << "input " << *input_tensor; - - predictor.Run(); - - const auto* out = predictor.GetOutput(0); - return out; -} - -} // namespace lite -} // namespace paddle diff --git a/lite/api/lite_api_test_helper.h b/lite/api/lite_api_test_helper.h deleted file mode 100644 index ac3be77b10..0000000000 --- a/lite/api/lite_api_test_helper.h +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include "lite/api/cxx_api.h" -#include "lite/core/op_registry.h" -#include "lite/core/tensor.h" - -DECLARE_string(model_dir); -DECLARE_string(optimized_model); - -namespace paddle { -namespace lite { - -const lite::Tensor* RunHvyModel(); - -} // namespace lite -} // namespace paddle diff --git a/lite/api/mobilenetv1_int8_test.cc b/lite/api/mobilenetv1_int8_test.cc deleted file mode 100644 index 769f195d19..0000000000 --- a/lite/api/mobilenetv1_int8_test.cc +++ /dev/null @@ -1,89 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include -#include -#include "lite/api/cxx_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/api/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { - -void TestModel(const std::vector& valid_places, - const Place& preferred_place) { - DeviceInfo::Init(); - DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads); - lite::Predictor predictor; - - predictor.Build(FLAGS_model_dir, "", "", preferred_place, valid_places); - - auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim(std::vector({1, 3, 224, 224}))); - auto* data = input_tensor->mutable_data(); - auto item_size = input_tensor->dims().production(); - for (int i = 0; i < item_size; i++) { - data[i] = 1; - } - - for (int i = 0; i < FLAGS_warmup; ++i) { - predictor.Run(); - } - - auto start = GetCurrentUS(); - for (int i = 0; i < FLAGS_repeats; ++i) { - predictor.Run(); - } - - LOG(INFO) << "================== Speed Report ==================="; - LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads - << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats - << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 - << " ms in average."; - - std::vector> results; - // i = 1 - results.emplace_back(std::vector( - {0.000227548, 0.000262385, 0.000260347, 0.000293865, 0.00025008})); - auto* out = predictor.GetOutput(0); - ASSERT_EQ(out->dims().size(), 2); - ASSERT_EQ(out->dims()[0], 1); - ASSERT_EQ(out->dims()[1], 1000); - - int step = 50; - for (int i = 0; i < results.size(); ++i) { - for (int j = 0; j < results[i].size(); ++j) { - EXPECT_NEAR(out->data()[j * step + (out->dims()[1] * i)], - results[i][j], - 1e-6); - } - } -} - -TEST(MobileNetV1, test_arm) { - std::vector valid_places({ - Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kInt8)}, - }); - - TestModel(valid_places, Place({TARGET(kARM), PRECISION(kInt8)})); -} - -} // namespace lite -} // namespace paddle diff --git a/lite/api/mobilenetv1_ssd_test.cc b/lite/api/mobilenetv1_ssd_test.cc deleted file mode 100644 index e37e180f9b..0000000000 --- a/lite/api/mobilenetv1_ssd_test.cc +++ /dev/null @@ -1,112 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
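All of these model tests compare only a strided sample of the output against golden values (every 50th element of each row), which keeps the expected vectors short while still covering the whole tensor. A hedged helper capturing the pattern; CheckSampledRow is an assumed name, not a helper from the tree:

    #include <gtest/gtest.h>
    #include <vector>

    // Compare every `step`-th element of output row `row` against `ref`,
    // mirroring the sampling loops used by the tests in this patch.
    void CheckSampledRow(const float* out_data, int64_t row_width, int row,
                         const std::vector<float>& ref, int step, float eps) {
      for (size_t j = 0; j < ref.size(); ++j) {
        EXPECT_NEAR(out_data[j * step + row_width * row], ref[j], eps);
      }
    }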
- -#include -#include -#include -#include "lite/api/cxx_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/api/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { - -#ifdef LITE_WITH_ARM -void TestModel(const std::vector& valid_places, - const Place& preferred_place) { - DeviceInfo::Init(); - DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads); - lite::Predictor predictor; - - predictor.Build(FLAGS_model_dir, "", "", preferred_place, valid_places); - - auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim(std::vector({1, 3, 300, 300}))); - auto* data = input_tensor->mutable_data(); - auto item_size = input_tensor->dims().production(); - for (int i = 0; i < item_size; i++) { - data[i] = 1; - } - - for (int i = 0; i < FLAGS_warmup; ++i) { - predictor.Run(); - } - - auto start = GetCurrentUS(); - for (int i = 0; i < FLAGS_repeats; ++i) { - predictor.Run(); - } - - LOG(INFO) << "================== Speed Report ==================="; - LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads - << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats - << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 - << " ms in average."; - - std::vector> results; - // i = 1 - results.emplace_back(std::vector( - {3, 0.042103, 0.00439525, 0.0234783, 1.01127, 0.990756})); - results.emplace_back(std::vector( - {5, 0.0145793, 0.00860882, 0.0344975, 1.01375, 1.00129})); - results.emplace_back(std::vector( - {8, 0.560059, 0.00439525, 0.0234783, 1.01127, 0.990756})); - results.emplace_back(std::vector( - {9, 0.0165109, -0.0020006, 0.0013622, 0.999179, 0.991846})); - results.emplace_back(std::vector( - {12, 0.0263337, -0.0020006, 0.0013622, 0.999179, 0.991846})); - results.emplace_back(std::vector( - {15, 0.0116742, 0.00580454, 0.0321349, 1.00545, 0.98476})); - results.emplace_back(std::vector( - {17, 0.0405541, 0.00860882, 0.0344975, 1.01375, 1.00129})); - results.emplace_back(std::vector( - {18, 0.0231487, -0.00245976, 0.00771075, 1.01654, 1.00395})); - results.emplace_back(std::vector( - {19, 0.0133921, 0.00860882, 0.0344975, 1.01375, 1.00129})); - results.emplace_back(std::vector( - {20, 0.039664, 0.00860882, 0.0344975, 1.01375, 1.00129})); - - auto* out = predictor.GetOutput(0); - ASSERT_EQ(out->dims().size(), 2); - ASSERT_EQ(out->dims()[0], 10); - ASSERT_EQ(out->dims()[1], 6); - ASSERT_EQ(out->lod().size(), 1); - ASSERT_EQ(out->lod()[0].size(), 2); - ASSERT_EQ(out->lod()[0][0], 0); - ASSERT_EQ(out->lod()[0][1], 10); - - for (int i = 0; i < results.size(); ++i) { - for (int j = 0; j < results[i].size(); ++j) { - EXPECT_NEAR( - out->data()[j + (out->dims()[1] * i)], results[i][j], 5e-6); - } - } -} - -TEST(MobileNetV1_SSD, test_arm) { - std::vector valid_places({ - Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}, - }); - - TestModel(valid_places, Place({TARGET(kARM), PRECISION(kFloat)})); -} - -#endif // LITE_WITH_ARM - -} // namespace lite -} // namespace paddle diff --git a/lite/api/mobilenetv1_test.cc b/lite/api/mobilenetv1_test.cc deleted file mode 100644 index 91d1828a94..0000000000 --- a/lite/api/mobilenetv1_test.cc +++ /dev/null @@ -1,145 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include "lite/api/cxx_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/api/test_helper.h" -#include "lite/core/op_registry.h" - -DEFINE_string(optimized_model, "", "optimized_model"); - -namespace paddle { -namespace lite { - -void TestModel(const std::vector& valid_places, - const Place& preferred_place, - const std::string& model_dir = FLAGS_model_dir, - bool save_model = false) { - DeviceInfo::Init(); - DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads); - lite::Predictor predictor; - - predictor.Build(model_dir, "", "", preferred_place, valid_places); - - auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim(std::vector({1, 3, 224, 224}))); - auto* data = input_tensor->mutable_data(); - auto item_size = input_tensor->dims().production(); - for (int i = 0; i < item_size; i++) { - data[i] = 1; - } - - for (int i = 0; i < FLAGS_warmup; ++i) { - predictor.Run(); - } - - auto start = GetCurrentUS(); - for (int i = 0; i < FLAGS_repeats; ++i) { - predictor.Run(); - } - - if (save_model) { - LOG(INFO) << "Save optimized model to " << FLAGS_optimized_model; - predictor.SaveModel(FLAGS_optimized_model); - } - - LOG(INFO) << "================== Speed Report ==================="; - LOG(INFO) << "Model: " << model_dir << ", threads num " << FLAGS_threads - << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats - << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 - << " ms in average."; - - std::vector> ref; - ref.emplace_back(std::vector( - {0.00019130898, 9.467885e-05, 0.00015971427, 0.0003650665, - 0.00026431272, 0.00060884043, 0.0002107942, 0.0015819625, - 0.0010323516, 0.00010079765, 0.00011006987, 0.0017364529, - 0.0048292773, 0.0013995157, 0.0018453331, 0.0002428986, - 0.00020211363, 0.00013668182, 0.0005855956, 0.00025901722})); - auto* out = predictor.GetOutput(0); - const auto* pdata = out->data(); - int step = 50; -#ifdef LITE_WITH_NPU - ASSERT_EQ(out->dims().production(), 1000); - double eps = 0.1; - for (int i = 0; i < ref.size(); ++i) { - for (int j = 0; j < ref[i].size(); ++j) { - auto result = pdata[j * step + (out->dims()[1] * i)]; - auto diff = std::fabs((result - ref[i][j]) / ref[i][j]); - VLOG(3) << diff; - EXPECT_LT(diff, eps); - } - } -#else - ASSERT_EQ(out->dims().size(), 2); - ASSERT_EQ(out->dims()[0], 1); - ASSERT_EQ(out->dims()[1], 1000); - double eps = 1e-6; - for (int i = 0; i < ref.size(); ++i) { - for (int j = 0; j < ref[i].size(); ++j) { - auto result = pdata[j * step + (out->dims()[1] * i)]; - EXPECT_NEAR(result, ref[i][j], eps); - } - } -#endif -} - -#ifdef LITE_WITH_NPU -TEST(MobileNetV1, test_npu) { - std::vector valid_places({ - Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}, - Place{TARGET(kNPU), PRECISION(kFloat)}, - }); - - TestModel(valid_places, - Place({TARGET(kARM), PRECISION(kFloat)}), - FLAGS_model_dir, - true /* save_model*/); - - TestModel(valid_places, - Place({TARGET(kARM), PRECISION(kFloat)}), - 
FLAGS_optimized_model, - false /* save model */); -} -#endif // LITE_WITH_NPU - -TEST(MobileNetV1, test_arm) { - std::vector valid_places({ - Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}, - }); - - TestModel(valid_places, Place({TARGET(kARM), PRECISION(kFloat)})); -} - -#ifdef LITE_WITH_OPENCL -TEST(MobileNetV1, test_opencl) { - std::vector valid_places({ - Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}, - Place{TARGET(kOpenCL), PRECISION(kFloat)}, - }); - - TestModel(valid_places, Place({TARGET(kOpenCL), PRECISION(kFloat)})); -} -#endif // LITE_WITH_OPENCL - -} // namespace lite -} // namespace paddle diff --git a/lite/api/mobilenetv1_yolov3_test.cc b/lite/api/mobilenetv1_yolov3_test.cc deleted file mode 100644 index 3a12203b71..0000000000 --- a/lite/api/mobilenetv1_yolov3_test.cc +++ /dev/null @@ -1,119 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include "lite/api/cxx_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/api/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { - -#ifdef LITE_WITH_ARM -void TestModel(const std::vector& valid_places, - const Place& preferred_place) { - DeviceInfo::Init(); - DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads); - lite::Predictor predictor; - - predictor.Build(FLAGS_model_dir, "", "", preferred_place, valid_places); - - auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim(std::vector({1, 3, 608, 608}))); - auto* data = input_tensor->mutable_data(); - auto item_size = input_tensor->dims().production(); - for (int i = 0; i < item_size; i++) { - data[i] = 50; - } - - auto* img_size = predictor.GetInput(1); - img_size->Resize(DDim(std::vector({1, 2}))); - auto* size_data = img_size->mutable_data(); - size_data[0] = 608; - size_data[1] = 608; - - for (int i = 0; i < FLAGS_warmup; ++i) { - predictor.Run(); - } - - auto start = GetCurrentUS(); - for (int i = 0; i < FLAGS_repeats; ++i) { - predictor.Run(); - } - - LOG(INFO) << "================== Speed Report ==================="; - LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads - << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats - << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 - << " ms in average."; - - std::vector> results; - // i = 1 - results.emplace_back(std::vector( - {0., 0.7803235, 577.7447, 592.5643, 582.15314, 597.3399})); - results.emplace_back(std::vector( - {0., 0.7643098, 473.50653, 592.58966, 478.26117, 597.2353})); - results.emplace_back(std::vector( - {0., 0.7614112, 593.06946, 591.99646, 598.64087, 597.553})); - results.emplace_back(std::vector( - {0., 0.7579255, 161.40321, 592.61694, 166.33885, 597.28406})); - results.emplace_back(std::vector( - {0., 
0.7569634, 193.39563, 592.62164, 198.35269, 597.2968})); - results.emplace_back(std::vector( - {0., 0.7568337, 297.3981, 592.62024, 302.35202, 597.2969})); - results.emplace_back(std::vector( - {0., 0.7568283, 265.39816, 592.6203, 270.35214, 597.29694})); - results.emplace_back(std::vector( - {0., 0.74383223, 33.430492, 592.7017, 38.453976, 597.4267})); - results.emplace_back(std::vector( - {0., 0.66492873, 9.396143, 576.7084, 15.35708, 581.8059})); - results.emplace_back(std::vector( - {0., 0.6568178, 9.970305, 145.12535, 15.043035, 149.76646})); - - auto* out = predictor.GetOutput(0); - ASSERT_EQ(out->dims().size(), 2); - ASSERT_EQ(out->dims()[0], 100); - ASSERT_EQ(out->dims()[1], 6); - ASSERT_EQ(out->lod().size(), 1); - ASSERT_EQ(out->lod()[0].size(), 2); - ASSERT_EQ(out->lod()[0][0], 0); - ASSERT_EQ(out->lod()[0][1], 100); - - int skip = 10; - for (int i = 0; i < results.size(); i += skip) { - for (int j = 0; j < results[i].size(); ++j) { - EXPECT_NEAR( - out->data()[j + (out->dims()[1] * i)], results[i][j], 3e-6); - } - } -} - -TEST(MobileNetV1_YoloV3, test_arm) { - std::vector valid_places({ - Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}, - }); - - TestModel(valid_places, Place({TARGET(kARM), PRECISION(kFloat)})); -} - -#endif // LITE_WITH_ARM - -} // namespace lite -} // namespace paddle diff --git a/lite/api/mobilenetv2_test.cc b/lite/api/mobilenetv2_test.cc deleted file mode 100644 index ca36943cb9..0000000000 --- a/lite/api/mobilenetv2_test.cc +++ /dev/null @@ -1,147 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
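The YOLOv3 test above is the one multi-input case in this set: detection models of this family take the image tensor plus a second int tensor holding the original image size. A hedged fragment of that feeding pattern (the int element type for im_size is an assumption consistent with the test above):

    #include <algorithm>
    #include <vector>
    #include "lite/api/cxx_api.h"

    void FeedYoloInputs(paddle::lite::Predictor& predictor) {
      auto* image = predictor.GetInput(0);
      image->Resize(paddle::lite::DDim(std::vector<int64_t>({1, 3, 608, 608})));
      float* pixels = image->mutable_data<float>();
      std::fill(pixels, pixels + image->dims().production(), 50.f);

      auto* im_size = predictor.GetInput(1);  // second feed: original H and W
      im_size->Resize(paddle::lite::DDim(std::vector<int64_t>({1, 2})));
      int* wh = im_size->mutable_data<int>();
      wh[0] = 608;
      wh[1] = 608;
    }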
- -#include -#include -#include -#include "lite/api/cxx_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/api/test_helper.h" -#include "lite/core/op_registry.h" - -DEFINE_string(optimized_model, "", "optimized_model"); - -namespace paddle { -namespace lite { - -#ifdef LITE_WITH_ARM -void TestModel(const std::vector& valid_places, - const Place& preferred_place, - const std::string& model_dir = FLAGS_model_dir, - bool save_model = false) { - DeviceInfo::Init(); - DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads); - lite::Predictor predictor; - - predictor.Build(model_dir, "", "", preferred_place, valid_places); - - auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim(std::vector({1, 3, 224, 224}))); - auto* data = input_tensor->mutable_data(); - auto item_size = input_tensor->dims().production(); - for (int i = 0; i < item_size; i++) { - data[i] = 1; - } - - for (int i = 0; i < FLAGS_warmup; ++i) { - predictor.Run(); - } - - auto start = GetCurrentUS(); - for (int i = 0; i < FLAGS_repeats; ++i) { - predictor.Run(); - } - - if (save_model) { - LOG(INFO) << "Save optimized model to " << FLAGS_optimized_model; - predictor.SaveModel(FLAGS_optimized_model); - } - - LOG(INFO) << "================== Speed Report ==================="; - LOG(INFO) << "Model: " << model_dir << ", threads num " << FLAGS_threads - << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats - << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 - << " ms in average."; - - std::vector> ref; - // i = 1 - ref.emplace_back(std::vector( - {0.00017082224, 5.699624e-05, 0.000260885, 0.00016412718, - 0.00034818667, 0.00015230637, 0.00032959113, 0.0014772735, - 0.0009059976, 9.5378724e-05, 5.386537e-05, 0.0006427285, - 0.0070957416, 0.0016094646, 0.0018807327, 0.00010506048, - 6.823785e-05, 0.00012269315, 0.0007806194, 0.00022354358})); - auto* out = predictor.GetOutput(0); - const auto* pdata = out->data(); - int step = 50; -#ifdef LITE_WITH_NPU - ASSERT_EQ(out->dims().production(), 1000); - double eps = 0.1; - for (int i = 0; i < ref.size(); ++i) { - for (int j = 0; j < ref[i].size(); ++j) { - auto result = pdata[j * step + (out->dims()[1] * i)]; - auto diff = std::fabs((result - ref[i][j]) / ref[i][j]); - VLOG(3) << diff; - EXPECT_LT(diff, eps); - } - } -#else - ASSERT_EQ(out->dims().size(), 2); - ASSERT_EQ(out->dims()[0], 1); - ASSERT_EQ(out->dims()[1], 1000); - for (int i = 0; i < ref.size(); ++i) { - for (int j = 0; j < ref[i].size(); ++j) { - EXPECT_NEAR(pdata[j * step + (out->dims()[1] * i)], ref[i][j], 1e-6); - } - } -#endif -} - -#ifdef LITE_WITH_NPU -TEST(MobileNetV2, test_npu) { - std::vector valid_places({ - Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}, - Place{TARGET(kNPU), PRECISION(kFloat)}, - }); - - TestModel(valid_places, - Place({TARGET(kARM), PRECISION(kFloat)}), - FLAGS_model_dir, - true /* save_model*/); - - TestModel(valid_places, - Place({TARGET(kARM), PRECISION(kFloat)}), - FLAGS_optimized_model, - false /* save model */); -} -#endif // LITE_WITH_NPU - -TEST(MobileNetV2, test_arm) { - std::vector valid_places({ - Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}, - }); - - TestModel(valid_places, Place({TARGET(kARM), PRECISION(kFloat)})); -} - -#ifdef LITE_WITH_OPENCL -TEST(MobileNetV2, test_opencl) { - std::vector valid_places({ - Place{TARGET(kHost), PRECISION(kFloat)}, - 
Place{TARGET(kARM), PRECISION(kFloat)}, - Place{TARGET(kOpenCL), PRECISION(kFloat)}, - }); - - TestModel(valid_places, Place({TARGET(kOpenCL), PRECISION(kFloat)})); -} -#endif // LITE_WITH_OPENCL - -#endif // LITE_WITH_ARM - -} // namespace lite -} // namespace paddle diff --git a/lite/api/model_optimize_tool.cc b/lite/api/model_optimize_tool.cc deleted file mode 100644 index 37c09b3446..0000000000 --- a/lite/api/model_optimize_tool.cc +++ /dev/null @@ -1,119 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include <gflags/gflags.h> -#ifdef PADDLE_WITH_TESTING -#include <gtest/gtest.h> -#endif -#include "all_kernel_faked.cc" // NOLINT -#include "lite/api/paddle_api.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/core/op_registry.h" -#include "lite/utils/cp_logging.h" -#include "lite/utils/string.h" - -DEFINE_string(model_dir, - "", - "path of the model. This option will be ignored if model_file " - "and param_file exist"); -DEFINE_string(model_file, "", "model file path of the combined-param model"); -DEFINE_string(param_file, "", "param file path of the combined-param model"); -DEFINE_string( - optimize_out_type, - "protobuf", - "store type of the output optimized model. protobuf/naive_buffer"); -DEFINE_bool(display_kernels, false, "Display kernel information"); -DEFINE_string(optimize_out, "", "path of the output optimized model"); -DEFINE_string(valid_targets, - "arm", - "The targets this model is optimized for; should be one of (arm, " - "opencl, x86), separated by space"); -DEFINE_bool(prefer_int8_kernel, false, "Prefer to run model with int8 kernels"); - -namespace paddle { -namespace lite_api { - -//! Display the kernel information. -void DisplayKernels() { - LOG(INFO) << ::paddle::lite::KernelRegistry::Global().DebugString(); -} - -void Main() { - if (!FLAGS_model_file.empty() && !FLAGS_param_file.empty()) { - LOG(WARNING) - << "Load combined-param model. 
Option model_dir will be ignored"; - } - - if (FLAGS_display_kernels) { - DisplayKernels(); - exit(0); - } - - lite_api::CxxConfig config; - config.set_model_dir(FLAGS_model_dir); - config.set_model_file(FLAGS_model_file); - config.set_param_file(FLAGS_param_file); - - std::vector<Place> valid_places; - auto target_reprs = lite::Split(FLAGS_valid_targets, " "); - for (auto& target_repr : target_reprs) { - if (target_repr == "arm") { - valid_places.emplace_back(TARGET(kARM)); - } else if (target_repr == "opencl") { - valid_places.emplace_back(TARGET(kOpenCL)); - } else if (target_repr == "x86") { - valid_places.emplace_back(TARGET(kX86)); - } else { - LOG(FATAL) << lite::string_format( - "Wrong target '%s' found, please check the command flag " - "'valid_targets'", - target_repr.c_str()); - } - } - valid_places.emplace_back(TARGET(kHost)); - - CHECK(!valid_places.empty()) - << "At least one target should be set; please set the " - "command argument 'valid_targets'"; - - if (FLAGS_prefer_int8_kernel) { - LOG(WARNING) << "Int8 mode is only supported by the ARM target"; - valid_places.push_back(Place{TARGET(kARM), PRECISION(kInt8)}); - config.set_preferred_place(Place{TARGET(kARM), PRECISION(kInt8)}); - } - config.set_valid_places(valid_places); - - auto predictor = lite_api::CreatePaddlePredictor(config); - - LiteModelType model_type; - if (FLAGS_optimize_out_type == "protobuf") { - model_type = LiteModelType::kProtobuf; - } else if (FLAGS_optimize_out_type == "naive_buffer") { - model_type = LiteModelType::kNaiveBuffer; - } else { - LOG(FATAL) << "Unsupported model type: " << FLAGS_optimize_out_type; - } - - predictor->SaveOptimizedModel(FLAGS_optimize_out, model_type); -} - -} // namespace lite_api -} // namespace paddle - -int main(int argc, char** argv) { - google::ParseCommandLineFlags(&argc, &argv, false); - paddle::lite_api::Main(); - return 0; -} diff --git a/lite/api/model_run_test_image.cc b/lite/api/model_run_test_image.cc deleted file mode 100644 index 099a74ed7f..0000000000 --- a/lite/api/model_run_test_image.cc +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
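// A minimal sketch of the flow model_optimize_tool drives for
// --valid_targets=arm --optimize_out_type=naive_buffer; the model paths are
// placeholders and error handling is omitted:
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"

void OptimizeForArm() {
  paddle::lite_api::CxxConfig config;
  config.set_model_dir("/path/to/fluid_model");  // placeholder path
  config.set_valid_places({
      paddle::lite_api::Place{TARGET(kARM), PRECISION(kFloat)},
      paddle::lite_api::Place{TARGET(kHost), PRECISION(kFloat)},
  });
  auto predictor =
      paddle::lite_api::CreatePaddlePredictor<paddle::lite_api::CxxConfig>(
          config);
  // Persist in the naive_buffer format consumed by MobileConfig.
  predictor->SaveOptimizedModel("/path/to/opt_model",  // placeholder path
                                paddle::lite_api::LiteModelType::kNaiveBuffer);
}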
- -#include -#include -#include -#include "lite/api/cxx_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/api/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { - -TEST(model, test) { -#ifdef LITE_WITH_ARM - DeviceInfo::Init(); - DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads); - lite::Predictor predictor; - std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kInt8)}}); - - auto precision = PRECISION(kFloat); - if (FLAGS_int8) { - precision = PRECISION(kInt8); - } - predictor.Build( - FLAGS_model_dir, "", "", Place{TARGET(kARM), precision}, valid_places); - int im_width = FLAGS_im_width; - int im_height = FLAGS_im_height; - auto* input_tensor = predictor.GetInput(0); - auto in_dims = input_tensor->dims(); - input_tensor->Resize( - DDim(std::vector({1, 3, im_width, im_height}))); - auto* data = input_tensor->mutable_data(); - auto item_size = input_tensor->dims().production(); - for (int i = 0; i < item_size; i++) { - data[i] = 1; - } - - for (int i = 0; i < FLAGS_warmup; ++i) { - predictor.Run(); - } - - auto start = GetCurrentUS(); - for (int i = 0; i < FLAGS_repeats; ++i) { - predictor.Run(); - } - auto* output_tensors = predictor.GetOutputs(); - - LOG(INFO) << "======output:========"; - for (auto t : *output_tensors) { - LOG(INFO) << t; - } - LOG(INFO) - << "=====RUN_finished!!============= Speed Report ==================="; - LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads - << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats - << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 - << " ms in average."; -#endif -} - -} // namespace lite -} // namespace paddle diff --git a/lite/api/model_test.cc b/lite/api/model_test.cc deleted file mode 100644 index 6e0a249a81..0000000000 --- a/lite/api/model_test.cc +++ /dev/null @@ -1,181 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
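// The tests in this patch all repeat the same warmup-then-time pattern. A
// small helper capturing it (BenchmarkMs is an illustrative name, not an API
// in this patch), assuming GetCurrentUS() from lite/api/test_helper.h:
#include "lite/api/test_helper.h"

template <typename RunFn>
double BenchmarkMs(RunFn run, int warmup, int repeats) {
  for (int i = 0; i < warmup; ++i) run();  // untimed warmup iterations
  auto start = paddle::lite::GetCurrentUS();
  for (int i = 0; i < repeats; ++i) run();
  return (paddle::lite::GetCurrentUS() - start) / repeats / 1000.0;  // avg ms
}
// e.g. double ms = BenchmarkMs([&] { predictor.Run(); }, FLAGS_warmup,
//                              FLAGS_repeats);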
- -#include -#include -#include -#include "lite/api/paddle_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/api/test_helper.h" -#include "lite/core/device_info.h" -#include "lite/utils/cp_logging.h" -#include "lite/utils/string.h" - -DEFINE_string(input_shape, - "1,3,224,224", - "input shapes, separated by colon and comma"); - -namespace paddle { -namespace lite_api { - -void OutputOptModel(const std::string& load_model_dir, - const std::string& save_optimized_model_dir, - const std::vector>& input_shapes) { - lite_api::CxxConfig config; - config.set_model_dir(load_model_dir); - config.set_preferred_place(Place{TARGET(kX86), PRECISION(kFloat)}); - config.set_valid_places({ - Place{TARGET(kX86), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}, - }); - auto predictor = lite_api::CreatePaddlePredictor(config); - - // delete old optimized model - int ret = system( - paddle::lite::string_format("rm -rf %s", save_optimized_model_dir.c_str()) - .c_str()); - if (ret == 0) { - LOG(INFO) << "delete old optimized model " << save_optimized_model_dir; - } - predictor->SaveOptimizedModel(save_optimized_model_dir, - LiteModelType::kNaiveBuffer); - LOG(INFO) << "Load model from " << load_model_dir; - LOG(INFO) << "Save optimized model to " << save_optimized_model_dir; -} - -#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK -void Run(const std::vector>& input_shapes, - const std::string& model_dir, - const int repeat, - const int thread_num, - const int warmup_times = 0) { -#ifdef LITE_WITH_ARM - lite::DeviceInfo::Init(); - lite::DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, thread_num); -#endif - lite_api::MobileConfig config; - config.set_model_dir(model_dir); - - auto predictor = lite_api::CreatePaddlePredictor(config); - - for (int j = 0; j < input_shapes.size(); ++j) { - auto input_tensor = predictor->GetInput(j); - input_tensor->Resize(input_shapes[j]); - auto input_data = input_tensor->mutable_data(); - int input_num = 1; - for (int i = 0; i < input_shapes[j].size(); ++i) { - input_num *= input_shapes[j][i]; - } - for (int i = 0; i < input_num; ++i) { - input_data[i] = 1.f; - } - } - - for (int i = 0; i < warmup_times; ++i) { - predictor->Run(); - } - - auto start = lite::GetCurrentUS(); - for (int i = 0; i < repeat; ++i) { - predictor->Run(); - } - auto end = lite::GetCurrentUS(); - - LOG(INFO) << "================== Speed Report ==================="; - LOG(INFO) << "Model: " << model_dir << ", threads num " << thread_num - << ", warmup: " << warmup_times << ", repeats: " << repeat - << ", spend " << (end - start) / repeat / 1000.0 - << " ms in average."; - - auto output = predictor->GetOutput(0); - auto out = output->data(); - LOG(INFO) << "out " << out[0]; - LOG(INFO) << "out " << out[1]; - auto output_shape = output->shape(); - int output_num = 1; - for (int i = 0; i < output_shape.size(); ++i) { - output_num *= output_shape[i]; - } - LOG(INFO) << "output_num: " << output_num; -} -#endif - -} // namespace lite_api -} // namespace paddle - -int main(int argc, char** argv) { - gflags::ParseCommandLineFlags(&argc, &argv, true); - if (FLAGS_model_dir == "") { - LOG(INFO) << "usage: " - << "--model_dir /path/to/your/model"; - exit(0); - } - std::string save_optimized_model_dir = FLAGS_model_dir + "opt2"; - - auto split_string = - [](const std::string& str_in) -> std::vector { - std::vector str_out; - std::string tmp_str = str_in; - while (!tmp_str.empty()) { - size_t next_offset = 
tmp_str.find(":"); - str_out.push_back(tmp_str.substr(0, next_offset)); - if (next_offset == std::string::npos) { - break; - } else { - tmp_str = tmp_str.substr(next_offset + 1); - } - } - return str_out; - }; - - auto get_shape = [](const std::string& str_shape) -> std::vector { - std::vector shape; - std::string tmp_str = str_shape; - while (!tmp_str.empty()) { - int dim = atoi(tmp_str.data()); - shape.push_back(dim); - size_t next_offset = tmp_str.find(","); - if (next_offset == std::string::npos) { - break; - } else { - tmp_str = tmp_str.substr(next_offset + 1); - } - } - return shape; - }; - - LOG(INFO) << "input shapes: " << FLAGS_input_shape; - std::vector str_input_shapes = split_string(FLAGS_input_shape); - std::vector> input_shapes; - for (int i = 0; i < str_input_shapes.size(); ++i) { - LOG(INFO) << "input shape: " << str_input_shapes[i]; - input_shapes.push_back(get_shape(str_input_shapes[i])); - } - - // Output optimized model - paddle::lite_api::OutputOptModel( - FLAGS_model_dir, save_optimized_model_dir, input_shapes); - -#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK - // Run inference using optimized model - paddle::lite_api::Run(input_shapes, - save_optimized_model_dir, - FLAGS_repeats, - FLAGS_threads, - FLAGS_warmup); -#endif - return 0; -} diff --git a/lite/api/ocr_attention_test.cc b/lite/api/ocr_attention_test.cc deleted file mode 100644 index 89cf6a3e8d..0000000000 --- a/lite/api/ocr_attention_test.cc +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include -#include -#include "lite/api/cxx_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/api/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { - -void TestModel(const std::vector& valid_places, - const Place& preferred_place, - bool use_npu = false) { - DeviceInfo::Init(); - DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads); - lite::Predictor predictor; - - predictor.Build(FLAGS_model_dir, "", "", preferred_place, valid_places); - - auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim(std::vector({1, 1, 48, 512}))); - auto* data = input_tensor->mutable_data(); - auto item_size = input_tensor->dims().production(); - for (int i = 0; i < item_size; i++) { - data[i] = 1; - } - - auto* init_scores = predictor.GetInput(2); - init_scores->Resize(DDim(std::vector({1, 1}))); - auto* data_scores = init_scores->mutable_data(); - auto scores_size = input_tensor->dims().production(); - for (int i = 0; i < scores_size; i++) { - data_scores[i] = 0; - } - auto lod_scores = init_scores->mutable_lod(); - std::vector> lod_s{{0, 1}, {0, 1}}; - *lod_scores = lod_s; - - auto* init_ids = predictor.GetInput(1); - init_ids->Resize(DDim(std::vector({1, 1}))); - auto* data_ids = init_ids->mutable_data(); - auto ids_size = init_ids->dims().production(); - for (int i = 0; i < ids_size; i++) { - data_ids[i] = 0; - } - auto lod_ids = init_ids->mutable_lod(); - std::vector> lod_i{{0, 1}, {0, 1}}; - *lod_ids = lod_i; - - for (int i = 0; i < FLAGS_warmup; ++i) { - predictor.Run(); - } - - auto start = GetCurrentUS(); - for (int i = 0; i < FLAGS_repeats; ++i) { - predictor.Run(); - } - - LOG(INFO) << "================== Speed Report ==================="; - LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads - << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats - << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 - << " ms in average."; - - // std::vector> results; - // // i = 1 - // results.emplace_back(std::vector( - // {0.00019130898, 9.467885e-05, 0.00015971427, 0.0003650665, - // 0.00026431272, 0.00060884043, 0.0002107942, 0.0015819625, - // 0.0010323516, 0.00010079765, 0.00011006987, 0.0017364529, - // 0.0048292773, 0.0013995157, 0.0018453331, 0.0002428986, - // 0.00020211363, 0.00013668182, 0.0005855956, 0.00025901722})); - // auto* out = predictor.GetOutput(0); - // ASSERT_EQ(out->dims().size(), 2); - // ASSERT_EQ(out->dims()[0], 1); - // ASSERT_EQ(out->dims()[1], 1000); - // - // int step = 50; - // for (int i = 0; i < results.size(); ++i) { - // for (int j = 0; j < results[i].size(); ++j) { - // EXPECT_NEAR(out->data()[j * step + (out->dims()[1] * i)], - // results[i][j], - // 1e-6); - // } - // } -} - -TEST(OcrAttention, test_arm) { - std::vector valid_places({ - Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}, - }); - - TestModel(valid_places, Place({TARGET(kARM), PRECISION(kFloat)})); -} - -} // namespace lite -} // namespace paddle diff --git a/lite/api/paddle_api.cc b/lite/api/paddle_api.cc deleted file mode 100644 index fee4ebf6dc..0000000000 --- a/lite/api/paddle_api.cc +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/api/paddle_api.h" -#include "lite/core/tensor.h" - -namespace paddle { -namespace lite_api { - -Tensor::Tensor(void *raw) : raw_tensor_(raw) {} - -// TODO(Superjomn) refine this by using another `const void* const_raw`; -Tensor::Tensor(const void *raw) { raw_tensor_ = const_cast<void *>(raw); } - -lite::Tensor *tensor(void *x) { return static_cast<lite::Tensor *>(x); } -const lite::Tensor *ctensor(void *x) { - return static_cast<const lite::Tensor *>(x); -} - -void Tensor::Resize(const shape_t &shape) { - tensor(raw_tensor_)->Resize(shape); -} - -template <> -const float *Tensor::data() const { - return ctensor(raw_tensor_)->data<float>(); -} -template <> -const int8_t *Tensor::data() const { - return ctensor(raw_tensor_)->data<int8_t>(); -} - -template <> -float *Tensor::mutable_data() const { - return tensor(raw_tensor_)->mutable_data<float>(); -} -template <> -int8_t *Tensor::mutable_data() const { - return tensor(raw_tensor_)->mutable_data<int8_t>(); -} - -shape_t Tensor::shape() const { - return ctensor(raw_tensor_)->dims().Vectorize(); -} - -lod_t Tensor::lod() const { return ctensor(raw_tensor_)->lod(); } - -void Tensor::SetLoD(const lod_t &lod) { tensor(raw_tensor_)->set_lod(lod); } - -void PaddlePredictor::SaveOptimizedModel(const std::string &model_dir, - LiteModelType model_type) { - LOG(FATAL) - << "The SaveOptimizedModel API is only supported by CxxConfig predictor."; -} - -template <typename ConfigT> -std::shared_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT &) { - return std::shared_ptr<PaddlePredictor>(); -} - -} // namespace lite_api -} // namespace paddle diff --git a/lite/api/paddle_api.h b/lite/api/paddle_api.h deleted file mode 100644 index b1a8b21935..0000000000 --- a/lite/api/paddle_api.h +++ /dev/null @@ -1,167 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -/* - * This file defines PaddlePredictor, the api for lite. It supports multiple - * hardware including ARM, X86, OpenCL, CUDA and so on. - */ - -#ifndef PADDLE_LITE_API_H_ // NOLINT -#define PADDLE_LITE_API_H_ -#include <memory> -#include <string> -#include <vector> -#include "paddle_place.h" // NOLINT - -namespace paddle { -namespace lite_api { - -using shape_t = std::vector<int64_t>; -using lod_t = std::vector<std::vector<uint64_t>>; - -enum class LiteModelType { kProtobuf = 0, kNaiveBuffer, UNK }; - -struct LITE_API Tensor { - explicit Tensor(void* raw); - explicit Tensor(const void* raw); - - void Resize(const shape_t& shape); - - /// Readonly data. - template <typename T> - const T* data() const; - - template <typename T> - T* mutable_data() const; - - /// Shape of the tensor. 
- shape_t shape() const; - - // LoD of the tensor - lod_t lod() const; - - // Set LoD of the tensor - void SetLoD(const lod_t& lod); - - private: - void* raw_tensor_; -}; - -/// The PaddlePredictor defines the basic interfaces for different kinds of -/// predictors. -class LITE_API PaddlePredictor { - public: - PaddlePredictor() = default; - - /// Get i-th input. - virtual std::unique_ptr GetInput(int i) = 0; - - /// Get i-th output. - virtual std::unique_ptr GetOutput(int i) const = 0; - - virtual void Run() = 0; - - /// Get a readonly tensor, return null if no one called `name` exists. - virtual std::unique_ptr GetTensor( - const std::string& name) const = 0; - - /// Persist the optimized model to disk. This API is only supported by - /// CxxConfig, and the persisted model can be reused for MobileConfig. - virtual void SaveOptimizedModel( - const std::string& model_dir, - LiteModelType model_type = LiteModelType::kProtobuf); - - virtual ~PaddlePredictor() = default; -}; - -/// Base class for all the configs. -class LITE_API ConfigBase { - std::string model_dir_; - - public: - void set_model_dir(const std::string& x) { model_dir_ = x; } - - const std::string& model_dir() const { return model_dir_; } -}; - -/// CxxConfig is the config for the Full feature predictor. -class LITE_API CxxConfig : public ConfigBase { - Place preferred_place_; - std::vector valid_places_; - std::string model_file_; - std::string param_file_; - bool model_from_memory_{false}; - - public: - void set_preferred_place(const Place& x) { preferred_place_ = x; } - void set_valid_places(const std::vector& x) { valid_places_ = x; } - void set_model_file(const std::string& path) { model_file_ = path; } - void set_param_file(const std::string& path) { param_file_ = path; } - void set_model_buffer(const char* model_buffer, - size_t model_buffer_size, - const char* param_buffer, - size_t param_buffer_size) { - model_file_ = std::string(model_buffer, model_buffer + model_buffer_size); - param_file_ = std::string(param_buffer, param_buffer + param_buffer_size); - model_from_memory_ = true; - } - - const Place& preferred_place() const { return preferred_place_; } - const std::vector& valid_places() const { return valid_places_; } - std::string model_file() const { return model_file_; } - std::string param_file() const { return param_file_; } - bool model_from_memory() const { return model_from_memory_; } -}; - -/// MobileConfig is the config for the light weight predictor, it will skip -/// IR optimization or other unnecessary stages. 
-class LITE_API MobileConfig : public ConfigBase { - PowerMode mode_{LITE_POWER_HIGH}; - int threads_{1}; - std::string model_buffer_; - std::string param_buffer_; - bool model_from_memory_{false}; - - public: - MobileConfig(Place preferred_place = Place(TARGET(kARM), - PRECISION(kFloat), - DATALAYOUT(kNCHW)), - PowerMode mode = LITE_POWER_HIGH, - int threads = 1) - : mode_(mode), threads_(threads) {} - void set_power_mode(PowerMode mode) { mode_ = mode; } - void set_threads(int threads) { threads_ = threads; } - void set_model_buffer(const char* model_buffer, - size_t model_buffer_size, - const char* param_buffer, - size_t param_buffer_size) { - model_buffer_ = std::string(model_buffer, model_buffer + model_buffer_size); - param_buffer_ = std::string(param_buffer, param_buffer + param_buffer_size); - model_from_memory_ = true; - } - - PowerMode power_mode() const { return mode_; } - int threads() const { return threads_; } - bool model_from_memory() const { return model_from_memory_; } - const std::string& model_buffer() const { return model_buffer_; } - const std::string& param_buffer() const { return param_buffer_; } -}; - -template -std::shared_ptr CreatePaddlePredictor(const ConfigT&); - -} // namespace lite_api -} // namespace paddle - -#endif // NOLINT diff --git a/lite/api/paddle_api_test.cc b/lite/api/paddle_api_test.cc deleted file mode 100644 index 02502ff9c8..0000000000 --- a/lite/api/paddle_api_test.cc +++ /dev/null @@ -1,122 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
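// A minimal sketch of feeding a sequence input through the API declared above,
// using lod_t = vector<vector<uint64_t>>; the two-level {{0, 1}, {0, 1}} LoD
// mirrors the OCR attention test earlier in this patch (the helper name and
// input index are illustrative):
void FeedLodInput(paddle::lite_api::PaddlePredictor* predictor) {
  auto ids = predictor->GetInput(1);   // illustrative input index
  ids->Resize({1, 1});
  ids->mutable_data<float>()[0] = 0.f;
  ids->SetLoD({{0, 1}, {0, 1}});       // sequence offsets per LoD level
}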
- -#include "lite/api/paddle_api.h" -#include -#include -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/utils/cp_logging.h" -#include "lite/utils/io.h" -DEFINE_string(model_dir, "", ""); - -namespace paddle { -namespace lite_api { - -TEST(CxxApi, run) { - lite_api::CxxConfig config; - config.set_model_dir(FLAGS_model_dir); - config.set_preferred_place(Place{TARGET(kX86), PRECISION(kFloat)}); - config.set_valid_places({ - Place{TARGET(kX86), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}, - }); - - auto predictor = lite_api::CreatePaddlePredictor(config); - - auto input_tensor = predictor->GetInput(0); - input_tensor->Resize(std::vector({100, 100})); - auto* data = input_tensor->mutable_data(); - for (int i = 0; i < 100 * 100; i++) { - data[i] = i; - } - - predictor->Run(); - - auto output = predictor->GetOutput(0); - auto* out = output->data(); - LOG(INFO) << out[0]; - LOG(INFO) << out[1]; - - EXPECT_NEAR(out[0], 50.2132, 1e-3); - EXPECT_NEAR(out[1], -28.8729, 1e-3); - - predictor->SaveOptimizedModel(FLAGS_model_dir + ".opt2"); - predictor->SaveOptimizedModel(FLAGS_model_dir + ".opt2.naive", - LiteModelType::kNaiveBuffer); -} - -// Demo1 for Mobile Devices :Load model from file and run -#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK -TEST(LightApi, run) { - lite_api::MobileConfig config; - config.set_model_dir(FLAGS_model_dir + ".opt2.naive"); - - auto predictor = lite_api::CreatePaddlePredictor(config); - - auto input_tensor = predictor->GetInput(0); - input_tensor->Resize(std::vector({100, 100})); - auto* data = input_tensor->mutable_data(); - for (int i = 0; i < 100 * 100; i++) { - data[i] = i; - } - - predictor->Run(); - - auto output = predictor->GetOutput(0); - auto* out = output->data(); - LOG(INFO) << out[0]; - LOG(INFO) << out[1]; - - EXPECT_NEAR(out[0], 50.2132, 1e-3); - EXPECT_NEAR(out[1], -28.8729, 1e-3); -} - -// Demo2 for Loading model from memory -TEST(MobileConfig, LoadfromMemory) { - // Get naive buffer - auto model_path = std::string(FLAGS_model_dir) + ".opt2.naive/__model__.nb"; - auto params_path = std::string(FLAGS_model_dir) + ".opt2.naive/param.nb"; - std::string model_buffer = lite::ReadFile(model_path); - size_t size_model = model_buffer.length(); - std::string params_buffer = lite::ReadFile(params_path); - size_t size_params = params_buffer.length(); - // set model buffer and run model - lite_api::MobileConfig config; - config.set_model_buffer( - model_buffer.c_str(), size_model, params_buffer.c_str(), size_params); - - auto predictor = lite_api::CreatePaddlePredictor(config); - auto input_tensor = predictor->GetInput(0); - input_tensor->Resize(std::vector({100, 100})); - auto* data = input_tensor->mutable_data(); - for (int i = 0; i < 100 * 100; i++) { - data[i] = i; - } - - predictor->Run(); - - const auto output = predictor->GetOutput(0); - const float* raw_output = output->data(); - - for (int i = 0; i < 10; i++) { - LOG(INFO) << "out " << raw_output[i]; - } -} - -#endif - -} // namespace lite_api -} // namespace paddle diff --git a/lite/api/paddle_lite_factory_helper.h b/lite/api/paddle_lite_factory_helper.h deleted file mode 100644 index e99127e233..0000000000 --- a/lite/api/paddle_lite_factory_helper.h +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -/* - * This file defines some MACROS that explicitly determine the op, kernel, mir - * passes used in the inference lib. - */ -#pragma once - -#define USE_LITE_OP(op_type__) \ - extern int touch_op_##op_type__(); \ - int LITE_OP_REGISTER_FAKE(op_type__) __attribute__((unused)) = \ - touch_op_##op_type__(); - -#define USE_LITE_KERNEL(op_type__, target__, precision__, layout__, alias__) \ - extern int touch_##op_type__##target__##precision__##layout__##alias__(); \ - int op_type__##target__##precision__##layout__##alias__##__use_lite_kernel \ - __attribute__((unused)) = \ - touch_##op_type__##target__##precision__##layout__##alias__(); - -#define USE_MIR_PASS(name__) \ - extern bool mir_pass_registry##name__##_fake(); \ - static bool mir_pass_usage##name__ __attribute__((unused)) = \ - mir_pass_registry##name__##_fake(); - -#define LITE_OP_REGISTER_FAKE(op_type__) op_type__##__registry__ diff --git a/lite/api/paddle_place.cc b/lite/api/paddle_place.cc deleted file mode 100644 index dbdf9ff269..0000000000 --- a/lite/api/paddle_place.cc +++ /dev/null @@ -1,117 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
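// The USE_* macros above exist to pull op/kernel/pass registrations into the
// final binary. For example, USE_LITE_OP(fc) expands roughly to:
//   extern int touch_op_fc();
//   int fc__registry__ __attribute__((unused)) = touch_op_fc();
// Referencing touch_op_fc() forces the linker to keep the translation unit
// that registers the "fc" op, even though nothing else names it directly.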
- -#include "lite/api/paddle_place.h" -#include "lite/utils/cp_logging.h" -#include "lite/utils/hash.h" -#include "lite/utils/replace_stl/stream.h" -#include "lite/utils/string.h" - -namespace paddle { -namespace lite_api { - -size_t Place::hash() const { - std::hash h; - size_t hash = h(static_cast(target)); - hash = lite::hash_combine(hash, static_cast(precision)); - hash = lite::hash_combine(hash, static_cast(layout)); - hash = lite::hash_combine(hash, static_cast(device)); - return hash; -} - -bool operator<(const Place& a, const Place& b) { - if (a.target != b.target) return a.target < b.target; - if (a.precision != b.precision) return a.precision < b.precision; - if (a.layout != b.layout) return a.layout < b.layout; - if (a.device != b.device) return a.device < b.device; - return false; -} - -std::string Place::DebugString() const { - STL::stringstream os; - os << TargetToStr(target) << "/" << PrecisionToStr(precision) << "/" - << DataLayoutToStr(layout); - return os.str(); -} - -const std::string& TargetToStr(TargetType target) { - static const std::string target2string[] = { - "unk", "host", "x86", "cuda", "arm", "opencl", "any", "fpga", "npu"}; - auto x = static_cast(target); - CHECK_LT(x, static_cast(TARGET(NUM))); - return target2string[x]; -} - -const std::string& PrecisionToStr(PrecisionType precision) { - static const std::string precision2string[] = {"unk", - "float", - "int8_t", - "int32_t", - "any", - "float16", - "bool", - "int64_t", - "int16_t"}; - auto x = static_cast(precision); - CHECK_LT(x, static_cast(PRECISION(NUM))); - return precision2string[x]; -} - -const std::string& DataLayoutToStr(DataLayoutType layout) { - static const std::string datalayout2string[] = {"unk", "NCHW", "any", "NHWC"}; - auto x = static_cast(layout); - CHECK_LT(x, static_cast(DATALAYOUT(NUM))); - return datalayout2string[x]; -} - -const std::string& TargetRepr(TargetType target) { - static const std::string target2string[] = {"kUnk", - "kHost", - "kX86", - "kCUDA", - "kARM", - "kOpenCL", - "kAny", - "kFPGA", - "kNPU"}; - auto x = static_cast(target); - CHECK_LT(x, static_cast(TARGET(NUM))); - return target2string[x]; -} - -const std::string& PrecisionRepr(PrecisionType precision) { - static const std::string precision2string[] = {"kUnk", - "kFloat", - "kInt8", - "kInt32", - "kAny", - "kFP16", - "kBool", - "kInt64", - "kInt16"}; - auto x = static_cast(precision); - CHECK_LT(x, static_cast(PRECISION(NUM))); - return precision2string[x]; -} - -const std::string& DataLayoutRepr(DataLayoutType layout) { - static const std::string datalayout2string[] = { - "kUnk", "kNCHW", "kAny", "kNHWC"}; - auto x = static_cast(layout); - CHECK_LT(x, static_cast(DATALAYOUT(NUM))); - return datalayout2string[x]; -} - -} // namespace lite_api -} // namespace paddle diff --git a/lite/api/paddle_place.h b/lite/api/paddle_place.h deleted file mode 100644 index 5e4f2ed21c..0000000000 --- a/lite/api/paddle_place.h +++ /dev/null @@ -1,164 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include - -// Generic helper definitions for shared library support -#if defined _WIN32 || defined __CYGWIN__ -#define PADDLE_LITE_HELPER_DLL_IMPORT __declspec(dllimport) -#define PADDLE_LITE_HELPER_DLL_EXPORT __declspec(dllexport) -#define PADDLE_LITE_HELPER_DLL_LOCAL -#else -#if __GNUC__ >= 4 -#define PADDLE_LITE_HELPER_DLL_IMPORT __attribute__((visibility("default"))) -#define PADDLE_LITE_HELPER_DLL_EXPORT __attribute__((visibility("default"))) -#else -#define PADDLE_LITE_HELPER_DLL_IMPORT -#define PADDLE_LITE_HELPER_DLL_EXPORT -#endif -#endif - -#ifdef LITE_ON_TINY_PUBLISH -#define LITE_API PADDLE_LITE_HELPER_DLL_EXPORT -#define LITE_API_IMPORT PADDLE_LITE_HELPER_DLL_IMPORT -#else -#define LITE_API -#define LITE_API_IMPORT -#endif - -namespace paddle { -namespace lite_api { - -enum class TargetType : int { - kUnk = 0, - kHost = 1, - kX86 = 2, - kCUDA = 3, - kARM = 4, - kOpenCL = 5, - kFPGA = 7, - kNPU = 8, - kAny = 6, // any target - NUM = 9, // number of fields. -}; -enum class PrecisionType : int { - kUnk = 0, - kFloat = 1, - kInt8 = 2, - kInt32 = 3, - kAny = 4, // any precision - kFP16 = 5, - kBool = 6, - kInt64 = 7, - kInt16 = 8, - NUM = 9, // number of fields. -}; -enum class DataLayoutType : int { - kUnk = 0, - kNCHW = 1, - kNHWC = 3, - kAny = 2, // any data layout - NUM = 4, // number of fields. -}; - -typedef enum { - LITE_POWER_HIGH = 0, - LITE_POWER_LOW = 1, - LITE_POWER_FULL = 2, - LITE_POWER_NO_BIND = 3, - LITE_POWER_RAND_HIGH = 4, - LITE_POWER_RAND_LOW = 5 -} PowerMode; - -enum class ActivationType : int { - kIndentity = 0, - kRelu = 1, - kRelu6 = 2, - kPRelu = 3, - kLeakyRelu = 4, - kSigmoid = 5, - kTanh = 6, - kSwish = 7 -}; - -static size_t PrecisionTypeLength(PrecisionType type) { - switch (type) { - case PrecisionType::kFloat: - return 4; - case PrecisionType::kInt8: - return 1; - case PrecisionType::kInt32: - return 4; - case PrecisionType::kFP16: - return 2; - default: - return 4; - } -} - -#define TARGET(item__) paddle::lite_api::TargetType::item__ -#define PRECISION(item__) paddle::lite_api::PrecisionType::item__ -#define DATALAYOUT(item__) paddle::lite_api::DataLayoutType::item__ - -const std::string& TargetToStr(TargetType target); - -const std::string& PrecisionToStr(PrecisionType precision); - -const std::string& DataLayoutToStr(DataLayoutType layout); - -const std::string& TargetRepr(TargetType target); - -const std::string& PrecisionRepr(PrecisionType precision); - -const std::string& DataLayoutRepr(DataLayoutType layout); - -/* - * Place specifies the execution context of a Kernel or input/output for a - * kernel. It is used to make the analysis of the MIR more clear and accurate. 
- */ -struct LITE_API Place { - TargetType target{TARGET(kUnk)}; - PrecisionType precision{PRECISION(kUnk)}; - DataLayoutType layout{DATALAYOUT(kUnk)}; - int16_t device{0}; // device ID - - Place() = default; - Place(TargetType target, - PrecisionType precision = PRECISION(kFloat), - DataLayoutType layout = DATALAYOUT(kNCHW), - int16_t device = 0) - : target(target), precision(precision), layout(layout), device(device) {} - - bool is_valid() const { - return target != TARGET(kUnk) && precision != PRECISION(kUnk) && - layout != DATALAYOUT(kUnk); - } - - size_t hash() const; - - bool operator==(const Place& other) const { - return target == other.target && precision == other.precision && - layout == other.layout && device == other.device; - } - - bool operator!=(const Place& other) const { return !(*this == other); } - - friend bool operator<(const Place& a, const Place& b); - - std::string DebugString() const; -}; - -} // namespace lite_api -} // namespace paddle diff --git a/lite/api/paddle_use_passes.h b/lite/api/paddle_use_passes.h deleted file mode 100644 index e43c0f2768..0000000000 --- a/lite/api/paddle_use_passes.h +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include "paddle_lite_factory_helper.h" // NOLINT - -USE_MIR_PASS(demo); -USE_MIR_PASS(static_kernel_pick_pass); -USE_MIR_PASS(variable_place_inference_pass); -USE_MIR_PASS(type_target_cast_pass); -USE_MIR_PASS(generate_program_pass); -USE_MIR_PASS(subgraph_program_pass); - -USE_MIR_PASS(io_copy_kernel_pick_pass); -USE_MIR_PASS(argument_type_display_pass); -USE_MIR_PASS(runtime_context_assign_pass); -USE_MIR_PASS(graph_visualze); - -USE_MIR_PASS(lite_conv_bn_fuse_pass); -USE_MIR_PASS(lite_fc_fuse_pass); -USE_MIR_PASS(lite_shuffle_channel_fuse_pass); -USE_MIR_PASS(lite_transpose_softmax_transpose_fuse_pass); -USE_MIR_PASS(lite_interpolate_fuse_pass); -USE_MIR_PASS(identity_scale_eliminate_pass); -USE_MIR_PASS(lite_conv_elementwise_fuse_pass); -USE_MIR_PASS(lite_conv_activation_fuse_pass); -USE_MIR_PASS(lite_elementwise_add_activation_fuse_pass); -USE_MIR_PASS(lite_quant_dequant_fuse_pass); -USE_MIR_PASS(type_precision_cast_pass); -USE_MIR_PASS(type_layout_cast_pass); diff --git a/lite/api/resnet18_test.cc b/lite/api/resnet18_test.cc deleted file mode 100644 index c003dc1dba..0000000000 --- a/lite/api/resnet18_test.cc +++ /dev/null @@ -1,88 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include "lite/api/cxx_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/api/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { - -#ifdef LITE_WITH_ARM -TEST(ResNet18, test) { - lite::Predictor predictor; - std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}}); - - predictor.Build(FLAGS_model_dir, - "", - "", - Place{TARGET(kARM), PRECISION(kFloat)}, - valid_places); - - auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim(std::vector({1, 3, 224, 224}))); - auto* data = input_tensor->mutable_data(); - auto item_size = input_tensor->dims().production(); - for (int i = 0; i < item_size; i++) { - data[i] = 1; - } - - for (int i = 0; i < FLAGS_warmup; ++i) { - predictor.Run(); - } - - auto start = GetCurrentUS(); - for (int i = 0; i < FLAGS_repeats; ++i) { - predictor.Run(); - } - - LOG(INFO) << "================== Speed Report ==================="; - LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads - << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats - << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 - << " ms in average."; - - std::vector> results; - // i = 1 - results.emplace_back(std::vector( - {0.00020891492, 0.00012855872, 0.00019274367, 0.00031139381, - 0.0003184143, 0.00022596598, 0.00025920002, 0.0006651449, - 0.0015664422, 0.0002835265, 0.0001418782, 0.0013916927, - 0.007779476, 0.0020724828, 0.0012296075, 0.00073855236, - 0.00014572912, 0.00025809053, 0.0004427299, 0.00042198936})); - auto* out = predictor.GetOutput(0); - ASSERT_EQ(out->dims().size(), 2); - ASSERT_EQ(out->dims()[0], 1); - ASSERT_EQ(out->dims()[1], 1000); - - int step = 50; - for (int i = 0; i < results.size(); ++i) { - for (int j = 0; j < results[i].size(); ++j) { - EXPECT_NEAR(out->data()[j * step + (out->dims()[1] * i)], - results[i][j], - 1e-6); - } - } -} -#endif - -} // namespace lite -} // namespace paddle diff --git a/lite/api/resnet50_test.cc b/lite/api/resnet50_test.cc deleted file mode 100644 index 6e78d12be0..0000000000 --- a/lite/api/resnet50_test.cc +++ /dev/null @@ -1,107 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
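// The reference vectors in these classification tests hold every 50th logit of
// the 1000-class output, so the EXPECT_NEAR indexing maps row i / sample j back
// to the flat buffer as below (a restating of the tests' arithmetic, not a new
// API):
inline int RefIndex(int i, int j, int num_classes = 1000, int step = 50) {
  return j * step + num_classes * i;  // j-th sampled logit of the i-th row
}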
- -#include -#include -#include -#include "lite/api/cxx_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/api/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { - -#ifdef LITE_WITH_ARM -void TestModel(const std::vector& valid_places, - const Place& preferred_place) { - DeviceInfo::Init(); - DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads); - lite::Predictor predictor; - - predictor.Build(FLAGS_model_dir, "", "", preferred_place, valid_places); - - auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim(std::vector({1, 3, 224, 224}))); - auto* data = input_tensor->mutable_data(); - auto item_size = input_tensor->dims().production(); - for (int i = 0; i < item_size; i++) { - data[i] = 1; - } - - for (int i = 0; i < FLAGS_warmup; ++i) { - predictor.Run(); - } - - auto start = GetCurrentUS(); - for (int i = 0; i < FLAGS_repeats; ++i) { - predictor.Run(); - } - - LOG(INFO) << "================== Speed Report ==================="; - LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads - << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats - << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 - << " ms in average."; - - std::vector> results; - // i = 1 - results.emplace_back(std::vector( - {0.00024139918, 0.00020566184, 0.00022418296, 0.00041731037, - 0.0005366107, 0.00016948722, 0.00028638865, 0.0009257241, - 0.00072681636, 8.531815e-05, 0.0002129998, 0.0021168243, - 0.006387163, 0.0037145028, 0.0012812682, 0.00045948103, - 0.00013535398, 0.0002483765, 0.00076759676, 0.0002773295})); - auto* out = predictor.GetOutput(0); - ASSERT_EQ(out->dims().size(), 2); - ASSERT_EQ(out->dims()[0], 1); - ASSERT_EQ(out->dims()[1], 1000); - - int step = 50; - for (int i = 0; i < results.size(); ++i) { - for (int j = 0; j < results[i].size(); ++j) { - EXPECT_NEAR(out->data()[j * step + (out->dims()[1] * i)], - results[i][j], - 1e-6); - } - } -} - -TEST(ResNet50, test_arm) { - std::vector valid_places({ - Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}, - }); - - TestModel(valid_places, Place({TARGET(kARM), PRECISION(kFloat)})); -} - -#ifdef LITE_WITH_OPENCL -TEST(ResNet50, test_opencl) { - std::vector valid_places({ - Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}, - Place{TARGET(kOpenCL), PRECISION(kFloat)}, - }); - - TestModel(valid_places, Place({TARGET(kOpenCL), PRECISION(kFloat)})); -} -#endif // LITE_WITH_OPENCL - -#endif // LITE_WITH_ARM - -} // namespace lite -} // namespace paddle diff --git a/lite/api/resnet50_test_fpga.cc b/lite/api/resnet50_test_fpga.cc deleted file mode 100644 index 7ea81cc746..0000000000 --- a/lite/api/resnet50_test_fpga.cc +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
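// In the OpenCL variants above, valid_places lists kHost, kARM and kOpenCL
// while the preferred place picks kOpenCL; as the tests suggest, ops without an
// OpenCL kernel fall back to the ARM or host implementations. A condensed
// sketch of that place list (the helper name is illustrative):
#include <vector>
#include "lite/api/paddle_place.h"

std::vector<paddle::lite_api::Place> OpenclWithArmFallback() {
  using paddle::lite_api::Place;
  return {
      Place{TARGET(kHost), PRECISION(kFloat)},    // host kernels (feed/fetch)
      Place{TARGET(kARM), PRECISION(kFloat)},     // CPU fallback
      Place{TARGET(kOpenCL), PRECISION(kFloat)},  // preferred GPU kernels
  };
}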
- -#include -#include -#include -#include "lite/api/cxx_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/api/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { - -#ifdef LITE_WITH_FPGA -TEST(ResNet50, test) { - lite::Predictor predictor; - std::vector valid_places( - {Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)}, - Place{TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNHWC)}}); - - predictor.Build(FLAGS_model_dir, - "", - "", - Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)}, - valid_places); - - auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim(std::vector({1, 3, 224, 224}))); - auto* data = input_tensor->mutable_data(); - auto item_size = input_tensor->dims().production(); - for (int i = 0; i < item_size; i++) { - data[i] = 1; - } - - for (int i = 0; i < FLAGS_warmup; ++i) { - predictor.Run(); - } - - auto start = GetCurrentUS(); - for (int i = 0; i < FLAGS_repeats; ++i) { - predictor.Run(); - } - - LOG(INFO) << "================== Speed Report ==================="; -} -#endif - -} // namespace lite -} // namespace paddle diff --git a/lite/api/shufflenetv2_test.cc b/lite/api/shufflenetv2_test.cc deleted file mode 100644 index f67bc8c6cf..0000000000 --- a/lite/api/shufflenetv2_test.cc +++ /dev/null @@ -1,92 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
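// The FPGA test is the one spot in this patch that exercises a non-default
// precision and layout; constructing that Place looks like the following
// (is_valid() holds because target, precision and layout are all non-kUnk):
paddle::lite_api::Place fpga_place{TARGET(kFPGA), PRECISION(kFP16),
                                   DATALAYOUT(kNHWC)};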
- -#include -#include -#include -#include "lite/api/cxx_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/api/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { - -void TestModel(const std::vector& valid_places, - const Place& preferred_place) { - DeviceInfo::Init(); - DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads); - lite::Predictor predictor; - - predictor.Build(FLAGS_model_dir, "", "", preferred_place, valid_places); - - auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim((std::vector({1, 3, 224, 224})))); - auto* data = input_tensor->mutable_data(); - auto item_size = input_tensor->dims().production(); - for (int i = 0; i < item_size; ++i) { - data[i] = 1; - } - - for (int i = 0; i < FLAGS_warmup; ++i) { - predictor.Run(); - } - - auto start = GetCurrentUS(); - for (int i = 0; i < FLAGS_repeats; ++i) { - predictor.Run(); - } - - LOG(INFO) << "================== Speed Report ==================="; - LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads - << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats - << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 - << " ms in average."; - - std::vector> results; - results.emplace_back(std::vector( - {0.00020622103, 9.36264e-05, 0.0002608151, 0.0004974526, - 0.00028529152, 9.3994095e-05, 0.00028626667, 0.0011567438, - 0.00094107876, 8.8955254e-05, 4.1932417e-05, 0.00016469292, - 0.006776762, 0.0028232741, 0.00024495262, 0.00022493803, - 0.00015700555, 0.00013883937, 0.00093898486, 0.00018184447})); - auto* out = predictor.GetOutput(0); - ASSERT_EQ(out->dims().size(), 2); - ASSERT_EQ(out->dims()[0], 1); - ASSERT_EQ(out->dims()[1], 1000); - - int step = 50; - for (int i = 0; i < results.size(); ++i) { - for (int j = 0; j < results[i].size(); ++j) { - EXPECT_NEAR(out->data()[j * step + (out->dims()[1] * i)], - results[i][j], - 1e-6); - } - } -} - -TEST(ShuffleNetV2, test_arm) { - std::vector valid_places({ - Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}, - // Place{TARGET(kOpenCL), PRECISION(kFloat)}, - }); - - TestModel(valid_places, Place({TARGET(kARM), PRECISION(kFloat)})); -} - -} // namespace lite -} // namespace paddle diff --git a/lite/api/test_googlenet_lite.cc b/lite/api/test_googlenet_lite.cc deleted file mode 100644 index 4c9ecd90c6..0000000000 --- a/lite/api/test_googlenet_lite.cc +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include "lite/api/cxx_api.h" -#include "lite/api/lite_api_test_helper.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/core/op_registry.h" -#include "lite/core/tensor.h" - -// for googlenet -DEFINE_string(model_dir, "", ""); - -namespace paddle { -namespace lite { -#ifdef LITE_WITH_X86 -TEST(CXXApi, test_lite_googlenet) { - lite::Predictor predictor; - std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kX86), PRECISION(kFloat)}}); - - // LOG(INFO)<<"FLAGS_eval_googlenet_dir:"<Resize(DDim(std::vector({1, 3, 224, 224}))); - auto* data = input_tensor->mutable_data(); - for (int i = 0; i < input_tensor->dims().production(); i++) { - data[i] = 1; - } - predictor.Run(); - - auto* out = predictor.GetOutput(0); - std::vector results( - {0.00034298553, 0.0008200012, 0.0005046297, 0.000839279, - 0.00052616704, 0.0003447803, 0.0010877076, 0.00081762316, - 0.0003941339, 0.0011430943, 0.0008892841, 0.00080191303, - 0.0004442384, 0.000658702, 0.0026721435, 0.0013686896, - 0.0005618166, 0.0006556497, 0.0006984528, 0.0014619455}); - for (size_t i = 0; i < results.size(); ++i) { - EXPECT_NEAR(out->data()[i * 51], results[i], 1e-5); - } - ASSERT_EQ(out->dims().size(), 2); - ASSERT_EQ(out->dims()[0], 1); - ASSERT_EQ(out->dims()[1], 1000); -} -#endif -} // namespace lite -} // namespace paddle diff --git a/lite/api/test_helper.h b/lite/api/test_helper.h deleted file mode 100644 index d835c030f0..0000000000 --- a/lite/api/test_helper.h +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include - -// for eval -DEFINE_string(model_dir, "", "model dir"); -DEFINE_int32(warmup, 0, "warmup times"); -DEFINE_int32(repeats, 1, "repeats times"); -DEFINE_int32(threads, 1, "threads num"); -DEFINE_int32(im_width, 224, "image width"); -DEFINE_int32(im_height, 224, "image height"); -DEFINE_bool(int8, false, "is run int8"); - -namespace paddle { -namespace lite { - -inline double GetCurrentUS() { - struct timeval time; - gettimeofday(&time, NULL); - return 1e+6 * time.tv_sec + time.tv_usec; -} - -} // namespace lite -} // namespace paddle diff --git a/lite/api/test_inceptionv4_lite_x86.cc b/lite/api/test_inceptionv4_lite_x86.cc deleted file mode 100644 index 5d1dbbe144..0000000000 --- a/lite/api/test_inceptionv4_lite_x86.cc +++ /dev/null @@ -1,112 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. 
All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include "lite/api/cxx_api.h" -#include "lite/api/lite_api_test_helper.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/api/test_helper.h" -#include "lite/core/op_registry.h" -#include "lite/core/tensor.h" - -namespace paddle { -namespace lite { - -TEST(InceptionV4, test_inceptionv4_lite_x86) { - lite::Predictor predictor; - std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kX86), PRECISION(kFloat)}}); - - // LOG(INFO)<<"FLAGS_eval_googlenet_dir:"< passes({"static_kernel_pick_pass", - "variable_place_inference_pass", - "type_target_cast_pass", - "variable_place_inference_pass", - "io_copy_kernel_pick_pass", - "variable_place_inference_pass", - "runtime_context_assign_pass"}); - predictor.Build(model_dir, - "", - "", - Place{TARGET(kX86), PRECISION(kFloat)}, - valid_places, - passes); - - auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim(std::vector({1, 3, 224, 224}))); - auto* data = input_tensor->mutable_data(); - for (int i = 0; i < input_tensor->dims().production(); i++) { - data[i] = 1; - } - - for (int i = 0; i < FLAGS_warmup; ++i) { - predictor.Run(); - } - - auto start = GetCurrentUS(); - for (int i = 0; i < FLAGS_repeats; ++i) { - predictor.Run(); - } - - LOG(INFO) << "================== Speed Report ==================="; - LOG(INFO) << "Model: " << FLAGS_model_dir << ", warmup: " << FLAGS_warmup - << ", repeats: " << FLAGS_repeats << ", spend " - << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 - << " ms in average."; - - std::vector> results; - // i = 1 - results.emplace_back(std::vector( - {0.0011684548, 0.0010390386, 0.0011301535, 0.0010133048, - 0.0010259597, 0.0010982729, 0.00093195855, 0.0009141837, - 0.00096620916, 0.00089982944, 0.0010064574, 0.0010474789, - 0.0009782845, 0.0009230255, 0.0010548076, 0.0010974824, - 0.0010612885, 0.00089107914, 0.0010112736, 0.00097655767})); - - auto* out = predictor.GetOutput(0); - ASSERT_EQ(out->dims().size(), 2); - ASSERT_EQ(out->dims()[0], 1); - ASSERT_EQ(out->dims()[1], 1000); - - int step = 50; - for (int i = 0; i < results.size(); ++i) { - for (int j = 0; j < results[i].size(); ++j) { - EXPECT_NEAR(out->data()[j * step + (out->dims()[1] * i)], - 
results[i][j], - 1e-6); - } - } -} - -} // namespace lite -} // namespace paddle diff --git a/lite/api/test_mobilenetv1_lite_x86.cc b/lite/api/test_mobilenetv1_lite_x86.cc deleted file mode 100644 index d755410b6a..0000000000 --- a/lite/api/test_mobilenetv1_lite_x86.cc +++ /dev/null @@ -1,109 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include "lite/api/cxx_api.h" -#include "lite/api/lite_api_test_helper.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/api/test_helper.h" -#include "lite/core/op_registry.h" -#include "lite/core/tensor.h" - -namespace paddle { -namespace lite { - -TEST(Mobilenet_v1, test_mobilenetv1_lite_x86) { - lite::Predictor predictor; - std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kX86), PRECISION(kFloat)}}); - - std::string model_dir = FLAGS_model_dir; - std::vector passes({"static_kernel_pick_pass", - "variable_place_inference_pass", - "type_target_cast_pass", - "variable_place_inference_pass", - "io_copy_kernel_pick_pass", - "variable_place_inference_pass", - "runtime_context_assign_pass"}); - predictor.Build(model_dir, - "", - "", - Place{TARGET(kX86), PRECISION(kFloat)}, - valid_places, - passes); - auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim(std::vector({1, 3, 224, 224}))); - auto* data = input_tensor->mutable_data(); - for (int i = 0; i < input_tensor->dims().production(); i++) { - data[i] = 1; - } - - for (int i = 0; i < FLAGS_warmup; ++i) { - predictor.Run(); - } - - auto start = GetCurrentUS(); - for (int i = 0; i < FLAGS_repeats; ++i) { - predictor.Run(); - } - - LOG(INFO) << "================== Speed Report ==================="; - LOG(INFO) << "Model: " << FLAGS_model_dir << ", warmup: " << FLAGS_warmup - << ", repeats: " << FLAGS_repeats << ", spend " - << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 - << " ms in average."; - - std::vector> results; - // i = 1 - results.emplace_back(std::vector( - {0.00019130898, 9.467885e-05, 0.00015971427, 0.0003650665, - 0.00026431272, 0.00060884043, 0.0002107942, 0.0015819625, - 0.0010323516, 0.00010079765, 0.00011006987, 0.0017364529, - 0.0048292773, 0.0013995157, 0.0018453331, 0.0002428986, - 0.00020211363, 
0.00013668182, 0.0005855956, 0.00025901722})); - auto* out = predictor.GetOutput(0); - ASSERT_EQ(out->dims().size(), 2); - ASSERT_EQ(out->dims()[0], 1); - ASSERT_EQ(out->dims()[1], 1000); - - int step = 50; - for (int i = 0; i < results.size(); ++i) { - for (int j = 0; j < results[i].size(); ++j) { - EXPECT_NEAR(out->data()[j * step + (out->dims()[1] * i)], - results[i][j], - 1e-6); - } - } -} - -} // namespace lite -} // namespace paddle diff --git a/lite/api/test_mobilenetv2_lite_x86.cc b/lite/api/test_mobilenetv2_lite_x86.cc deleted file mode 100644 index b1090cc6f2..0000000000 --- a/lite/api/test_mobilenetv2_lite_x86.cc +++ /dev/null @@ -1,112 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
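The x86 model tests in this patch all validate accuracy the same way: they sample every 50th logit of the 1x1000 output row and compare the 20 sampled values against golden results recorded from a reference run, with a 1e-6 tolerance. A minimal sketch of that check, factored into a helper (the name CheckSampledOutput is illustrative, not part of the deleted sources):

#include <gtest/gtest.h>
#include <vector>

// Compare sampled elements of a row-major [rows x row_size] output against
// golden values; step is the sampling stride (50 in the tests above).
void CheckSampledOutput(const float* out_data,
                        const std::vector<std::vector<float>>& golden,
                        int row_size,  // out->dims()[1], e.g. 1000
                        int step) {
  for (size_t i = 0; i < golden.size(); ++i) {
    for (size_t j = 0; j < golden[i].size(); ++j) {
      EXPECT_NEAR(out_data[j * step + row_size * i], golden[i][j], 1e-6);
    }
  }
}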
- -#include -#include -#include -#include "lite/api/cxx_api.h" -#include "lite/api/lite_api_test_helper.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/api/test_helper.h" -#include "lite/core/op_registry.h" -#include "lite/core/tensor.h" -// for googlenet - -namespace paddle { -namespace lite { - -TEST(Mobilenet_v2, test_mobilenetv2_lite_x86) { - lite::Predictor predictor; - std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kX86), PRECISION(kFloat)}}); - - // LOG(INFO)<<"FLAGS_eval_googlenet_dir:"< passes({"static_kernel_pick_pass", - "variable_place_inference_pass", - "type_target_cast_pass", - "variable_place_inference_pass", - "io_copy_kernel_pick_pass", - "variable_place_inference_pass", - "runtime_context_assign_pass"}); - predictor.Build(model_dir, - "", - "", - Place{TARGET(kX86), PRECISION(kFloat)}, - valid_places, - passes); - - auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim(std::vector({1, 3, 224, 224}))); - auto* data = input_tensor->mutable_data(); - for (int i = 0; i < input_tensor->dims().production(); i++) { - data[i] = 1; - } - - for (int i = 0; i < FLAGS_warmup; ++i) { - predictor.Run(); - } - - auto start = GetCurrentUS(); - for (int i = 0; i < FLAGS_repeats; ++i) { - predictor.Run(); - } - - LOG(INFO) << "================== Speed Report ==================="; - LOG(INFO) << "Model: " << FLAGS_model_dir << ", warmup: " << FLAGS_warmup - << ", repeats: " << FLAGS_repeats << ", spend " - << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 - << " ms in average."; - - std::vector> results; - // i = 1 - results.emplace_back(std::vector( - {0.00017082224, 5.699624e-05, 0.000260885, 0.00016412718, - 0.00034818667, 0.00015230637, 0.00032959113, 0.0014772735, - 0.0009059976, 9.5378724e-05, 5.386537e-05, 0.0006427285, - 0.0070957416, 0.0016094646, 0.0018807327, 0.00010506048, - 6.823785e-05, 0.00012269315, 0.0007806194, 0.00022354358})); - auto* out = predictor.GetOutput(0); - ASSERT_EQ(out->dims().size(), 2); - ASSERT_EQ(out->dims()[0], 1); - ASSERT_EQ(out->dims()[1], 1000); - - int step = 50; - for (int i = 0; i < results.size(); ++i) { - for (int j = 0; j < results[i].size(); ++j) { - EXPECT_NEAR(out->data()[j * step + (out->dims()[1] * i)], - results[i][j], - 1e-6); - } - } -} - -} // namespace lite -} // namespace paddle diff --git a/lite/api/unet_test.cc b/lite/api/unet_test.cc deleted file mode 100644 index aae5f493eb..0000000000 --- a/lite/api/unet_test.cc +++ /dev/null @@ -1,106 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
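Each of these tests measures latency with the same warmup/repeat loop, built on the GetCurrentUS() helper from lite/api/test_helper.h (deleted above), which returns wall-clock time in microseconds. A condensed sketch of the shared pattern, assuming a built lite::Predictor and the gflags defined in test_helper.h:

// Warm up, then time FLAGS_repeats runs and report the mean in milliseconds.
for (int i = 0; i < FLAGS_warmup; ++i) {
  predictor.Run();
}
auto start = GetCurrentUS();
for (int i = 0; i < FLAGS_repeats; ++i) {
  predictor.Run();
}
double avg_ms = (GetCurrentUS() - start) / FLAGS_repeats / 1000.0;
LOG(INFO) << "Model: " << FLAGS_model_dir << ", warmup: " << FLAGS_warmup
          << ", repeats: " << FLAGS_repeats << ", spend " << avg_ms
          << " ms in average.";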
- -#include -#include -#include -#include "lite/api/cxx_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/api/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { - -#ifdef LITE_WITH_ARM -TEST(unet, test) { - DeviceInfo::Init(); - DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads); - lite::Predictor predictor; - std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}}); - - predictor.Build(FLAGS_model_dir, - "", - "", - Place{TARGET(kARM), PRECISION(kFloat)}, - valid_places); - - auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim(std::vector({1, 3, 512, 512}))); - auto* data = input_tensor->mutable_data(); - auto item_size = input_tensor->dims().production(); - for (int i = 0; i < item_size; i++) { - data[i] = 1; - } - - for (int i = 0; i < FLAGS_warmup; ++i) { - predictor.Run(); - } - - auto start = GetCurrentUS(); - for (int i = 0; i < FLAGS_repeats; ++i) { - predictor.Run(); - } - - LOG(INFO) << "================== Speed Report ==================="; - LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads - << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats - << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 - << " ms in average."; - - // std::vector results({0.00078033, 0.00083865, 0.00060029, 0.00057083, - // 0.00070094, 0.00080584, 0.00044525, 0.00074907, - // 0.00059774, 0.00063654}); - // - std::vector> results; - // i = 1 - results.emplace_back(std::vector( - {0.9134332, 0.9652493, 0.959906, 0.96601194, 0.9704161, 0.973321, - 0.9763035, 0.9788776, 0.98090196, 0.9823532, 0.9830632, 0.98336476, - 0.9837605, 0.98430413, 0.9848935, 0.9854547, 0.9858877, 0.9862335, - 0.9865361, 0.9867324, 0.98686767, 0.9870094, 0.98710895, 0.98710257, - 0.98703253, 0.98695105, 0.98681927, 0.98661137, 0.98637575, 0.98613656, - 0.9858899, 0.98564225, 0.9853931, 0.9851323, 0.98487836, 0.9846578, - 0.9844529, 0.9842441, 0.98405427, 0.9839205, 0.98382735, 0.98373055, - 0.9836299, 0.9835474, 0.9834818, 0.9834427, 0.98343164, 0.9834163, - 0.9833809, 0.9833255, 0.9832343, 0.9831207, 0.98302484, 0.9829579, - 0.9829039, 0.98283756, 0.9827444, 0.98264474, 0.9825466, 0.98243505, - 0.982312, 0.98218083, 0.98203814, 0.981895, 0.9817609, 0.9816264, - 0.9814932, 0.9813706, 0.98124915, 0.9811211, 0.98099536, 0.9808748, - 0.98075336, 0.9806301, 0.98050594, 0.98038554, 0.980272, 0.9801562, - 0.9800356, 0.9799207, 0.9798147, 0.97971845, 0.97963905, 0.9795745, - 0.9795107, 0.97943753, 0.9793595, 0.97928876, 0.97922987, 0.9791764, - 0.97912955, 0.9790941, 0.9790663, 0.9790414, 0.9790204, 0.9790055, - 0.97899526, 0.9789867, 0.9789797, 0.9789748})); - auto* out = predictor.GetOutput(0); - ASSERT_EQ(out->dims().size(), 4); - ASSERT_EQ(out->dims()[0], 1); - ASSERT_EQ(out->dims()[1], 21); - - int step = 1; - for (int i = 0; i < results.size(); ++i) { - for (int j = 0; j < results[i].size(); ++j) { - EXPECT_NEAR(out->data()[j * step + (out->dims()[1] * i)], - results[i][j], - 1e-6); - } - } -} -#endif - -} // namespace lite -} // namespace paddle diff --git a/lite/backends/CMakeLists.txt b/lite/backends/CMakeLists.txt deleted file mode 100644 index 80dc574de8..0000000000 --- a/lite/backends/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -add_subdirectory(arm) -add_subdirectory(x86) -add_subdirectory(cuda) -add_subdirectory(fpga) -add_subdirectory(host) 
-add_subdirectory(opencl) -add_subdirectory(npu) diff --git a/lite/backends/arm/CMakeLists.txt b/lite/backends/arm/CMakeLists.txt deleted file mode 100644 index 2767b4e7ae..0000000000 --- a/lite/backends/arm/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -add_subdirectory(math) diff --git a/lite/backends/arm/math/CMakeLists.txt b/lite/backends/arm/math/CMakeLists.txt deleted file mode 100644 index f17928cc29..0000000000 --- a/lite/backends/arm/math/CMakeLists.txt +++ /dev/null @@ -1,111 +0,0 @@ -if(NOT (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)) - return() -endif() - -if(NOT (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)) - return() -endif() - -set(HAS_ARM_MATH_LIB_DIR OFF) -# will search name as "libmath_arm.${os}.${abi}.${lang}.a" -if(ARM_MATH_LIB_DIR AND EXISTS "${ARM_MATH_LIB_DIR}") - set(arm_math_name "") - if(ARM_TARGET_OS STREQUAL "android") - if(ARM_TARGET_ARCH_ABI STREQUAL "armv8") - set(arm_math_name "math_arm.android.armv8") - elseif(ARM_TARGET_ARCH_ABI STREQUAL "armv7") - set(arm_math_name "math_arm.android.armv7") - endif() - endif() - - if(ARM_TARGET_OS STREQUAL "armlinux" ) - if(ARM_TARGET_ARCH_ABI STREQUAL "armv8") - set(arm_math_name "math_arm.armlinux.armv8") - elseif(ARM_TARGET_ARCH_ABI STREQUAL "armv7") - set(arm_math_name "math_arm.armlinux.armv7") - endif() - endif() - - if(ARM_TARGET_LANG STREQUAL "clang") - set(arm_math_name "${arm_math_name}.clang") - else() - set(arm_math_name "${arm_math_name}.gcc") - endif() - - find_library(math_arm_file ${arm_math_name} ${ARM_MATH_LIB_DIR} NO_DEFAULT_PATH) - if(math_arm_file) - add_library(math_arm STATIC IMPORTED GLOBAL) - set_property(TARGET math_arm PROPERTY IMPORTED_LOCATION ${math_arm_file}) - message(STATUS "ARM math library imported: ${math_arm_file}") - set(HAS_ARM_MATH_LIB_DIR ON) - else() - message(WARNING "Can not find arm math library ${arm_math_name} in ${ARM_MATH_LIB_DIR}") - endif() -endif() - - -if (NOT HAS_ARM_MATH_LIB_DIR) - # TODO(xxx): seperate them and do not deps proto, eigen3 - cc_library(math_arm SRCS - funcs.cc - packed_sgemm.cc - sgemm.cc - softmax.cc - scale.cc - pooling.cc - elementwise.cc - lrn.cc - decode_bboxes.cc - concat.cc - sgemv.cc - type_trans.cc - box_coder.cc - conv_impl.cc - conv_direct_3x3s1.cc - conv_direct_3x3s2.cc - conv_direct.cc - conv_depthwise_3x3_int8.cc - conv_depthwise_5x5s1_int8.cc - conv_depthwise_3x3p0.cc - conv_depthwise_3x3p1.cc - conv_depthwise_5x5s1.cc - conv_depthwise_5x5s2.cc - conv_depthwise.cc - conv_gemmlike.cc - conv_winograd_3x3.cc - conv_winograd.cc - split.cc - shuffle_channel.cc - activation.cc - yolo_box.cc - dropout.cc - gemm_prepacked_int8.cc - gemv_arm_int8.cc - conv3x3s1_direct_int8.cc - conv3x3s2_direct_int8.cc - power.cc - interpolate.cc - argmax.cc - axpy.cc - fill_bias_relu.cc - col_im_transform.cc - im2sequence.cc - prior_box.cc - sequence_softmax.cc - norm.cc - topk.cc - increment.cc - pad2d.cc - negative.cc - beam_search.cc - reduce_max.cc - sequence_pool.cc - sequence_expand.cc - slice.cc - reduce_mean.cc - stack.cc - affine_channel.cc - anchor_generator.cc - DEPS ${lite_kernel_deps}) -endif() - diff --git a/lite/backends/arm/math/activation.cc b/lite/backends/arm/math/activation.cc deleted file mode 100644 index c227077779..0000000000 --- a/lite/backends/arm/math/activation.cc +++ /dev/null @@ -1,698 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/arm/math/activation.h" -#include -#include "lite/backends/arm/math/funcs.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template <> -void act_relu(const float* din, float* dout, int size, int threads) { - int nums_per_thread = size / threads; - int remain = size - threads * nums_per_thread; - int neon_loop_cnt = nums_per_thread >> 4; - int neon_loop_remain = nums_per_thread - (neon_loop_cnt << 4); - float32x4_t vzero = vdupq_n_f32(0.f); -#pragma omp parallel for - for (int i = 0; i < threads; ++i) { - const float* ptr_in_thread = din + i * nums_per_thread; - float* ptr_out_thread = dout + i * nums_per_thread; - int cnt = neon_loop_cnt; -#ifdef __aarch64__ - for (int num = 0; num < neon_loop_cnt; ++num) { - float32x4_t vr0 = vld1q_f32(ptr_in_thread); - ptr_in_thread += 4; - float32x4_t vr1 = vld1q_f32(ptr_in_thread); - ptr_in_thread += 4; - float32x4_t vr2 = vld1q_f32(ptr_in_thread); - ptr_in_thread += 4; - float32x4_t vr3 = vld1q_f32(ptr_in_thread); - ptr_in_thread += 4; - vr0 = vmaxq_f32(vr0, vzero); - vr1 = vmaxq_f32(vr1, vzero); - vr2 = vmaxq_f32(vr2, vzero); - vr3 = vmaxq_f32(vr3, vzero); - vst1q_f32(ptr_out_thread, vr0); - ptr_out_thread += 4; - vst1q_f32(ptr_out_thread, vr1); - ptr_out_thread += 4; - vst1q_f32(ptr_out_thread, vr2); - ptr_out_thread += 4; - vst1q_f32(ptr_out_thread, vr3); - ptr_out_thread += 4; - } - -#else - if (cnt > 0) { - asm volatile( - "1: @ loop header\n" - "vld1.32 {d0-d3}, [%[din]]! @ load din 0\n" - "vld1.32 {d4-d7}, [%[din]]! @ load din 0\n" - - "vmax.f32 q8, q0, %q[vzero] @ relu\n" - "vmax.f32 q9, q1, %q[vzero] @ relu\n" - "vmax.f32 q10, q2, %q[vzero] @ relu\n" - "vmax.f32 q11, q3, %q[vzero] @ relu\n" - - "vst1.32 {d16-d19}, [%[dout]]! @ store result, add pointer\n" - "vst1.32 {d20-d23}, [%[dout]]! @ store result, add pointer\n" - - "subs %[cnt], #1 @ loop count minus 1\n" - "bne 1b @ jump to main loop start " - "point\n" - : [dout] "+r"(ptr_out_thread), - [din] "+r"(ptr_in_thread), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero) - : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); - } -#endif - for (int j = 0; j < neon_loop_remain; ++j) { - ptr_out_thread[0] = ptr_in_thread[0] > 0.f ? ptr_in_thread[0] : 0.f; - ptr_in_thread++; - ptr_out_thread++; - } - } - float* out_ptr_remain = dout + threads * nums_per_thread; - const float* in_ptr_remain = din + threads * nums_per_thread; - for (int j = 0; j < remain; ++j) { - out_ptr_remain[0] = in_ptr_remain[0] > 0.f ? 
in_ptr_remain[0] : 0.f; - in_ptr_remain++; - out_ptr_remain++; - } -} - -template <> -void act_relu_neg(const float* din, - float* dout, - int size, - float negative_slope, - int threads) { - int nums_per_thread = size / threads; - int remain = size - threads * nums_per_thread; - int neon_loop_cnt = nums_per_thread >> 4; - int neon_loop_remain = nums_per_thread - (neon_loop_cnt << 4); - float32x4_t vzero = vdupq_n_f32(0.f); - float32x4_t valpha = vdupq_n_f32(negative_slope); -#pragma omp parallel for - for (int i = 0; i < threads; ++i) { - const float* ptr_in_thread = din + i * nums_per_thread; - float* ptr_out_thread = dout + i * nums_per_thread; - int cnt = neon_loop_cnt; -#ifdef __aarch64__ - for (int num = 0; num < neon_loop_cnt; ++num) { - float32x4_t vr0 = vld1q_f32(ptr_in_thread); - ptr_in_thread += 4; - float32x4_t vr1 = vld1q_f32(ptr_in_thread); - ptr_in_thread += 4; - float32x4_t vr2 = vld1q_f32(ptr_in_thread); - ptr_in_thread += 4; - float32x4_t vr3 = vld1q_f32(ptr_in_thread); - ptr_in_thread += 4; - - uint32x4_t vm0 = vcgeq_f32(vr0, vzero); - uint32x4_t vm1 = vcgeq_f32(vr1, vzero); - uint32x4_t vm2 = vcgeq_f32(vr2, vzero); - uint32x4_t vm3 = vcgeq_f32(vr3, vzero); - - float32x4_t vn0 = vmulq_f32(vr0, valpha); - float32x4_t vn1 = vmulq_f32(vr1, valpha); - float32x4_t vn2 = vmulq_f32(vr2, valpha); - float32x4_t vn3 = vmulq_f32(vr3, valpha); - - float32x4_t vo0 = vbslq_f32(vm0, vr0, vn0); - float32x4_t vo1 = vbslq_f32(vm1, vr1, vn1); - float32x4_t vo2 = vbslq_f32(vm2, vr2, vn2); - float32x4_t vo3 = vbslq_f32(vm3, vr3, vn3); - - vst1q_f32(ptr_out_thread, vo0); - ptr_out_thread += 4; - vst1q_f32(ptr_out_thread, vo1); - ptr_out_thread += 4; - vst1q_f32(ptr_out_thread, vo2); - ptr_out_thread += 4; - vst1q_f32(ptr_out_thread, vo3); - ptr_out_thread += 4; - } - -#else - if (cnt > 0) { - asm volatile( - "1: @ loop header\n" - "vld1.32 {d0-d3}, [%[din]]! @ load din 0\n" - "vld1.32 {d4-d7}, [%[din]]! @ load din 0\n" - - "vcge.f32 q8, q0, %q[vzero] @ get mask\n" - "vcge.f32 q9, q1, %q[vzero] @ get mask\n" - "vcge.f32 q10, q2, %q[vzero] @ get mask\n" - "vcge.f32 q11, q3, %q[vzero] @ get mask\n" - - "vmul.f32 q4, q0, %q[valpha] @ get neg data\n" - "vmul.f32 q5, q1, %q[valpha] @ get neg data\n" - "vmul.f32 q6, q2, %q[valpha] @ get neg data\n" - "vmul.f32 q7, q3, %q[valpha] @ get neg data\n" - - "vbit q4, q0, q8 @ bitsel, insert q0 to q4, " - "if q8 is 1\n" - "vbit q5, q1, q9 @ bitsel, insert q1 to q5, " - "if q9 is 1\n" - "vbit q6, q2, q10 @ bitsel, insert q2 to q6, " - "if q10 is 1\n" - "vbit q7, q3, q11 @ bitsel, insert q3 to q7, " - "if q11 is 1\n" - - "vst1.32 {d8-d11}, [%[dout]]! @ store result, add pointer\n" - "vst1.32 {d12-d15}, [%[dout]]! @ store result, add pointer\n" - - "subs %[cnt], #1 @ loop count minus 1\n" - "bne 1b @ jump to main loop start " - "point\n" - : [dout] "+r"(ptr_out_thread), - [din] "+r"(ptr_in_thread), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), [valpha] "w"(valpha) - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11"); - } -#endif - for (int j = 0; j < neon_loop_remain; ++j) { - ptr_out_thread[0] = ptr_in_thread[0] > 0.f - ? ptr_in_thread[0] - : ptr_in_thread[0] * negative_slope; - ptr_in_thread++; - ptr_out_thread++; - } - } - float* out_ptr_remain = dout + threads * nums_per_thread; - const float* in_ptr_remain = din + threads * nums_per_thread; - for (int j = 0; j < remain; ++j) { - out_ptr_remain[0] = in_ptr_remain[0] > 0.f - ? 
in_ptr_remain[0] - : in_ptr_remain[0] * negative_slope; - in_ptr_remain++; - out_ptr_remain++; - } -} - -template <> -void act_clipped_relu( - const float* din, float* dout, int size, float coef, int threads) { - int nums_per_thread = size / threads; - int remain = size - threads * nums_per_thread; - int neon_loop_cnt = nums_per_thread >> 4; - int neon_loop_remain = nums_per_thread - (neon_loop_cnt << 4); - float32x4_t vzero = vdupq_n_f32(0.f); - float32x4_t vclip = vdupq_n_f32(coef); -#pragma omp parallel for - for (int i = 0; i < threads; ++i) { - const float* ptr_in_thread = din + i * nums_per_thread; - float* ptr_out_thread = dout + i * nums_per_thread; - int cnt = neon_loop_cnt; -#ifdef __aarch64__ - for (int num = 0; num < neon_loop_cnt; ++num) { - float32x4_t vr0 = vld1q_f32(ptr_in_thread); - ptr_in_thread += 4; - float32x4_t vr1 = vld1q_f32(ptr_in_thread); - ptr_in_thread += 4; - float32x4_t vr2 = vld1q_f32(ptr_in_thread); - ptr_in_thread += 4; - float32x4_t vr3 = vld1q_f32(ptr_in_thread); - ptr_in_thread += 4; - float32x4_t vt0 = vmaxq_f32(vr0, vzero); - float32x4_t vt1 = vmaxq_f32(vr1, vzero); - float32x4_t vt2 = vmaxq_f32(vr2, vzero); - float32x4_t vt3 = vmaxq_f32(vr3, vzero); - - float32x4_t vo0 = vminq_f32(vt0, vclip); - float32x4_t vo1 = vminq_f32(vt1, vclip); - float32x4_t vo2 = vminq_f32(vt2, vclip); - float32x4_t vo3 = vminq_f32(vt3, vclip); - - vst1q_f32(ptr_out_thread, vo0); - ptr_out_thread += 4; - vst1q_f32(ptr_out_thread, vo1); - ptr_out_thread += 4; - vst1q_f32(ptr_out_thread, vo2); - ptr_out_thread += 4; - vst1q_f32(ptr_out_thread, vo3); - ptr_out_thread += 4; - } -#else - if (cnt > 0) { - asm volatile( - "1: @ loop header\n" - "vld1.32 {d0-d3}, [%[din]]! @ load din 0\n" - "vld1.32 {d4-d7}, [%[din]]! @ load din 0\n" - - "vmax.f32 q8, q0, %q[vzero] @ relu\n" - "vmax.f32 q9, q1, %q[vzero] @ relu\n" - "vmax.f32 q10, q2, %q[vzero] @ relu\n" - "vmax.f32 q11, q3, %q[vzero] @ relu\n" - - "vmin.f32 q4, q8, %q[vclip] @ clip relu\n" - "vmin.f32 q5, q9, %q[vclip] @ clip relu\n" - "vmin.f32 q6, q10, %q[vclip] @ clip relu\n" - "vmin.f32 q7, q11, %q[vclip] @ clip relu\n" - - "vst1.32 {d8-d11}, [%[dout]]! @ store result, add pointer\n" - "vst1.32 {d12-d15}, [%[dout]]! @ store result, add pointer\n" - - "subs %[cnt], #1 @ loop count minus 1\n" - "bne 1b @ jump to main loop start " - "point\n" - : [dout] "+r"(ptr_out_thread), - [din] "+r"(ptr_in_thread), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), [vclip] "w"(vclip) - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11"); - } -#endif - for (int j = 0; j < neon_loop_remain; ++j) { - ptr_out_thread[0] = ptr_in_thread[0] > 0.f ? ptr_in_thread[0] : 0.f; - ptr_out_thread[0] = ptr_out_thread[0] < coef ? ptr_out_thread[0] : coef; - ptr_in_thread++; - ptr_out_thread++; - } - } - float* out_ptr_remain = dout + threads * nums_per_thread; - const float* in_ptr_remain = din + threads * nums_per_thread; - for (int j = 0; j < remain; ++j) { - out_ptr_remain[0] = in_ptr_remain[0] > 0.f ? in_ptr_remain[0] : 0.f; - out_ptr_remain[0] = out_ptr_remain[0] < coef ? 
out_ptr_remain[0] : coef; - in_ptr_remain++; - out_ptr_remain++; - } -} - -template <> -void act_prelu(const float* din, - float* dout, - int outer_size, - int channel_size, - int inner_size, - std::string mode, - const float* alpha_data, - int threads) { - if (mode == "all" || mode == "channel") { - int stride_size = inner_size * channel_size; - int cnt = inner_size >> 4; - int remain = inner_size & 15; - float32x4_t vzero = vdupq_n_f32(0.f); - for (int n = 0; n < outer_size; n++) { - const float* data_in_batch = din + n * stride_size; - float* data_out_batch = dout + n * stride_size; -#pragma omp parallel for - for (int c = 0; c < channel_size; c++) { - const float* data_in_c = data_in_batch + c * inner_size; - float* data_out_c = data_out_batch + c * inner_size; - - float slope = mode == "all" ? alpha_data[0] : alpha_data[c]; - float32x4_t vslope = vdupq_n_f32(slope); -#ifdef __aarch64__ - for (int i = 0; i < cnt; ++i) { - float32x4_t vr0 = vld1q_f32(data_in_c); - float32x4_t vr1 = vld1q_f32(data_in_c + 4); - float32x4_t vr2 = vld1q_f32(data_in_c + 8); - float32x4_t vr3 = vld1q_f32(data_in_c + 12); - uint32x4_t vm0 = vcltq_f32(vr0, vzero); // vr0 <= vzero - uint32x4_t vm1 = vcltq_f32(vr1, vzero); // vr0 <= vzero - uint32x4_t vm2 = vcltq_f32(vr2, vzero); // vr0 <= vzero - uint32x4_t vm3 = vcltq_f32(vr3, vzero); // vr0 <= vzero - float32x4_t vo0 = vmulq_f32(vr0, vslope); // vr0 * vslope - float32x4_t vo1 = vmulq_f32(vr1, vslope); // vr0 * vslope - float32x4_t vo2 = vmulq_f32(vr2, vslope); // vr0 * vslope - float32x4_t vo3 = vmulq_f32(vr3, vslope); // vr0 * vslope - float32x4_t vos0 = vbslq_f32(vm0, vo0, vr0); - float32x4_t vos1 = vbslq_f32(vm1, vo1, vr1); - float32x4_t vos2 = vbslq_f32(vm2, vo2, vr2); - float32x4_t vos3 = vbslq_f32(vm3, vo3, vr3); - vst1q_f32(data_out_c, vos0); - vst1q_f32(data_out_c + 4, vos1); - vst1q_f32(data_out_c + 8, vos2); - vst1q_f32(data_out_c + 12, vos3); - data_in_c += 16; - data_out_c += 16; - } -#else - int cnt_loop = cnt; - if (cnt_loop > 0) { - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_in]]! @ load " - "input to q0, q1\n" - "pld [%[ptr_in]] @ preload\n" - "pld [%[ptr_in], #64] @ preload\n" - "pld [%[ptr_in], #128] @ preload\n" - "pld [%[ptr_in], #192] @ preload\n" - "1: @main loop\n" - "vld1.32 {d4-d7}, [%[ptr_in]]! @ load input to " - "q2, q3\n" - "vclt.f32 q8, q0, %q[vzero] @vcle q0 <= " - "vzero\n" - "vclt.f32 q9, q1, %q[vzero] @vcle q1 <= " - "vzero\n" - "vmul.f32 q10, q0, %q[vslope] @vmul q0 * " - "vslope\n" - "vmul.f32 q11, q1, %q[vslope] @vmul q1 * " - "vslope\n" - - "vclt.f32 q12, q2, %q[vzero] @vcle q2 <= " - "vzero\n" - "vclt.f32 q13, q3, %q[vzero] @vcle q3 <= " - "vzero\n" - "vmul.f32 q14, q2, %q[vslope] @vmul q2 * " - "vslope\n" - "vmul.f32 q15, q3, %q[vslope] @vmul q3 * " - "vslope\n" - - "vbif.32 q10, q0, q8 @vbit q10, q0, " - "q8\n" - "vbif.32 q11, q1, q9 @vbit q11, q1, " - "q9\n" - "vbif.32 q14, q2, q12 @vbit q14, q2, " - "q12\n" - "vbif.32 q15, q3, q13 @vbit q15, q3, " - "q13\n" - - "subs %[cnt], #1 @subs nn, 1\n" - "vld1.32 {d0-d3}, [%[ptr_in]]! @ load input to " - "q0, q1\n" - - "vst1.f32 {d20-d23}, [%[dout]]! @store data\n" - "vst1.f32 {d28-d31}, [%[dout]]! 
@store data\n" - "bne 1b @bne nn\n" - "sub %[ptr_in], #32 @ ptr-32\n" - : [ptr_in] "+r"(data_in_c), - [cnt] "+r"(cnt_loop), - [dout] "+r"(data_out_c) - : [vzero] "w"(vzero), [vslope] "w"(vslope) - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } -#endif // __aarch64__ - for (int i = remain; i > 0; i--) { - *(data_out_c++) = - data_in_c[0] > 0.f ? data_in_c[0] : data_in_c[0] * slope; - data_in_c++; - } - } - } - } else { // mode = element - int stride_size = inner_size * channel_size; - for (int n = 0; n < outer_size; n++) { - const float* data_in_batch = din + n * stride_size; - const float* data_alpha_batch = alpha_data + n * stride_size; - float* data_out_batch = dout + n * stride_size; - for (int c = 0; c < channel_size; c++) { - const float* data_in_c = data_in_batch + c * inner_size; - const float* data_alpha_c = data_alpha_batch + c * inner_size; - float* data_out_c = data_out_batch + c * inner_size; - for (int i = 0; i < inner_size; i++) { - data_out_c[0] = data_in_c[0] > 0.f ? data_in_c[0] - : data_in_c[0] * data_alpha_c[0]; - data_in_c++; - data_alpha_c++; - data_out_c++; - } - } - } - } -} - -template <> -void act_sigmoid(const float* din, float* dout, int size, int threads) { - int nums_per_thread = size / threads; - int remain = size - threads * nums_per_thread; - int neon_loop_cnt_dim4 = nums_per_thread >> 2; - int neon_loop_remain_dim4 = nums_per_thread - (neon_loop_cnt_dim4 << 2); - - float32x4_t vzero = vdupq_n_f32(0.f); -#pragma omp parallel for - for (int i = 0; i < threads; ++i) { - float32x4_t exp_vec = vdupq_n_f32(0.0f); - float32x4_t recip = vdupq_n_f32(0.0f); - const float* ptr_in_thread = din + i * nums_per_thread; - float* ptr_out_thread = dout + i * nums_per_thread; - for (int k = 0; k < neon_loop_cnt_dim4; ++k) { - exp_vec = exp_ps(vnegq_f32(vld1q_f32(ptr_in_thread))); - exp_vec = vaddq_f32(exp_vec, vdupq_n_f32(1.0f)); - recip = vrecpeq_f32(exp_vec); - recip = vmulq_f32(vrecpsq_f32(exp_vec, recip), recip); - recip = vmulq_f32(vrecpsq_f32(exp_vec, recip), recip); - vst1q_f32(ptr_out_thread, recip); - ptr_out_thread += 4; - ptr_in_thread += 4; - } - for (int j = 0; j < neon_loop_remain_dim4; ++j) { - ptr_out_thread[0] = 1.f / (1 + expf(-ptr_in_thread[0])); - ptr_in_thread++; - ptr_out_thread++; - } - } - float* ptr_out = dout + threads * nums_per_thread; - const float* ptr_in = din + threads * nums_per_thread; - for (int j = 0; j < remain; ++j) { - ptr_out[0] = 1.f / (1 + expf(-ptr_in[0])); - ptr_in++; - ptr_out++; - } -} - -// tanh : (exp(x) - exp(-x)) / (exp(x) + exp(-x)) -template <> -void act_tanh(const float* din, float* dout, int size, int threads) { - int nums_per_thread = size / threads; - int remain = size - threads * nums_per_thread; - int neon_loop_cnt_dim4 = nums_per_thread >> 2; - int neon_loop_remain_dim4 = nums_per_thread - (neon_loop_cnt_dim4 << 2); -#pragma omp parallel for - for (int i = 0; i < threads; ++i) { - float32x4_t exp_plus_vec = vdupq_n_f32(0.0f); - float32x4_t exp_minus_vec = vdupq_n_f32(0.0f); - float32x4_t exp_sum_vec = vdupq_n_f32(0.0f); - float32x4_t exp_diff_vec = vdupq_n_f32(0.0f); - float32x4_t recip = vdupq_n_f32(0.0f); - const float* ptr_in_thread = din + i * nums_per_thread; - float* ptr_out_thread = dout + i * nums_per_thread; - for (int k = 0; k < neon_loop_cnt_dim4; ++k) { - exp_plus_vec = exp_ps(vld1q_f32(ptr_in_thread)); - exp_minus_vec = exp_ps(vnegq_f32(vld1q_f32(ptr_in_thread))); - exp_sum_vec = vaddq_f32(exp_plus_vec, exp_minus_vec); - 
exp_diff_vec = vsubq_f32(exp_plus_vec, exp_minus_vec); - recip = div_ps(exp_diff_vec, exp_sum_vec); - vst1q_f32(ptr_out_thread, recip); - ptr_out_thread += 4; - ptr_in_thread += 4; - } - for (int j = 0; j < neon_loop_remain_dim4; ++j) { - ptr_out_thread[0] = (expf(ptr_in_thread[0]) - expf(-ptr_in_thread[0])) / - (expf(ptr_in_thread[0]) + expf(-ptr_in_thread[0])); - ptr_in_thread++; - ptr_out_thread++; - } - } - float* ptr_out = dout + threads * nums_per_thread; - const float* ptr_in = din + threads * nums_per_thread; - for (int j = 0; j < remain; ++j) { - ptr_out[0] = (expf(ptr_in[0]) - expf(-ptr_in[0])) / - (expf(ptr_in[0]) + expf(-ptr_in[0])); - ptr_in++; - ptr_out++; - } -} - -// swish: x /(1 + exp(-(b * x))) -template <> -void act_swish( - const float* din, float* dout, int size, float coef, int threads) { - int nums_per_thread = size / threads; - int remain = size - threads * nums_per_thread; - int neon_loop_cnt_dim4 = nums_per_thread >> 2; - int neon_loop_remain_dim4 = nums_per_thread - (neon_loop_cnt_dim4 << 2); - const float beta = coef; - float32x4_t vbeta = vdupq_n_f32(beta); - float32x4_t vone = vdupq_n_f32(1.f); -#pragma omp parallel for - for (int i = 0; i < threads; ++i) { - const float* ptr_in_thread = din + i * nums_per_thread; - float* ptr_out_thread = dout + i * nums_per_thread; - for (int k = 0; k < neon_loop_cnt_dim4; ++k) { - float32x4_t va = vld1q_f32(ptr_in_thread); // x - float32x4_t vb = vnegq_f32(vld1q_f32(ptr_in_thread)); // -x - float32x4_t vsum = vmulq_f32(vb, vbeta); - vsum = exp_ps(vsum); - float32x4_t vc = vaddq_f32(vone, vsum); - float32x4_t vrst = div_ps(va, vc); - vst1q_f32(ptr_out_thread, vrst); - ptr_out_thread += 4; - ptr_in_thread += 4; - } - for (int j = 0; j < neon_loop_remain_dim4; ++j) { - ptr_out_thread[0] = - ptr_in_thread[0] / (1.0 + expf(-ptr_in_thread[0] * beta)); - ptr_in_thread++; - ptr_out_thread++; - } - } - float* ptr_out = dout + threads * nums_per_thread; - const float* ptr_in = din + threads * nums_per_thread; - for (int j = 0; j < remain; ++j) { - ptr_out[0] = ptr_in[0] / (1.0 + expf(-ptr_in[0] * beta)); - ptr_in++; - ptr_out++; - } -} - -template <> -void act_log(const float* din, float* dout, int size, int threads) { - int nums_per_thread = size / threads; - int remain = size - threads * nums_per_thread; - int neon_loop_cnt_dim4 = nums_per_thread >> 2; - int neon_loop_remain_dim4 = nums_per_thread - (neon_loop_cnt_dim4 << 2); - LOG(INFO) << "nums_per_thread" << nums_per_thread; - LOG(INFO) << "remain" << remain; - LOG(INFO) << "neon_loop_cnt_dim4" << neon_loop_cnt_dim4; - LOG(INFO) << "neon_loop_remian_dim4" << neon_loop_remain_dim4; - - float32x4_t vzero = vdupq_n_f32(0.f); -#pragma omp parallel for - for (int i = 0; i < threads; ++i) { - float32x4_t exp_vec = vdupq_n_f32(0.0f); - const float* ptr_in_thread = din + i * nums_per_thread; - float* ptr_out_thread = dout + i * nums_per_thread; - for (int k = 0; k < neon_loop_cnt_dim4; ++k) { - exp_vec = log_ps(vld1q_f32(ptr_in_thread)); - vst1q_f32(ptr_out_thread, exp_vec); - ptr_out_thread += 4; - ptr_in_thread += 4; - } - for (int j = 0; j < neon_loop_remain_dim4; ++j) { - ptr_out_thread[0] = logf(ptr_in_thread[0]); - ptr_in_thread++; - ptr_out_thread++; - } - } - float* ptr_out = dout + threads * nums_per_thread; - const float* ptr_in = din + threads * nums_per_thread; - for (int j = 0; j < remain; ++j) { - ptr_out[0] = logf(ptr_in[0]); - ptr_in++; - ptr_out++; - } -} - -template <> -void act_exp(const float* din, float* dout, int size, int threads) { - int nums_per_thread = size / 
threads; - int remain = size - threads * nums_per_thread; - int neon_loop_cnt_dim4 = nums_per_thread >> 2; - int neon_loop_remain_dim4 = nums_per_thread - (neon_loop_cnt_dim4 << 2); - - float32x4_t vzero = vdupq_n_f32(0.f); -#pragma omp parallel for - for (int i = 0; i < threads; ++i) { - float32x4_t exp_vec = vdupq_n_f32(0.0f); - const float* ptr_in_thread = din + i * nums_per_thread; - float* ptr_out_thread = dout + i * nums_per_thread; - for (int k = 0; k < neon_loop_cnt_dim4; ++k) { - exp_vec = exp_ps(vld1q_f32(ptr_in_thread)); - vst1q_f32(ptr_out_thread, exp_vec); - ptr_out_thread += 4; - ptr_in_thread += 4; - } - for (int j = 0; j < neon_loop_remain_dim4; ++j) { - ptr_out_thread[0] = expf(ptr_in_thread[0]); - ptr_in_thread++; - ptr_out_thread++; - } - } - float* ptr_out = dout + threads * nums_per_thread; - const float* ptr_in = din + threads * nums_per_thread; - for (int j = 0; j < remain; ++j) { - ptr_out[0] = expf(ptr_in[0]); - ptr_in++; - ptr_out++; - } -} - -template <> -void act_floor(const float* din, float* dout, int size, int threads) { - const float* ptr_in = din; - float* ptr_out = dout; - for (int i = 0; i < size; ++i) { - ptr_out[0] = floorf(ptr_in[0]); - ptr_in++; - ptr_out++; - } -} - -template <> -void act_hard_sigmoid(const float* din, - float* dout, - const int64_t size, - const float slope, - const float offset, - int threads) { - for (int64_t i = 0; i < size; ++i) { - dout[0] = din[0] * slope + offset; - dout[0] = dout[0] < 1.0f ? dout[0] : 1.0f; - dout[0] = dout[0] > 0.0f ? dout[0] : 0.0f; - ++din; - ++dout; - } -} -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/activation.h b/lite/backends/arm/math/activation.h deleted file mode 100644 index 794c5e0d41..0000000000 --- a/lite/backends/arm/math/activation.h +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
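Each NEON kernel in activation.cc above falls back to plain scalar code for its remainder elements, so those scalar expressions double as a reference for verifying the intrinsics paths. A minimal sketch collecting them (helper names are illustrative, not from the sources):

#include <cmath>

inline float ref_relu(float x) { return x > 0.f ? x : 0.f; }
inline float ref_relu_neg(float x, float slope) { return x > 0.f ? x : x * slope; }
inline float ref_clipped_relu(float x, float coef) {
  float y = x > 0.f ? x : 0.f;
  return y < coef ? y : coef;
}
inline float ref_sigmoid(float x) { return 1.f / (1.f + expf(-x)); }
inline float ref_tanh(float x) {
  return (expf(x) - expf(-x)) / (expf(x) + expf(-x));
}
inline float ref_swish(float x, float beta) { return x / (1.f + expf(-x * beta)); }
inline float ref_hard_sigmoid(float x, float slope, float offset) {
  float y = x * slope + offset;  // then clamp to [0, 1]
  y = y < 1.0f ? y : 1.0f;
  return y > 0.0f ? y : 0.0f;
}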
-
-#pragma once
-#include <string>
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-template <typename T>
-void act_relu(const T* din, T* dout, int size, int threads);
-
-template <typename T>
-void act_relu_neg(
-    const T* din, T* dout, int size, float negative_slope, int threads);
-
-template <typename T>
-void act_clipped_relu(const T* din, T* dout, int size, float coef, int threads);
-
-template <typename T>
-void act_prelu(const T* din,
-               T* dout,
-               int outer_size,
-               int channel_size,
-               int inner_size,
-               std::string mode,
-               const float* alpha_data,
-               int threads);
-
-template <typename T>
-void act_sigmoid(const T* din, T* dout, int size, int threads);
-
-template <typename T>
-void act_tanh(const T* din, T* dout, int size, int threads);
-
-template <typename T>
-void act_swish(const T* din, T* dout, int size, float coef, int threads);
-
-template <typename T>
-void act_log(const T* din, T* dout, int size, int threads);
-
-template <typename T>
-void act_exp(const T* din, T* dout, int size, int threads);
-
-template <typename T>
-void act_floor(const T* din, T* dout, int size, int threads);
-
-template <typename T>
-void act_hard_sigmoid(const T* din,
-                      T* dout,
-                      const int64_t size,
-                      const float slope,
-                      const float offset,
-                      int threads);
-} // namespace math
-} // namespace arm
-} // namespace lite
-} // namespace paddle
diff --git a/lite/backends/arm/math/affine_channel.cc b/lite/backends/arm/math/affine_channel.cc
deleted file mode 100644
index a2c735afcc..0000000000
--- a/lite/backends/arm/math/affine_channel.cc
+++ /dev/null
@@ -1,69 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
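Only <float> specializations of the templated declarations above are defined in activation.cc (each as template <>), and the threads argument is the worker count consumed by the #pragma omp parallel for inside each kernel. A hypothetical call site, assuming buffers of length n and the header included:

#include <vector>
// plus "lite/backends/arm/math/activation.h"

int n = 1024;
std::vector<float> din(n, -0.5f), dout(n);
// Dispatches to the NEON float implementation shown earlier in this patch.
paddle::lite::arm::math::act_relu<float>(din.data(), dout.data(), n, /*threads=*/4);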
- -#include "lite/backends/arm/math/affine_channel.h" -#include -#include -#include -#include "lite/backends/arm/math/axpy.h" -#include "lite/backends/arm/math/funcs.h" -#include "lite/backends/arm/math/saturate.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void affine_channel_func(const float* x, - const float* scale, - const float* bias, - const std::string data_layout, - int num, - int channel, - int height, - int width, - float* out) { - if (data_layout == "NCHW") { - int hw_size = height * width; - for (int n = 0; n < num; n++) { - for (int c = 0; c < channel; c++) { - const float* x_ptr = x + n * channel * hw_size + c * hw_size; - const float* scale_ptr = scale + c; - const float* bias_ptr = bias + c; - float* out_ptr = out + n * channel * hw_size + c * hw_size; - for (int i = 0; i < hw_size; i++) { - *out_ptr = (*x_ptr) * (*scale_ptr) + (*bias_ptr); - x_ptr++; - out_ptr++; - } - } - } - } else if (data_layout == "NHWC") { - int nhw = num * height * width; - for (int i = 0; i < nhw; i++) { - const float* x_ptr = x + i * channel; - float* out_ptr = out + i * channel; - for (int c = 0; c < channel; c++) { - *out_ptr = (*x_ptr) * scale[c] + bias[c]; - x_ptr++; - out_ptr++; - } - } - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/affine_channel.h b/lite/backends/arm/math/affine_channel.h deleted file mode 100644 index f050d0ae28..0000000000 --- a/lite/backends/arm/math/affine_channel.h +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include "lite/operators/op_params.h" -#include "lite/utils/cp_logging.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void affine_channel_func(const float* x, - const float* scale, - const float* bias, - const std::string data_layout, - int num, - int channel, - int h, - int w, - float* dout); - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/anchor_generator.cc b/lite/backends/arm/math/anchor_generator.cc deleted file mode 100644 index 2f8a738fbf..0000000000 --- a/lite/backends/arm/math/anchor_generator.cc +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/backends/arm/math/anchor_generator.h" -#include -#include -#include -#include "lite/backends/arm/math/funcs.h" -#include "lite/backends/arm/math/saturate.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void anchor_generator_func(int feature_height, - int feature_width, - std::vector anchor_sizes, - std::vector aspect_ratios, - std::vector stride, - std::vector variances, - float offset, - float* anchors_ptr, - float* vars_ptr) { - float stride_width = stride[0]; - float stride_height = stride[1]; - int num_anchors = aspect_ratios.size() * anchor_sizes.size(); - for (int h_idx = 0; h_idx < feature_height; ++h_idx) { - float* anchors_ptr_h = - anchors_ptr + h_idx * feature_width * num_anchors * 4; - for (int w_idx = 0; w_idx < feature_width; ++w_idx) { - float* anchors_ptr_w = anchors_ptr_h + w_idx * num_anchors * 4; - float x_ctr = (w_idx * stride_width) + offset * (stride_width - 1); - float y_ctr = (h_idx * stride_height) + offset * (stride_height - 1); - float area, area_ratios; - float base_w, base_h; - float scale_w, scale_h; - float anchor_width, anchor_height; - int idx = 0; - for (size_t r = 0; r < aspect_ratios.size(); ++r) { - auto ar = aspect_ratios[r]; - for (size_t s = 0; s < anchor_sizes.size(); ++s) { - auto anchor_size = anchor_sizes[s]; - area = stride_width * stride_height; - area_ratios = area / ar; - base_w = round(sqrt(area_ratios)); - base_h = round(base_w * ar); - scale_w = anchor_size / stride_width; - scale_h = anchor_size / stride_height; - anchor_width = scale_w * base_w; - anchor_height = scale_h * base_h; - anchors_ptr_w[idx++] = x_ctr - 0.5 * (anchor_width - 1); - anchors_ptr_w[idx++] = y_ctr - 0.5 * (anchor_height - 1); - anchors_ptr_w[idx++] = x_ctr + 0.5 * (anchor_width - 1); - anchors_ptr_w[idx++] = y_ctr + 0.5 * (anchor_height - 1); - } - } - } - } - - int64_t hwn = feature_height * feature_width * num_anchors * 4; - for (int64_t i = 0; i < hwn; i++) { - *vars_ptr = variances[i % 4]; - vars_ptr++; - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/anchor_generator.h b/lite/backends/arm/math/anchor_generator.h deleted file mode 100644 index c6be6700d3..0000000000 --- a/lite/backends/arm/math/anchor_generator.h +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-
-#pragma once
-
-#include <cmath>
-#include <string>
-#include <vector>
-#include "lite/operators/op_params.h"
-#include "lite/utils/cp_logging.h"
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-void anchor_generator_func(int feature_height,
-                           int feature_width,
-                           std::vector<float> anchor_sizes,
-                           std::vector<float> aspect_ratios,
-                           std::vector<float> stride,
-                           std::vector<float> variances,
-                           float offset,
-                           float* anchors_data,
-                           float* variances_data);
-
-} // namespace math
-} // namespace arm
-} // namespace lite
-} // namespace paddle
diff --git a/lite/backends/arm/math/argmax.cc b/lite/backends/arm/math/argmax.cc
deleted file mode 100644
index 3ca6d97c4d..0000000000
--- a/lite/backends/arm/math/argmax.cc
+++ /dev/null
@@ -1,65 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/backends/arm/math/argmax.h"
-#include <algorithm>
-#include <functional>
-#include <limits>
-#include <utility>
-#include <vector>
-#include "lite/backends/arm/math/funcs.h"
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-void argmax_func(const lite::Tensor *input,
-                 const int axis,
-                 lite::Tensor *output) {
-  auto input_ddim = input->dims();
-  auto output_ddim = output->dims();
-
-  const int size = input_ddim[axis];
-  const int in_channel = input_ddim.count(axis, input_ddim.size());
-  const int out_channel = output_ddim.count(axis, output_ddim.size());
-  const int in_stride = input_ddim.count(axis + 1, input_ddim.size());
-  const int out_stride = input_ddim.count(0, axis);
-
-  for (int n = 0; n < out_stride; n++) {
-    for (int k = 0; k < in_stride; k++) {
-      const float *in_ptr = input->data<float>() + n * in_channel + k;
-      std::vector<std::pair<float, int>> vec;
-      vec.resize(size);
-      for (int i = 0; i < size; i++) {
-        vec[i] = std::make_pair(in_ptr[i * in_stride], i);
-      }
-      // sort
-      std::partial_sort(vec.begin(),
-                        vec.begin() + 1,
-                        vec.end(),
-                        std::greater<std::pair<float, int>>());
-
-      // out
-      float *out_ptr = output->mutable_data<float>() + n * out_channel + k;
-      *out_ptr = vec[0].second;
-    }
-  }
-}
-
-} // namespace math
-} // namespace arm
-} // namespace lite
-} // namespace paddle
diff --git a/lite/backends/arm/math/argmax.h b/lite/backends/arm/math/argmax.h
deleted file mode 100644
index c78cf2f7a8..0000000000
--- a/lite/backends/arm/math/argmax.h
+++ /dev/null
@@ -1,35 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
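Since argmax_func above only needs the single largest element of each strided slice, its length-1 std::partial_sort is equivalent to one max scan. A sketch of the same reduction without the temporary pair vector (the function name is illustrative):

// Returns the argmax over {in[0], in[stride], ..., in[(size-1)*stride]},
// cast to float because argmax_func stores indices in a float tensor.
inline float strided_argmax(const float* in, int size, int stride) {
  int best = 0;
  for (int i = 1; i < size; ++i) {
    if (in[i * stride] > in[best * stride]) best = i;
  }
  return static_cast<float>(best);
}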
- -#pragma once - -#include -#include -#include -#include "lite/operators/op_params.h" -#include "lite/utils/cp_logging.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void argmax_func(const lite::Tensor* input, - const int axis, - lite::Tensor* output); - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/axpy.cc b/lite/backends/arm/math/axpy.cc deleted file mode 100644 index 0863cc009c..0000000000 --- a/lite/backends/arm/math/axpy.cc +++ /dev/null @@ -1,203 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/arm/math/axpy.h" -#include -#include -#include -#include "lite/backends/arm/math/funcs.h" -#include "lite/backends/arm/math/saturate.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void axpy_kernel_fp32(const float* scale, - const float* din, - const float* bias, - float* dout, - int num, - int channel, - int size, - int in_channel) { - int cnt = size >> 3; - int remain = size % 8; - for (int n = 0; n < num; n++) { - const float* din_ptr = din + n * in_channel; - const float* scale_ptr = scale + n * channel; - const float* bias_ptr = bias + n * in_channel; - float* dout_ptr = dout + n * in_channel; -#pragma omp parallel for - for (int c = 0; c < channel; c++) { - const float* din_ch_ptr = din_ptr + c * size; - const float* bias_ch_ptr = bias_ptr + c * size; - float* dout_ch_ptr = dout_ptr + c * size; - float32x4_t scale_val = vdupq_n_f32(scale_ptr[c]); - int col_cnt = cnt; - if (cnt > 0) { -#ifdef __aarch64__ - asm volatile( - "ld1 {v0.4s}, [%[din_ptr]], #16 \n" - "ld1 {v1.4s}, [%[bias_ptr]], #16 \n" - "1: \n" - "ld1 {v2.4s}, [%[din_ptr]], #16 \n" - "ld1 {v3.4s}, [%[bias_ptr]], #16 \n" - "fmul v4.4s , v0.4s, %[scale].4s \n" - "fmul v5.4s , v2.4s, %[scale].4s \n" - "fadd v4.4s, v4.4s, v1.4s \n" - "fadd v5.4s, v5.4s, v3.4s \n" - "ld1 {v0.4s}, [%[din_ptr]], #16 \n" - "ld1 {v1.4s}, [%[bias_ptr]], #16 \n" - "subs %[cnt], %[cnt], #1 \n" - "st1 {v4.4s}, [%[dout_ptr]], #16 \n" - "st1 {v5.4s}, [%[dout_ptr]], #16 \n" - "bne 1b \n" - : [din_ptr] "+r"(din_ch_ptr), - [bias_ptr] "+r"(bias_ch_ptr), - [dout_ptr] "+r"(dout_ch_ptr), - [cnt] "+r"(col_cnt) - : [scale] "w"(scale_val) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"); -#else - asm volatile( - "vld1.32 {d2-d3}, [%[din_ptr]]! \n" - "vld1.32 {d4-d5}, [%[bias_ptr]]! \n" - "1: \n" - "vld1.32 {d6-d7}, [%[din_ptr]]! \n" - "vld1.32 {d8-d9}, [%[bias_ptr]]! \n" - "vmul.f32 q5, q1, %q[scale] \n" - "vmul.f32 q6, q3, %q[scale] \n" - "vadd.f32 q5, q5, q2 \n" - "vadd.f32 q6, q6, q4 \n" - "vld1.f32 {d2-d3}, [%[din_ptr]]! \n" - "vld1.f32 {d4-d5}, [%[bias_ptr]]! \n" - "subs %[cnt], #1 \n" - "vst1.32 {d10-d11}, [%[dout_ptr]]! \n" - "vst1.32 {d12-d13}, [%[dout_ptr]]! 
\n" - "bne 1b \n" - : [din_ptr] "+r"(din_ch_ptr), - [bias_ptr] "+r"(bias_ch_ptr), - [dout_ptr] "+r"(dout_ch_ptr), - [cnt] "+r"(col_cnt) - : [scale] "w"(scale_val) - : "cc", "memory", "q1", "q2", "q3", "q4", "q5", "q6"); -#endif - } - din_ch_ptr = din_ptr + c * size + cnt * 8; - bias_ch_ptr = bias_ptr + c * size + cnt * 8; - for (int i = 0; i < remain; i++) { - *dout_ch_ptr = (*din_ch_ptr) * scale_ptr[c] + (*bias_ch_ptr); - dout_ch_ptr++; - din_ch_ptr++; - bias_ch_ptr++; - } - } - } -} - -void axpy_kernel_int8(const int8_t* scale, - const int8_t* din, - const int8_t* bias, - int8_t* dout, - int num, - int channel, - int size, - int in_channel) { - int cnt = size >> 4; - int remain = size % 16; - for (int n = 0; n < num; n++) { - const int8_t* din_ptr = din + n * in_channel; - const int8_t* scale_ptr = scale + n * channel; - const int8_t* bias_ptr = bias + n * in_channel; - int8_t* dout_ptr = dout + n * in_channel; -#pragma omp parallel for - for (int c = 0; c < channel; c++) { - const int8_t* din_ch_ptr = din_ptr + c * size; - const int8_t* bias_ch_ptr = bias_ptr + c * size; - int8_t* dout_ch_ptr = dout_ptr + c * size; - int8x8_t scale_val = vdup_n_s8(scale_ptr[c]); - int col_cnt = cnt; - if (col_cnt > 0) { -#ifdef __aarch64__ - asm volatile( - "ld1 {v0.8b}, [%[din_ptr]], #8 \n" - "ld1 {v1.8b}, [%[bias_ptr]], #8 \n" - "1: \n" - "ld1 {v2.8b}, [%[din_ptr]], #8 \n" - "ld1 {v3.8b}, [%[bias_ptr]], #8 \n" - "smull v4.8h, v0.8b, %[scale].8b \n" - "smull v5.8h, v2.8b, %[scale].8b \n" - "saddw v4.8h, v4.8h, v1.8b \n" - "saddw v5.8h, v5.8h, v3.8b \n" - "ld1 {v0.8b}, [%[din_ptr]], #8 \n" - "ld1 {v1.8b}, [%[bias_ptr]], #8 \n" - "subs %[cnt], %[cnt], #1 \n" - // int16->int8 - "sqxtn v6.8b, v4.8h \n" - "sqxtn v7.8b, v5.8h \n" - "st1 {v6.8b}, [%[dout_ptr]], #8 \n" /* store c0r0*/ - "st1 {v7.8b}, [%[dout_ptr]], #8 \n" /* store c2r0*/ - "bne 1b \n" - : [din_ptr] "+r"(din_ch_ptr), - [bias_ptr] "+r"(bias_ch_ptr), - [dout_ptr] "+r"(dout_ch_ptr), - [cnt] "+r"(col_cnt) - : [scale] "w"(scale_val) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"); -#else - asm volatile( - "vdup.s8 d0, %[scale] \n" - "vld1.8 {d2}, [%[din_ptr]]! \n" - "vld1.8 {d4}, [%[bias_ptr]]! \n" - "1: \n" - "vld1.8 {d3}, [%[din_ptr]]! \n" - "vld1.8 {d5}, [%[bias_ptr]]! \n" - "vmull.s8 q4, d2, d0 \n" - "vmull.s8 q5, d3, d0 \n" - "vaddw.s16 q4, q4, d4 \n" - "vaddw.s16 q5, q5, d5 \n" - "vld1.8 {d2}, [%[din_ptr]]! \n" - "vld1.8 {d4}, [%[bias_ptr]]! \n" - "subs %[cnt], #1 \n" - // int16->int8 - "vqmovn.s16 d12, q4 @ cnt to int8\n" - "vqmovn.s16 d13, q5 @ cnt to int8\n" - "vst1.32 {d12-d13}, [%[dout_ptr]]! \n" - "bne 1b \n" - : [din_ptr] "+r"(din_ch_ptr), - [bias_ptr] "+r"(bias_ch_ptr), - [dout_ptr] "+r"(dout_ch_ptr), - [cnt] "+r"(col_cnt) - : [scale] "r"(scale_val) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6"); -#endif - } - din_ch_ptr = din_ptr + c * size + cnt * 16; - bias_ch_ptr = bias_ptr + c * size + cnt * 16; - for (int i = 0; i < remain; i++) { - *dout_ch_ptr = saturate_cast( - roundf((*din_ch_ptr) * scale_ptr[c] + (*bias_ch_ptr))); - dout_ch_ptr++; - din_ch_ptr++; - bias_ch_ptr++; - } - } - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/axpy.h b/lite/backends/arm/math/axpy.h deleted file mode 100644 index 8245bf1d1a..0000000000 --- a/lite/backends/arm/math/axpy.h +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <cstdint>
-#include "lite/operators/op_params.h"
-#include "lite/utils/cp_logging.h"
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-void axpy_kernel_fp32(const float* scale,
-                      const float* din,
-                      const float* bias,
-                      float* dout,
-                      int num,
-                      int channel,
-                      int size,
-                      int in_channel);
-
-void axpy_kernel_int8(const int8_t* scale,
-                      const int8_t* din,
-                      const int8_t* bias,
-                      int8_t* dout,
-                      int num,
-                      int channel,
-                      int size,
-                      int in_channel);
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/beam_search.cc b/lite/backends/arm/math/beam_search.cc
deleted file mode 100644
index f93fcc0d60..0000000000
--- a/lite/backends/arm/math/beam_search.cc
+++ /dev/null
@@ -1,271 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/backends/arm/math/beam_search.h"
-#include <cmath>
-#include <numeric>
-#include <sstream>
-#include <string>
-#include <vector>
-#include "lite/utils/cp_logging.h"
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-/*
- * The basic item used for sorting beam-search candidates.
- */
-struct Item {
-  Item() {}
-  Item(size_t offset, size_t id, float score)
-      : offset(offset), id(id), score(score) {}
-  // offset in the higher lod level.
-  size_t offset;
-  // prefix id in the lower lod level.
-  // size_t prefix;
-  // the candidate id
-  size_t id;
-  // the corresponding score
-  float score;
-
-  inline bool operator<(const Item &in) const {
-    return (score < in.score) || ((score == in.score) && (offset < in.offset));
-  }
-
-  inline void operator=(const Item &in) {
-    offset = in.offset;
-    id = in.id;
-    score = in.score;
-  }
-
-  std::string ToString() {
-    std::ostringstream os;
-    os << "{";
-    os << "offset: " << offset << ", ";
-    os << "id: " << id << ", ";
-    os << "score: " << score;
-    os << "}";
-    return os.str();
-  }
-};
-
-/*
- * Prune the source sentences whose branches are all finished; this step is
- * optional. Pruning must happen one step later than finishing (thus pre_ids
- * is needed here), since the end tokens must be written out.
- */
-void PruneEndBeams(const Tensor *pre_ids,
-                   const LoD &abs_lod,
-                   std::vector<std::vector<Item>> *items,
-                   size_t lod_level,
-                   int end_id) {
-  auto *pre_ids_data = pre_ids->data<int64_t>();
-  auto &high_level = abs_lod[lod_level];
-  for (size_t src_idx = 0; src_idx < high_level.size() - 1; ++src_idx) {
-    size_t src_prefix_start = high_level[src_idx];
-    size_t src_prefix_end = high_level[src_idx + 1];
-    bool finish_flag = true;
-    for (size_t offset = src_prefix_start; offset < src_prefix_end; offset++) {
-      for (auto &item : items->at(offset)) {
-        if (item.id != static_cast<size_t>(end_id) ||
-            pre_ids_data[offset] != end_id) {
-          finish_flag = false;
-          break;
-        }
-      }
-      if (!finish_flag) break;
-    }
-    if (finish_flag) {  // all branches of the beam (source sentence) end;
-                        // prune this beam
-      for (size_t offset = src_prefix_start; offset < src_prefix_end; offset++)
-        items->at(offset).clear();
-    }
-  }
-}
-
-/*
- * Transform the items into a map whose key is the offset and whose value is
- * the items at that offset. NOTE: low performance.
- */
-std::vector<std::vector<Item>> ToMap(
    const std::vector<std::vector<Item>> &items, size_t element_num) {
-  std::vector<std::vector<Item>> result;
-  result.resize(element_num);
-  for (auto &entries : items) {
-    for (const auto &item : entries) {
-      result[item.offset].push_back(item);
-    }
-  }
-  return result;
-}
-
-void Insert(std::vector<Item> *top_beam_ptr,
-            const Item &item,
-            size_t beam_size) {
-  std::vector<Item> &top_beam = *top_beam_ptr;
-
-  size_t num_beams = top_beam.size();
-  if (num_beams < beam_size) {
-    top_beam.resize(num_beams + 1);
-    num_beams++;
-  } else {
-    if (item < top_beam[beam_size - 1]) {
-      return;
-    }
-  }
-
-  for (int k = static_cast<int>(num_beams) - 2; k >= 0; --k) {
-    if (top_beam[k] < item) {
-      top_beam[k + 1] = top_beam[k];
-    } else {
-      top_beam[k + 1] = item;
-      return;
-    }
-  }
-  top_beam[0] = item;
-}
-
-/*
- * For each source, select the top beam_size records.
- */
-std::vector<std::vector<Item>> SelectTopBeamSizeItems(const Tensor *pre_ids,
-                                                      const Tensor *pre_scores,
-                                                      const Tensor *ids,
-                                                      const Tensor *scores,
-                                                      size_t lod_level,
-                                                      size_t beam_size,
-                                                      int end_id,
-                                                      bool is_accumulated) {
-  std::vector<std::vector<Item>> result;
-
-  // find the current candidates
-  // auto abs_lod = framework::ToAbsOffset(scores->lod());
-  auto abs_lod = scores->lod();
-  auto *pre_ids_data = pre_ids->data<int64_t>();
-  auto *pre_scores_data = pre_scores->data<float>();
-
-  auto *ids_data = ids ? ids->data<int64_t>() : nullptr;
-  auto *scores_data = scores->data<float>();
-
-  size_t num_seqs = abs_lod[lod_level].size() - 1;
-  size_t seq_width = 1;
-  for (int i = 1; i < scores->dims().size(); i++) {
-    seq_width *= scores->dims()[i];
-  }
-
-  for (size_t seq_id = 0; seq_id < num_seqs; ++seq_id) {
-    size_t seq_offset_start = abs_lod[lod_level][seq_id];
-    size_t seq_offset_end = abs_lod[lod_level][seq_id + 1];
-
-    std::vector<Item> top_beam;
-    top_beam.reserve(beam_size);
-
-    for (size_t offset = seq_offset_start; offset < seq_offset_end; ++offset) {
-      auto pre_id = pre_ids_data[offset];
-      auto pre_score = pre_scores_data[offset];
-      if (pre_id == end_id) {
-        // Allocate all probability mass to end_id for finished branches;
-        // the other candidate ids can be ignored.
-        Item item(offset, end_id, pre_score);
-        Insert(&top_beam, item, beam_size);
-      } else {
-        size_t index = offset * seq_width;
-        for (size_t d = 0; d < seq_width; d++, index++) {
-          int64_t id = ids_data ? ids_data[index] : static_cast<int64_t>(d);
-          float score = is_accumulated
-                            ? scores_data[index]
-                            : pre_score + std::log(scores_data[index]);
-          Item item(offset, id, score);
-          Insert(&top_beam, item, beam_size);
-        }
-      }
-    }
-
-    result.emplace_back(top_beam);
-  }
-  return result;
-}
-
-void beam_search(const Tensor *pre_ids,
-                 const Tensor *pre_scores,
-                 const Tensor *ids,
-                 const Tensor *scores,
-                 Tensor *selected_ids,
-                 Tensor *selected_scores,
-                 Tensor *parent_idx,
-                 int level,
-                 int beam_size,
-                 int end_id,
-                 bool is_accumulated,
-                 Context<TARGET(kARM)> *ctx) {
-  // auto abs_lod = framework::ToAbsOffset(scores->lod());
-  auto abs_lod = scores->lod();
-  auto &high_level = abs_lod[level];
-  auto items = SelectTopBeamSizeItems(pre_ids,
-                                      pre_scores,
-                                      ids,
-                                      scores,
-                                      level,
-                                      beam_size,
-                                      end_id,
-                                      is_accumulated);
-  auto selected_items = ToMap(items, high_level.back());
-
-  PruneEndBeams(pre_ids, abs_lod, &selected_items, level, end_id);
-  // calculate the output tensor's height
-  size_t num_instances = std::accumulate(
-      std::begin(selected_items),
-      std::end(selected_items),
-      0,
-      [](size_t a, std::vector<Item> &b) { return a + b.size(); });
-  // the output tensor shape should be [num_instances, 1]
-  auto dims = std::vector<int64_t>({static_cast<int64_t>(num_instances), 1});
-  selected_ids->Resize(dims);
-  selected_scores->Resize(dims);
-  if (parent_idx) {
-    parent_idx->Resize(dims);
-  }
-  auto *selected_ids_data = selected_ids->mutable_data<int64_t>();
-  auto *selected_scores_data = selected_scores->mutable_data<float>();
-  auto *parent_idx_data =
-      parent_idx ? parent_idx->mutable_data<int>() : nullptr;
-
-  // fill in data
-  std::vector<size_t> low_level;
-  size_t low_offset = 0;
-  for (auto &items : selected_items) {
-    low_level.push_back(low_offset);
-    for (auto &item : items) {
-      if (parent_idx) {
-        parent_idx_data[low_offset] = static_cast<int>(low_level.size() - 1);
-      }
-      selected_ids_data[low_offset] = item.id;
-      selected_scores_data[low_offset] = item.score;
-      low_offset++;
-    }
-  }
-  low_level.push_back(low_offset);
-
-  // fill lod
-  LoD lod(2);
-  lod[0].assign(high_level.begin(), high_level.end());
-  lod[1].assign(low_level.begin(), low_level.end());
-  *(selected_ids->mutable_lod()) = lod;
-  *(selected_scores->mutable_lod()) = lod;
-}
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/beam_search.h b/lite/backends/arm/math/beam_search.h
deleted file mode 100644
index 2f07175e35..0000000000
--- a/lite/backends/arm/math/beam_search.h
+++ /dev/null
@@ -1,41 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
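The selection logic of SelectTopBeamSizeItems above hinges on Insert(): a capped insertion into an array kept sorted best-first. A self-contained illustration of the same top-k insertion, simplified to a plain id/score pair instead of the Item type (illustrative only; ties are handled more simply here):

#include <cstdio>
#include <vector>

struct Cand {
  int id;
  float score;
};

// Keep `beam` sorted by descending score, capped at beam_size entries,
// mirroring Insert() above.
void insert_topk(std::vector<Cand>* beam, Cand c, size_t beam_size) {
  if (beam->size() < beam_size) {
    beam->resize(beam->size() + 1);   // grow; the last slot is free
  } else if (c.score <= beam->back().score) {
    return;                           // worse than the current worst; drop
  }
  int k = static_cast<int>(beam->size()) - 2;
  for (; k >= 0 && (*beam)[k].score < c.score; --k) {
    (*beam)[k + 1] = (*beam)[k];      // shift weaker items down one slot
  }
  (*beam)[k + 1] = c;
}

int main() {
  std::vector<Cand> beam;
  for (Cand c : {Cand{1, 0.1f}, Cand{2, 0.9f}, Cand{3, 0.5f}, Cand{4, 0.7f}}) {
    insert_topk(&beam, c, 3);
  }
  for (const Cand& c : beam) std::printf("%d:%.1f ", c.id, c.score);
  // prints: 2:0.9 4:0.7 3:0.5
  return 0;
}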
-
-#pragma once
-
-#include "lite/core/context.h"
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-void beam_search(const Tensor* pre_ids,
-                 const Tensor* pre_scores,
-                 const Tensor* ids,
-                 const Tensor* scores,
-                 Tensor* selected_ids,
-                 Tensor* selected_scores,
-                 Tensor* parent_idx,
-                 int level,
-                 int beam_size,
-                 int end_id,
-                 bool is_accumulated,
-                 Context<TARGET(kARM)>* ctx);
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/box_coder.cc b/lite/backends/arm/math/box_coder.cc
deleted file mode 100644
index 7cb904a8ee..0000000000
--- a/lite/backends/arm/math/box_coder.cc
+++ /dev/null
@@ -1,92 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/backends/arm/math/box_coder.h"
-#include "lite/backends/arm/math/funcs.h"
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-void box_coder(lite::Tensor* proposals,
-               const lite::Tensor* anchors,
-               const lite::Tensor* variances,
-               const lite::Tensor* bbox_deltas,
-               const std::string code_type,
-               bool box_normalized,
-               int axis) {
-  if (code_type == "decode_center_size") {
-    float normalized = !box_normalized ? 1.f : 0;
-
-    const float* anchor_data = anchors->data<float>();
-    const float* bbox_deltas_data = bbox_deltas->data<float>();
-    float* proposals_data = proposals->mutable_data<float>();
-    const float* variances_data = variances->data<float>();
-
-    int N = bbox_deltas->dims()[0];
-    int M = bbox_deltas->dims()[1];
-    int len = bbox_deltas->dims()[2];
-
-    for (int64_t row_id = 0; row_id < N; ++row_id) {
-      for (int64_t col_id = 0; col_id < M; ++col_id) {
-        size_t offset = row_id * M * len + col_id * len;
-        int prior_box_offset = axis == 0 ? col_id * len : row_id * len;
-        int var_offset = axis == 0 ?
-            col_id * len : row_id * len;
-
-        auto anchor_data_tmp = anchor_data + prior_box_offset;
-        auto bbox_deltas_data_tmp = bbox_deltas_data + offset;
-        auto proposals_data_tmp = proposals_data + offset;
-
-        auto anchor_width =
-            anchor_data_tmp[2] - anchor_data_tmp[0] + normalized;
-        auto anchor_height =
-            anchor_data_tmp[3] - anchor_data_tmp[1] + normalized;
-        auto anchor_center_x = anchor_data_tmp[0] + 0.5 * anchor_width;
-        auto anchor_center_y = anchor_data_tmp[1] + 0.5 * anchor_height;
-
-        float bbox_center_x = 0, bbox_center_y = 0;
-        float bbox_width = 0, bbox_height = 0;
-
-        auto variances_data_tmp = variances_data + var_offset;
-
-        bbox_center_x =
-            variances_data_tmp[0] * bbox_deltas_data_tmp[0] * anchor_width +
-            anchor_center_x;
-        bbox_center_y =
-            variances_data_tmp[1] * bbox_deltas_data_tmp[1] * anchor_height +
-            anchor_center_y;
-        bbox_width = std::exp(variances_data_tmp[2] * bbox_deltas_data_tmp[2]) *
-                     anchor_width;
-        bbox_height =
-            std::exp(variances_data_tmp[3] * bbox_deltas_data_tmp[3]) *
-            anchor_height;
-
-        proposals_data_tmp[0] = bbox_center_x - bbox_width / 2;
-        proposals_data_tmp[1] = bbox_center_y - bbox_height / 2;
-        proposals_data_tmp[2] = bbox_center_x + bbox_width / 2 - normalized;
-        proposals_data_tmp[3] = bbox_center_y + bbox_height / 2 - normalized;
-      }
-    }
-  } else if (code_type == "encode_center_size") {
-    LOG(FATAL) << "not implemented type: " << code_type;
-  } else {
-    LOG(FATAL) << "not supported type: " << code_type;
-  }
-}
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/box_coder.h b/lite/backends/arm/math/box_coder.h
deleted file mode 100644
index bbeb3e0618..0000000000
--- a/lite/backends/arm/math/box_coder.h
+++ /dev/null
@@ -1,36 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-#include "lite/core/tensor.h"
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-void box_coder(lite::Tensor* proposals,
-               const lite::Tensor* anchors,
-               const lite::Tensor* variances,
-               const lite::Tensor* bbox_deltas,
-               const std::string code_type,
-               bool box_normalized,
-               int axis);
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/col_im_transform.cc b/lite/backends/arm/math/col_im_transform.cc
deleted file mode 100644
index b5d2c6af13..0000000000
--- a/lite/backends/arm/math/col_im_transform.cc
+++ /dev/null
@@ -1,75 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
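To summarize the decode arithmetic of box_coder above: each anchor (x1, y1, x2, y2) is converted to center/size form, the variance-scaled deltas shift the center and exponentially rescale the size, and the box is converted back to corner form. A one-box scalar sketch (illustrative only; norm is 1.f when boxes are not normalized, matching `normalized` above):

#include <cmath>

// Decode one box: anchor a[4] = {x1, y1, x2, y2}, delta d[4] =
// {dx, dy, dw, dh}, per-coordinate variance var[4], result out[4].
void decode_center_size_ref(const float a[4],
                            const float d[4],
                            const float var[4],
                            float norm,
                            float out[4]) {
  const float aw = a[2] - a[0] + norm;           // anchor width
  const float ah = a[3] - a[1] + norm;           // anchor height
  const float acx = a[0] + 0.5f * aw;            // anchor center x
  const float acy = a[1] + 0.5f * ah;            // anchor center y
  const float cx = var[0] * d[0] * aw + acx;     // shifted center
  const float cy = var[1] * d[1] * ah + acy;
  const float w = std::exp(var[2] * d[2]) * aw;  // rescaled size
  const float h = std::exp(var[3] * d[3]) * ah;
  out[0] = cx - w / 2;
  out[1] = cy - h / 2;
  out[2] = cx + w / 2 - norm;
  out[3] = cy + h / 2 - norm;
}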
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/backends/arm/math/col_im_transform.h"
-#include <cstring>
-#include "lite/backends/arm/math/funcs.h"
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-inline bool is_a_ge_zero_and_a_lt_b(int a, int b) {
-  return static_cast<unsigned>(a) < static_cast<unsigned>(b);
-}
-
-template <>
-void col2im<float>(const float* data_col,
-                   const int channels,
-                   const int height,
-                   const int width,
-                   const int kernel_h,
-                   const int kernel_w,
-                   const int pad_h,
-                   const int pad_w,
-                   const int stride_h,
-                   const int stride_w,
-                   const int dilation_h,
-                   const int dilation_w,
-                   float* data_im) {
-  memset(data_im, 0, height * width * channels * sizeof(float));
-  const int output_h =
-      (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
-  const int output_w =
-      (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
-  const int channel_size = height * width;
-  for (int channel = channels; channel--; data_im += channel_size) {
-    for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
-      for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
-        int input_row = -pad_h + kernel_row * dilation_h;
-        for (int output_rows = output_h; output_rows; output_rows--) {
-          if (!is_a_ge_zero_and_a_lt_b(input_row, height)) {
-            data_col += output_w;
-          } else {
-            int input_col = -pad_w + kernel_col * dilation_w;
-            for (int output_col = output_w; output_col; output_col--) {
-              if (is_a_ge_zero_and_a_lt_b(input_col, width)) {
-                data_im[input_row * width + input_col] += *data_col;
-              }
-              data_col++;
-              input_col += stride_w;
-            }
-          }
-          input_row += stride_h;
-        }
-      }
-    }
-  }
-}
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/col_im_transform.h b/lite/backends/arm/math/col_im_transform.h
deleted file mode 100644
index 8560679d7f..0000000000
--- a/lite/backends/arm/math/col_im_transform.h
+++ /dev/null
@@ -1,40 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
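The loop bounds in col2im above come from the standard convolution output-size formula; col2im simply re-runs the im2col indexing and accumulates each column entry back into its (possibly overlapping) image location. The formula itself, as a checked one-liner (illustrative only):

// output = (input + 2*pad - (dilation*(kernel-1) + 1)) / stride + 1,
// identical to output_h/output_w computed inside col2im above.
constexpr int conv_out_size(int in, int k, int pad, int stride, int dil) {
  return (in + 2 * pad - (dil * (k - 1) + 1)) / stride + 1;
}
static_assert(conv_out_size(5, 3, 1, 1, 1) == 5,
              "3x3 kernel, pad 1, stride 1 preserves spatial size");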
-
-#pragma once
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-template <typename Dtype>
-void col2im(const Dtype* data_col,
-            const int channels,
-            const int height,
-            const int width,
-            const int kernel_h,
-            const int kernel_w,
-            const int pad_h,
-            const int pad_w,
-            const int stride_h,
-            const int stride_w,
-            const int dilation_h,
-            const int dilation_w,
-            Dtype* data_im);
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/concat.cc b/lite/backends/arm/math/concat.cc
deleted file mode 100644
index 9b94cefa16..0000000000
--- a/lite/backends/arm/math/concat.cc
+++ /dev/null
@@ -1,60 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/backends/arm/math/concat.h"
-#include <cstring>
-#include <vector>
-#include "lite/backends/arm/math/funcs.h"
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-void concat_func(const std::vector<lite::Tensor *> &input,
-                 const int axis,
-                 lite::Tensor *output) {
-  size_t num = input.size();
-  int rows = 1;
-  auto dim_0 = input[0]->dims();
-  for (int i = 0; i < axis; ++i) {
-    rows *= dim_0[i];
-  }
-  int out_rows = rows, out_cols = 0;
-
-  std::vector<int> input_cols(input.size());
-  for (int i = 0; i < num; ++i) {
-    int t_cols = input[i]->numel() / rows;
-    out_cols += t_cols;
-    input_cols[i] = t_cols;
-  }
-
-  // computation
-  for (int k = 0; k < out_rows; ++k) {
-    float *dst_ptr = output->mutable_data<float>() + k * out_cols;
-    int col_idx = 0;
-    for (int j = 0; j < num; ++j) {
-      int col_len = input_cols[j];
-      const float *src_ptr = input[j]->data<float>() + k * col_len;
-      std::memcpy(dst_ptr + col_idx, src_ptr, sizeof(float) * col_len);
-      col_idx += col_len;
-    }
-  }
-}
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/concat.h b/lite/backends/arm/math/concat.h
deleted file mode 100644
index 4c6159e9e0..0000000000
--- a/lite/backends/arm/math/concat.h
+++ /dev/null
@@ -1,35 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
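concat_func above reduces concatenation to row-wise memcpy: the dimensions before `axis` collapse into `rows`, each input contributes `numel / rows` contiguous floats per row, and an output row is those per-input segments laid end to end. A minimal stand-in on raw buffers rather than lite::Tensor (illustrative only):

#include <cstring>
#include <vector>

// in[j] points to rows * cols[j] floats; out holds rows * sum(cols) floats.
void concat_rows_ref(const std::vector<const float*>& in,
                     const std::vector<int>& cols,
                     int rows,
                     float* out) {
  int out_cols = 0;
  for (int c : cols) out_cols += c;
  for (int r = 0; r < rows; ++r) {
    float* dst = out + r * out_cols;
    for (size_t j = 0; j < in.size(); ++j) {
      std::memcpy(dst, in[j] + r * cols[j], sizeof(float) * cols[j]);
      dst += cols[j];  // advance to the next segment of this output row
    }
  }
}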
- -#pragma once - -#include -#include -#include -#include "lite/operators/op_params.h" -#include "lite/utils/cp_logging.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void concat_func(const std::vector &input, - const int axis, - lite::Tensor *output); - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv3x3s1_direct_int8.cc b/lite/backends/arm/math/conv3x3s1_direct_int8.cc deleted file mode 100644 index d44d911131..0000000000 --- a/lite/backends/arm/math/conv3x3s1_direct_int8.cc +++ /dev/null @@ -1,806 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include "lite/backends/arm/math/conv_block_utils.h" -#include "lite/backends/arm/math/conv_impl.h" -#include "lite/core/context.h" -#include "lite/operators/op_params.h" -#ifdef ARM_WITH_OMP -#include -#endif - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -#ifdef __aarch64__ -void conv_3x3s1_direct_int8(const int8_t* din, - int32_t* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const int8_t* weights, - const int32_t* bias, - const operators::ConvParam& param, - Context* ctx, - PrecisionType out_type, - const float* scale) { - const int hin_r_block = 4; - const int hout_c_block = 4; // 8; - const int hout_r_block = 2; - - int stride_w = param.strides[1]; - int pad_w = param.paddings[1]; - int pad_h = param.paddings[0]; - bool flag_relu = param.fuse_relu; - bool flag_bias = (param.bias != nullptr); - - int wout_round = ((wout + 3) / 4) * 4; - int win_round = wout_round * stride_w + 4; - - int threads = ctx->threads(); - - int* tmp_work_space = ctx->workspace_data(); - int* ptr_zero = tmp_work_space; - memset(ptr_zero, 0, sizeof(int) * win_round); - int* ptr_write = ptr_zero + win_round; - - int in_len = win_round * chin; - int pre_in_size = hin_r_block * in_len; - int pre_out_size = hout_c_block * hout_r_block * wout_round; - - signed char* pre_din = reinterpret_cast(ptr_write + wout_round); - - int size_in_channel = win * hin; - int size_out_channel = wout * hout; - int w_stride = chin * 9; - - int ws = -pad_w; - int we = ws + win_round; - int w_loop = wout_round / 4; - - int size_out = wout_round * hout_c_block; - - // printf("win_round: %d, wout_round: %d, ws: %d, we: %d\n", win_round, - // wout_round, ws, we); - // here - for (int n = 0; n < num; ++n) { - const signed char* din_batch = - static_cast(din) + n * chin * size_in_channel; - signed char* dout_batch = - reinterpret_cast(dout) + - n * chout * size_out_channel * PrecisionTypeLength(out_type); - - for (int h = 0; h < hout; h += 2) { - int hs = h - pad_h; - int he = hs + 4; - // printf("hs: %d, he: %d, chin: %d, hin: %d, win: %d \n", hs, he, chin, - // hin, win); - prepack_input_nxw(din_batch, - pre_din, - 0, - chin, - hs, - he, - ws, - we, - chin, - win, - hin, - (signed char*)ptr_zero); - -#pragma omp parallel 
for num_threads(threads) - for (int c = 0; c < chout; c += hout_c_block) { -#ifdef ARM_WITH_OMP - int* pre_out = - reinterpret_cast(pre_din + (pre_in_size + 3) / 4 * 4) + - omp_get_thread_num() * pre_out_size; -#else - int* pre_out = - reinterpret_cast(pre_din + (pre_in_size + 3) / 4 * 4); -#endif - // printf("ptr_zero_int: %x, ptr_zero: %x, ptr_write: %x, pre_din: %x, - // pre_out: %x \n", ptr_zero_int, ptr_zero, ptr_write, pre_din, - // pre_out); - const signed char* inr0 = pre_din; - const signed char* inr1 = inr0 + in_len; - const signed char* inr2 = inr1 + in_len; - const signed char* inr3 = inr2 + in_len; - - const signed char* wc0 = - static_cast(weights) + c * w_stride; - - const int* bias_ptr = ptr_zero; - if (flag_bias) { - bias_ptr = static_cast(bias) + c; - } - // hout_r_block * wout_round * hout_c_block - fill_packed_bias_nxmw_int8( - bias_ptr, pre_out, hout_c_block, hout_r_block, wout_round); - - for (int i = 0; i < chin; ++i) { - const signed char* r0 = inr0; - const signed char* r1 = inr1; - const signed char* r2 = inr2; - const signed char* r3 = inr3; - - int* ptr_out0 = pre_out; - int* ptr_out1 = pre_out + size_out; - - int cnt = w_loop; - const signed char* ptr_wc0 = wc0; - - asm volatile( - "ldp q4, q5, [%[wc0]] \n" /* w4 w5 w6 w7 */ - "ldr q6, [%[wc0], #32] \n" /* w8 */ - "SXTL v11.8h, v4.8b \n" /* w to int16 */ - "SXTL2 v12.8h, v4.16b \n" /* w to int16 */ - "SXTL v13.8h, v5.8b \n" /* to int16 */ - "SXTL2 v14.8h, v5.16b \n" /* to int16 */ - "SXTL v15.8h, v6.8b \n" /* to int16 */ - "1: \n" /* main loop*/ - "ldr d0, [%[r0]] \n" /* load data din0-dinn7*/ - "SXTL v1.8h, v0.8b \n" /* to int16 */ - - /*output 1st row*/ - "smull v16.4s, v11.4h, v1.h[0] \n" /* */ - "smull v17.4s, v11.4h, v1.h[1] \n" /* */ - "smull v18.4s, v11.4h, v1.h[2] \n" /* */ - "smull v19.4s, v11.4h, v1.h[3] \n" /* */ - - "add %[r0], %[r0], #4\n" - - /*output 1st row*/ - "smlal2 v16.4s, v11.8h, v1.h[1] \n" /* */ - "smlal2 v17.4s, v11.8h, v1.h[2] \n" /* */ - "smlal2 v18.4s, v11.8h, v1.h[3] \n" /* */ - "smlal2 v19.4s, v11.8h, v1.h[4] \n" /* */ - - "ldr d0, [%[r1]] \n" /* load data */ - - /*output 1st row*/ - "smlal v16.4s, v12.4h, v1.h[2] \n" /* */ - "smlal v17.4s, v12.4h, v1.h[3] \n" /* */ - "SXTL v2.8h, v0.8b \n" /* to int16 */ - "smlal v18.4s, v12.4h, v1.h[4] \n" /* */ - "smlal v19.4s, v12.4h, v1.h[5] \n" /* */ - - "add %[r1], %[r1], #4 \n" - - /*output 1st row*/ - "smlal2 v16.4s, v12.8h, v2.h[0] \n" /* */ - "smlal2 v17.4s, v12.8h, v2.h[1] \n" /* */ - "smlal2 v18.4s, v12.8h, v2.h[2] \n" /* */ - "smlal2 v19.4s, v12.8h, v2.h[3] \n" /* */ - - /*output 1st row*/ - "smlal v16.4s, v13.4h, v2.h[1] \n" /* */ - "smlal v17.4s, v13.4h, v2.h[2] \n" /* */ - "smlal v18.4s, v13.4h, v2.h[3] \n" /* */ - "smlal v19.4s, v13.4h, v2.h[4] \n" /* */ - - /*output 1st row*/ - "smlal2 v16.4s, v13.8h, v2.h[2] \n" /* */ - "smlal2 v17.4s, v13.8h, v2.h[3] \n" /* */ - "smlal2 v18.4s, v13.8h, v2.h[4] \n" /* */ - "smlal2 v19.4s, v13.8h, v2.h[5] \n" /* */ - - /*output 2rd row*/ - "smull v24.4s, v11.4h, v2.h[0] \n" /* */ - "smull v25.4s, v11.4h, v2.h[1] \n" /* */ - "smull v26.4s, v11.4h, v2.h[2] \n" /* */ - "smull v27.4s, v11.4h, v2.h[3] \n" /* */ - - /*output 2rd row*/ - "smlal2 v24.4s, v11.8h, v2.h[1] \n" /* */ - "smlal2 v25.4s, v11.8h, v2.h[2] \n" /* */ - "smlal2 v26.4s, v11.8h, v2.h[3] \n" /* */ - "smlal2 v27.4s, v11.8h, v2.h[4] \n" /* */ - - "ldr d0, [%[r2]] \n" /* load data */ - - /*output 2rd row*/ - "smlal v24.4s, v12.4h, v2.h[2] \n" /* */ - "smlal v25.4s, v12.4h, v2.h[3] \n" /* */ - "SXTL v1.8h, v0.8b \n" /* to int16 */ - "smlal 
v26.4s, v12.4h, v2.h[4] \n" /* */ - "smlal v27.4s, v12.4h, v2.h[5] \n" /* */ - - /*output 1st row*/ - "smlal v16.4s, v14.4h, v1.h[0] \n" /* */ - "smlal v17.4s, v14.4h, v1.h[1] \n" /* */ - "smlal v18.4s, v14.4h, v1.h[2] \n" /* */ - "smlal v19.4s, v14.4h, v1.h[3] \n" /* */ - - "add %[r2], %[r2], #4 \n" - - /*output 1st row*/ - "smlal2 v16.4s, v14.8h, v1.h[1] \n" /* */ - "smlal2 v17.4s, v14.8h, v1.h[2] \n" /* */ - "smlal2 v18.4s, v14.8h, v1.h[3] \n" /* */ - "smlal2 v19.4s, v14.8h, v1.h[4] \n" /* */ - - "ldp q3, q4, [%[ptr_out0]] \n" - "ldp q5, q6, [%[ptr_out0], #32] \n" - - /*output 1st row*/ - "smlal v16.4s, v15.4h, v1.h[2] \n" /* */ - "smlal v17.4s, v15.4h, v1.h[3] \n" /* */ - "smlal v18.4s, v15.4h, v1.h[4] \n" /* */ - "smlal v19.4s, v15.4h, v1.h[5] \n" /* */ - - "ADD v3.4s, v16.4s, v3.4s \n" - "ADD v4.4s, v17.4s, v4.4s \n" - "ADD v5.4s, v18.4s, v5.4s \n" - "ADD v6.4s, v19.4s, v6.4s \n" - - "stp q3, q4, [%[ptr_out0]], #32 \n" /* save to - output*/ - "stp q5, q6, [%[ptr_out0]], #32 \n" /* save to - output*/ - - /*output 2rd row*/ - "smlal2 v24.4s, v12.8h, v1.h[0] \n" /* */ - "smlal2 v25.4s, v12.8h, v1.h[1] \n" /* */ - "smlal2 v26.4s, v12.8h, v1.h[2] \n" /* */ - "smlal2 v27.4s, v12.8h, v1.h[3] \n" /* */ - - /*output 2rd row*/ - "smlal v24.4s, v13.4h, v1.h[1] \n" /* */ - "smlal v25.4s, v13.4h, v1.h[2] \n" /* */ - "smlal v26.4s, v13.4h, v1.h[3] \n" /* */ - "smlal v27.4s, v13.4h, v1.h[4] \n" /* */ - - "ldr d0, [%[r3]] \n" /* load data */ - - /*output 2rd row*/ - "smlal2 v24.4s, v13.8h, v1.h[2] \n" /* */ - "smlal2 v25.4s, v13.8h, v1.h[3] \n" /* */ - "SXTL v2.8h, v0.8b \n" /* to int16 */ - "smlal2 v26.4s, v13.8h, v1.h[4] \n" /* */ - "smlal2 v27.4s, v13.8h, v1.h[5] \n" /* */ - - /*output 2rd row*/ - "smlal v24.4s, v14.4h, v2.h[0] \n" /* */ - "smlal v25.4s, v14.4h, v2.h[1] \n" /* */ - "smlal v26.4s, v14.4h, v2.h[2] \n" /* */ - "smlal v27.4s, v14.4h, v2.h[3] \n" /* */ - - "add %[r3], %[r3], #4 \n" - - /*output 2rd row*/ - "smlal2 v24.4s, v14.8h, v2.h[1] \n" /* */ - "smlal2 v25.4s, v14.8h, v2.h[2] \n" /* */ - "smlal2 v26.4s, v14.8h, v2.h[3] \n" /* */ - "smlal2 v27.4s, v14.8h, v2.h[4] \n" /* */ - - "ldp q3, q4, [%[ptr_out1]] \n" - "ldp q5, q6, [%[ptr_out1], #32] \n" - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1 */ - - /*output 2rd row*/ - "smlal v24.4s, v15.4h, v2.h[2] \n" /* */ - "smlal v25.4s, v15.4h, v2.h[3] \n" /* */ - "smlal v26.4s, v15.4h, v2.h[4] \n" /* */ - "smlal v27.4s, v15.4h, v2.h[5] \n" /* */ - - "ADD v3.4s, v24.4s, v3.4s \n" - "ADD v4.4s, v25.4s, v4.4s \n" - "ADD v5.4s, v26.4s, v5.4s \n" - "ADD v6.4s, v27.4s, v6.4s \n" - - "stp q3, q4, [%[ptr_out1]], #32 \n" /* save to output*/ - "stp q5, q6, [%[ptr_out1]], #32 \n" /* save to output*/ - - "bne 1b \n" /* jump to main loop*/ - - : [cnt] "+r"(cnt), - [wc0] "+r"(ptr_wc0), - [r0] "+r"(r0), - [r1] "+r"(r1), - [r2] "+r"(r2), - [r3] "+r"(r3), - [ptr_out0] "+r"(ptr_out0), - [ptr_out1] "+r"(ptr_out1) - : - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v24", - "v25", - "v26", - "v27" - - ); - - wc0 += 9 * hout_c_block; - inr0 += win_round; - inr1 += win_round; - inr2 += win_round; - inr3 += win_round; - } - if (out_type == PRECISION(kFloat)) { - write_to_output_c4_int32_1(pre_out, - reinterpret_cast(dout_batch), - hout_c_block, - hout_r_block, - c, - c + 4, - h, - h + 2, - 0, - wout_round, - chout, - hout, - wout, - flag_relu, - reinterpret_cast(ptr_write), - &scale[c], - out_type); - } else if (out_type == PRECISION(kInt8)) { - 
write_to_output_c4_int32_1(pre_out, - dout_batch, - hout_c_block, - hout_r_block, - c, - c + 4, - h, - h + 2, - 0, - wout_round, - chout, - hout, - wout, - flag_relu, - reinterpret_cast(ptr_write), - &scale[c], - out_type); - } else { // int32 - write_to_output_c4_int32(pre_out, - reinterpret_cast(dout_batch), - hout_c_block, - hout_r_block, - c, - c + 4, - h, - h + 2, - 0, - wout_round, - chout, - hout, - wout, - flag_relu, - ptr_write); - } - } - } - } -} - -#else - -void conv_3x3s1_direct_int8(const int8_t* din, - int32_t* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const int8_t* weights, - const int32_t* bias, - const operators::ConvParam& param, - Context* ctx, - PrecisionType out_type, - const float* scale) { - // printf("conv2_3x3s1_direct_int8 \n"); - - const int hin_r_block = 4; - const int hout_c_block = 4; // 8 - const int hout_r_block = 2; - - int stride_w = param.strides[1]; - int pad_w = param.paddings[1]; - int pad_h = param.paddings[0]; - bool flag_relu = param.fuse_relu; - bool flag_bias = (param.bias != nullptr); - - int wout_round = ((wout + 3) / 4) * 4; - int win_round = wout_round * stride_w + 4; - - int threads = ctx->threads(); - - int* tmp_work_space = ctx->workspace_data(); - int* ptr_zero = tmp_work_space; - memset(ptr_zero, 0, sizeof(int) * win_round); - int* ptr_write = ptr_zero + win_round; - - int in_len = win_round * chin; - int pre_in_size = hin_r_block * in_len; - int pre_out_size = hout_c_block * hout_r_block * wout_round; - - signed char* pre_din = reinterpret_cast(ptr_write + wout_round); - - int size_in_channel = win * hin; - int size_out_channel = wout * hout; - int w_stride = chin * 9; - - int ws = -pad_w; - int we = ws + win_round; - int w_loop = wout_round / 4; - - int size_out = wout_round * hout_c_block; - - // printf("win_round: %d, wout_round: %d, ws: %d, we: %d\n", win_round, - // wout_round, ws, we); - - for (int n = 0; n < num; ++n) { - const signed char* din_batch = - static_cast(din) + n * chin * size_in_channel; - signed char* dout_batch = - reinterpret_cast(dout) + - n * chout * size_out_channel * PrecisionTypeLength(out_type); - - for (int h = 0; h < hout; h += 2) { - int hs = h - pad_h; - int he = hs + 4; - // printf("hs: %d, he: %d, chin: %d, hin: %d, win: %d \n", hs, he, chin, - // hin, win); - prepack_input_nxw(din_batch, - pre_din, - 0, - chin, - hs, - he, - ws, - we, - chin, - win, - hin, - (signed char*)ptr_zero); - -#pragma omp parallel for num_threads(threads) - for (int c = 0; c < chout; c += hout_c_block) { // 4 -#ifdef ARM_WITH_OMP - int* pre_out = - reinterpret_cast(pre_din + (pre_in_size + 3) / 4 * 4) + - omp_get_thread_num() * pre_out_size; -#else - int* pre_out = - reinterpret_cast(pre_din + (pre_in_size + 3) / 4 * 4); -#endif - // printf("ptr_zero_int: %x, ptr_zero: %x, ptr_write: %x, pre_din: %x, - // pre_out: %x \n", ptr_zero_int, ptr_zero, ptr_write, pre_din, - // pre_out); - const signed char* inr0 = pre_din; - const signed char* inr1 = inr0 + in_len; - const signed char* inr2 = inr1 + in_len; - const signed char* inr3 = inr2 + in_len; - - const signed char* wc0 = - static_cast(weights) + c * w_stride; - - const int* bias_ptr = ptr_zero; - if (flag_bias) { - bias_ptr = static_cast(bias) + c; - } - // hout_r_block * wout_round * hout_c_block - fill_packed_bias_nxmw_int8( - bias_ptr, pre_out, hout_c_block, hout_r_block, wout_round); - - for (int i = 0; i < chin; ++i) { - const signed char* r0 = inr0; - const signed char* r1 = inr1; - const signed char* r2 = inr2; - const 
signed char* r3 = inr3; - - int* ptr_out0 = pre_out; - int* ptr_out1 = pre_out + size_out; - - int cnt = w_loop; - const signed char* ptr_wc = wc0; - - asm volatile( - "vld1.s8 {d0-d3}, [%[wc0]]! \n" /* wc0, wc1, wc2, wc3, wc4, - wc5, wc6, wc7*/ - "vld1.s8 {d4}, [%[wc0]]! \n" /* wc8 */ - "vmovl.s8 q3, d0 \n" /* q3 = w0, w1 */ - "vmovl.s8 q4, d1 \n" /* q4 = w2 ,w3 */ - "vmovl.s8 q5, d2 \n" /* q5 = w4, w5 */ - "vmovl.s8 q6, d3 \n" /* q6 = w6, w7 */ - "vmovl.s8 q7, d4 \n" /* q7 = w8 */ - - "1: \n" /* main loop*/ - "vld1.s32 {d0}, [%[r0]] \n" /* load data din0-dinn7*/ - "vmovl.s8 q0, d0 \n" /* movl d0 -> q0 */ - /*output 1st row*/ - "vmull.s16 q8, d6, d0[0] \n" /* q8 = w0 * r0[0] */ - "vmull.s16 q9, d6, d0[1] \n" /* q9 = w0 * r0[2] */ - "vmull.s16 q10, d6, d0[2] \n" /* q10 = w0 * r0[4] */ - "vmull.s16 q11, d6, d0[3] \n" /* q11 = w0 * r0[6] */ - - "add %[r0], #4 \n" - - /*output 1st row*/ - "vmlal.s16 q8, d7, d0[1] \n" /* q8 = w1 * r0[1] */ - "vmlal.s16 q9, d7, d0[2] \n" /* q9 = w1 * r0[2] */ - "vmlal.s16 q10, d7, d0[3] \n" /* q10 = w1 * r0[3] */ - "vmlal.s16 q11, d7, d1[0] \n" /* q11 = w1 * r0[4] */ - - "vld1.s32 {d2}, [%[r1]] \n" /* load input r1 -> d2 */ - "vmovl.s8 q1, d2 \n" /* movl d2 -> q1 */ - - /*output 1st row*/ - "vmlal.s16 q8, d8, d0[2] \n" /* q8 = w2 * r0[2] */ - "vmlal.s16 q9, d8, d0[3] \n" /* q9 = w2 * r0[3] */ - "vmlal.s16 q10, d8, d1[0] \n" /* q10 = w2 * r0[4] */ - "vmlal.s16 q11, d8, d1[1] \n" /* q11 = w2 * r0[5] */ - - /*output 1st row*/ - "vmlal.s16 q8, d9, d2[0] \n" /* */ - "vmlal.s16 q9, d9, d2[1] \n" /* */ - "vmlal.s16 q10, d9, d2[2] \n" /* */ - "vmlal.s16 q11, d9, d2[3] \n" /* */ - - "add %[r1], #4 \n" - - /*output 1st row*/ - "vmlal.s16 q8, d10, d2[1] \n" /* */ - "vmlal.s16 q9, d10, d2[2] \n" /* */ - "vmlal.s16 q10, d10, d2[3] \n" /* */ - "vmlal.s16 q11, d10, d3[0] \n" /* */ - - /*output 1st row*/ - "vmlal.s16 q8, d11, d2[2] \n" /* */ - "vmlal.s16 q9, d11, d2[3] \n" /* */ - "vmlal.s16 q10, d11, d3[0] \n" /* */ - "vmlal.s16 q11, d11, d3[1] \n" /* */ - - /*output 2rd row*/ - "vmull.s16 q12, d6, d2[0] \n" /* */ - "vmull.s16 q13, d6, d2[1] \n" /* */ - "vmull.s16 q14, d6, d2[2] \n" /* */ - "vmull.s16 q15, d6, d2[3] \n" /* */ - - "vld1.s32 {d0}, [%[r2]] \n" /* load input r2 -> d2 */ - "vmovl.s8 q0, d0 \n" /* movl d2 -> q1 */ - - /*output 2rd row*/ - "vmlal.s16 q12, d7, d2[1] \n" /* */ - "vmlal.s16 q13, d7, d2[2] \n" /* */ - "vmlal.s16 q14, d7, d2[3] \n" /* */ - "vmlal.s16 q15, d7, d3[0] \n" /* */ - - /*output 2rd row*/ - "vmlal.s16 q12, d8, d2[2] \n" /* */ - "vmlal.s16 q13, d8, d2[3] \n" /* */ - "vmlal.s16 q14, d8, d3[0] \n" /* */ - "vmlal.s16 q15, d8, d3[1] \n" /* */ - - "add %[r2], #4 \n" - - /*output 1st row*/ - "vmlal.s16 q8, d12, d0[0] \n" /* */ - "vmlal.s16 q9, d12, d0[1] \n" /* */ - "vmlal.s16 q10, d12, d0[2] \n" /* */ - "vmlal.s16 q11, d12, d0[3] \n" /* */ - - /*output 1st row*/ - "vmlal.s16 q8, d13, d0[1] \n" /* */ - "vmlal.s16 q9, d13, d0[2] \n" /* */ - "vmlal.s16 q10, d13, d0[3] \n" /* */ - "vmlal.s16 q11, d13, d1[0] \n" /* */ - - "vld1.32 {d2-d5}, [%[ptr_out0]] \n" /* load ptr_out -> q, q - */ - - /*output 1st row*/ - "vmlal.s16 q8, d14, d0[2] \n" /* */ - "vmlal.s16 q9, d14, d0[3] \n" /* */ - "vmlal.s16 q10, d14, d1[0] \n" /* */ - "vmlal.s16 q11, d14, d1[1] \n" /* */ - - /*load & store output 1st row*/ - "vadd.s32 q1, q8, q1 \n" /* out[0] += q8 */ - "vadd.s32 q2, q9, q2 \n" /* out[0] += q8 */ - "vst1.s32 {d2-d5}, [%[ptr_out0]]! 
\n" - - /*output 2rd row*/ - "vmlal.s16 q12, d9, d0[0] \n" /* */ - "vmlal.s16 q13, d9, d0[1] \n" /* */ - "vmlal.s16 q14, d9, d0[2] \n" /* */ - "vmlal.s16 q15, d9, d0[3] \n" /* */ - - "vld1.32 {d2-d5}, [%[ptr_out0]] \n" /* load ptr_out -> q2, q3 - */ - - /*output 2rd row */ - "vmlal.s16 q12, d10, d0[1] \n" /* */ - "vmlal.s16 q13, d10, d0[2] \n" /* */ - "vadd.s32 q1, q10, q1 \n" /* out[0] += q */ - "vadd.s32 q2, q11, q2 \n" /* out[1] += q */ - - "vmlal.s16 q14, d10, d0[3] \n" /* */ - "vst1.s32 {d2-d5}, [%[ptr_out0]]! \n" - "vmlal.s16 q15, d10, d1[0] \n" /* */ - - /*output 2rd row */ - "vmlal.s16 q12, d11, d0[2] \n" /* */ - "vmlal.s16 q13, d11, d0[3] \n" /* */ - - "vld1.s32 {d4}, [%[r3]] \n" /* load input r2 -> d2 - */ - "vmovl.s8 q2, d4 \n" /* movl d2 -> q2 */ - - "vmlal.s16 q14, d11, d1[0] \n" /* */ - "vmlal.s16 q15, d11, d1[1] \n" /* */ - - "add %[r3], #4 \n" - - /*output 2rd row */ - "vmlal.s16 q12, d12, d4[0] \n" /* */ - "vmlal.s16 q13, d12, d4[1] \n" /* */ - "vmlal.s16 q14, d12, d4[2] \n" /* */ - "vmlal.s16 q15, d12, d4[3] \n" /* */ - - "vld1.32 {d0-d3}, [%[ptr_out1]] \n" /* */ - - /*output 2rd row */ - "vmlal.s16 q12, d13, d4[1] \n" /* */ - "vmlal.s16 q13, d13, d4[2] \n" /* */ - "vmlal.s16 q14, d13, d4[3] \n" /* */ - "vmlal.s16 q15, d13, d5[0] \n" /* */ - - "subs %[cnt], #1 \n" - - /*output 2rd row */ - "vmlal.s16 q12, d14, d4[2] \n" /* */ - "vmlal.s16 q13, d14, d4[3] \n" /* */ - "vmlal.s16 q14, d14, d5[0] \n" /* */ - "vmlal.s16 q15, d14, d5[1] \n" /* */ - - /*output 2rd row*/ - "vadd.s32 q0, q12, q0 \n" /* */ - "vadd.s32 q1, q13, q1 \n" /* */ - "vst1.s32 {d0-d3}, [%[ptr_out1]]! \n" - - "vld1.32 {d0-d3}, [%[ptr_out1]] \n" /* */ - "vadd.s32 q0, q14, q0 \n" /* */ - "vadd.s32 q1, q15, q1 \n" /* */ - "vst1.s32 {d0-d3}, [%[ptr_out1]]! \n" - - "bne 1b \n" /* jump to main loop*/ - - : [cnt] "+r"(cnt), - [r0] "+r"(r0), - [r1] "+r"(r1), - [r2] "+r"(r2), - [r3] "+r"(r3), - [ptr_out0] "+r"(ptr_out0), - [ptr_out1] "+r"(ptr_out1), - [wc0] "+r"(ptr_wc) - : - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - - wc0 += 9 * hout_c_block; - inr0 += win_round; - inr1 += win_round; - inr2 += win_round; - inr3 += win_round; - } - - if (out_type == PRECISION(kFloat)) { - write_to_output_c4_int32_1(pre_out, - reinterpret_cast(dout_batch), - hout_c_block, - hout_r_block, - c, - c + 4, - h, - h + 2, - 0, - wout_round, - chout, - hout, - wout, - flag_relu, - reinterpret_cast(ptr_write), - &scale[c], - out_type); - } else if (out_type == PRECISION(kInt8)) { - write_to_output_c4_int32_1(pre_out, - dout_batch, - hout_c_block, - hout_r_block, - c, - c + 4, - h, - h + 2, - 0, - wout_round, - chout, - hout, - wout, - flag_relu, - reinterpret_cast(ptr_write), - &scale[c], - out_type); - } else { // int32 - write_to_output_c4_int32(pre_out, - reinterpret_cast(dout_batch), - hout_c_block, - hout_r_block, - c, - c + 4, - h, - h + 2, - 0, - wout_round, - chout, - hout, - wout, - flag_relu, - ptr_write); - } - } - } - } -} - -#endif // __aarch64__ - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv3x3s2_direct_int8.cc b/lite/backends/arm/math/conv3x3s2_direct_int8.cc deleted file mode 100644 index 6169ad5d12..0000000000 --- a/lite/backends/arm/math/conv3x3s2_direct_int8.cc +++ /dev/null @@ -1,1081 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include "lite/backends/arm/math/conv_block_utils.h" -#include "lite/backends/arm/math/conv_impl.h" -#include "lite/core/context.h" -#include "lite/operators/op_params.h" -#ifdef ARM_WITH_OMP -#include -#endif - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -#ifdef __aarch64__ -int conv_3x3s2_direct_int8_c_num() { return 8; } -void conv_3x3s2_direct_int8(const int8_t* din, - int32_t* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const int8_t* weights, - const int32_t* bias, - const operators::ConvParam& param, - Context* ctx, - PrecisionType out_type, - const float* scale) { - //! 3x3s2 int8 convolution, implemented by direct algorithm - //! prepack input to tmp buffer - //! write output to tmp buffer - int threads = ctx->threads(); - int stride_w = param.strides[1]; - int pad_w = param.paddings[1]; - int pad_h = param.paddings[0]; - bool flag_relu = param.fuse_relu; - bool flag_bias = (param.bias != nullptr); - - //! set 2/3 l2 cache - int l2_size = ctx->llc_size() / 3 * 2; - const int hout_c_block = 8; - const int hout_r_kernel = 2; - const int wout_round = ((wout + 3) / 4) * 4; - const int win_round = wout_round * stride_w + 1; - - //! get h block - //! win_round * chin * hin_r_block * sizeof(int8_t) + wout_round * - //! hout_c_block * hout_r_block * threads * sizeof(int32_t)= l2_size - //! win_round = 2 * wout_round + 1 - //! hin_r_block = 2 * hout_r_block + 1 - int hout_r_block = - (l2_size - 2 * wout_round * chin - chin) / - ((4 * wout_round + 2) * chin + wout_round * hout_c_block * threads * 4); - hout_r_block = hout_r_block > hout ? hout : hout_r_block; - hout_r_block = (hout_r_block / hout_r_kernel) * hout_r_kernel; - hout_r_block = hout_r_block < hout_r_kernel ? hout_r_kernel : hout_r_block; - - const int hin_r_block = hout_r_block * 2 + 1; - - int8_t* tmp_work_space = ctx->workspace_data(); - int zero_size = chout > (win_round + 3) / 4 ? chout : (win_round + 3) / 4; - const int kZeroSize = zero_size; - int32_t ptr_zero[kZeroSize]; - memset(ptr_zero, 0, sizeof(int32_t) * zero_size); - const int kWoutRound = wout_round; - int32_t ptr_write[kWoutRound]; - - int in_len = win_round * chin; - int pre_in_size = hin_r_block * in_len; - int pre_out_size = hout_c_block * hout_r_block * wout_round; - - //! 
l2_cache start - int8_t* pre_din = tmp_work_space; - - int size_in_channel = win * hin; - int size_out_channel = wout * hout; - int w_stride = chin * 9; - - int ws = -pad_w; - int we = ws + win_round; - int w_loop = wout_round / 4; - - int out_row_stride = hout_c_block * wout_round; - - for (int n = 0; n < num; ++n) { - const int8_t* din_batch = din + n * chin * size_in_channel; - int8_t* dout_batch = - reinterpret_cast(dout) + - n * chout * size_out_channel * PrecisionTypeLength(out_type); - for (int h = 0; h < hout; h += hout_r_block) { - int h_kernel = hout_r_block; - if (h + hout_r_block > hout) { - h_kernel = hout - h; - } - int hs = h * 2 - pad_h; - int he = hs + h_kernel * 2 + 1; - prepack_input_nxw(din_batch, - pre_din, - 0, - chin, - hs, - he, - ws, - we, - chin, - win, - hin, - reinterpret_cast(ptr_zero)); - - const int8_t* cblock_inr0 = pre_din; - const int8_t* cblock_inr1 = cblock_inr0 + in_len; - const int8_t* cblock_inr2 = cblock_inr1 + in_len; - const int8_t* cblock_inr3 = cblock_inr2 + in_len; - const int8_t* cblock_inr4 = cblock_inr3 + in_len; - -#pragma omp parallel for num_threads(threads) - for (int c = 0; c < chout; c += hout_c_block) { -#ifdef ARM_WITH_OMP - int32_t* pre_out = - reinterpret_cast(pre_din + (pre_in_size + 3) / 4 * 4) + - omp_get_thread_num() * pre_out_size; -#else - int32_t* pre_out = - reinterpret_cast(pre_din + (pre_in_size + 3) / 4 * 4); -#endif - const int8_t* block_inr0 = cblock_inr0; - const int8_t* block_inr1 = cblock_inr1; - const int8_t* block_inr2 = cblock_inr2; - const int8_t* block_inr3 = cblock_inr3; - const int8_t* block_inr4 = cblock_inr4; - - const int8_t* weight_c = weights + c * w_stride; - const int32_t* bias_ptr = ptr_zero; - if (flag_bias) { - bias_ptr = bias + c; - } - - fill_packed_bias_nxmw_int8(bias_ptr, pre_out, 8, h_kernel, wout_round); - for (int hk = 0; hk < h_kernel; hk += hout_r_kernel) { - const int8_t* wc0 = weight_c; - - const int8_t* inr0 = block_inr0; - const int8_t* inr1 = block_inr1; - const int8_t* inr2 = block_inr2; - const int8_t* inr3 = block_inr3; - const int8_t* inr4 = block_inr4; - - int32_t* pre_out0 = pre_out + hk * out_row_stride; - int32_t* pre_out1 = pre_out0 + out_row_stride; - for (int i = 0; i < chin; ++i) { - int16x8_t v0 = vmovl_s8(vld1_s8(wc0)); // w0 - int16x8_t v1 = vmovl_s8(vld1_s8(wc0 + 8)); // w1 - int16x8_t v2 = vmovl_s8(vld1_s8(wc0 + 16)); // w2, - - int16x8_t v3 = vmovl_s8(vld1_s8(wc0 + 24)); // w3 - int16x8_t v4 = vmovl_s8(vld1_s8(wc0 + 32)); // w4 - int16x8_t v5 = vmovl_s8(vld1_s8(wc0 + 40)); // w5 - - int16x8_t v6 = vmovl_s8(vld1_s8(wc0 + 48)); // w6 - int16x8_t v7 = vmovl_s8(vld1_s8(wc0 + 56)); // w7 - int16x8_t v8 = vmovl_s8(vld1_s8(wc0 + 64)); // w8 - - const int8_t* r0 = inr0; - const int8_t* r1 = inr1; - const int8_t* r2 = inr2; - const int8_t* r3 = inr3; - const int8_t* r4 = inr4; - - int32_t* ptr_out0 = pre_out0; - int32_t* ptr_out1 = pre_out1; - int cnt = w_loop; - - asm volatile( - "ldr q0, [%[r0]], #8 \n" /* load input r0 */ - "ldr q1, [%[r2]], #8 \n" /* load input r2 */ - "sshll v0.8h, v0.8b, #0 \n" /* r0: int8 -> int16 */ - "sshll v1.8h, v1.8b, #0 \n" /* r1: int8 -> int16*/ - "1: \n" /* main loop */ - - /* r0, r2 mul w00 */ - "smull v4.4s, %[v0].4h, v0.h[0]\n" /* outr00 = v0 * r0[0] - */ - "smull2 v5.4s, %[v0].8h, v0.h[0]\n" /* outr00 = v0 * r0[0] - */ - "smull v6.4s, %[v0].4h, v0.h[2]\n" /* outr01 = v0 * r0[2] - */ - "smull2 v7.4s, %[v0].8h, v0.h[2]\n" /* outr00 = v0 * r0[0] - */ - "smull v8.4s, %[v0].4h, v0.h[4]\n" /* outr02 = v0 * r0[4] - */ - "smull2 v9.4s, %[v0].8h, 
v0.h[4]\n" /* outr00 = v0 * r0[0] - */ - "smull v10.4s, %[v0].4h, v0.h[6]\n" /* outr03 = v0 * r0[6] - */ - "smull2 v11.4s, %[v0].8h, v0.h[6]\n" /* outr00 = v0 * r0[0] - */ - - "smull v12.4s, %[v0].4h, v1.h[0]\n" /* outr10 = v0 * r2[0] - */ - "smull2 v13.4s, %[v0].8h, v1.h[0]\n" /* outr11 = v0 * r2[2] - */ - "smull v14.4s, %[v0].4h, v1.h[2]\n" /* outr12 = v0 * r2[4] - */ - "smull2 v15.4s, %[v0].8h, v1.h[2]\n" /* outr13 = v0 * r2[6] - */ - "smull v16.4s, %[v0].4h, v1.h[4]\n" /* outr10 = v0 * r2[0] - */ - "smull2 v17.4s, %[v0].8h, v1.h[4]\n" /* outr11 = v0 * r2[2] - */ - "smull v18.4s, %[v0].4h, v1.h[6]\n" /* outr12 = v0 * r2[4] - */ - "smull2 v19.4s, %[v0].8h, v1.h[6]\n" /* outr13 = v0 * r2[6] - */ - - /* r2, mul w06 */ - "smlal v4.4s, %[v6].4h, v1.h[0]\n" /* outr00 = v6 * r2[1] - */ - "smlal2 v5.4s, %[v6].8h, v1.h[0]\n" /* outr01 = v6 * r2[3] - */ - "smlal v6.4s, %[v6].4h, v1.h[2]\n" /* outr02 = v6 * r2[5] - */ - "smlal2 v7.4s, %[v6].8h, v1.h[2]\n" /* outr03 = v6 * r2[7] - */ - "smlal v8.4s, %[v6].4h, v1.h[4]\n" /* outr00 = v6 * r2[1] - */ - "smlal2 v9.4s, %[v6].8h, v1.h[4]\n" /* outr01 = v6 * r2[3] - */ - "smlal v10.4s, %[v6].4h, v1.h[6]\n" /* outr02 = v6 * r2[5] - */ - "smlal2 v11.4s, %[v6].8h, v1.h[6]\n" /* outr03 = v6 * r2[7] - */ - - "ldr q2, [%[r0]] \n" /* load r0, 9th - data,v10.s[0] */ - - /* r0, r2, mul w01 */ - "smlal v4.4s, %[v1].4h, v0.h[1]\n" /* outr00 = v0 * r0[0] - */ - "smlal2 v5.4s, %[v1].8h, v0.h[1]\n" /* outr00 = v0 * r0[0] - */ - "smlal v6.4s, %[v1].4h, v0.h[3]\n" /* outr01 = v0 * r0[2] - */ - "smlal2 v7.4s, %[v1].8h, v0.h[3]\n" /* outr00 = v0 * r0[0] - */ - "sshll v2.8h, v2.8b, #0 \n" /* r0: int8 -> int16 */ - "smlal v8.4s, %[v1].4h, v0.h[5]\n" /* outr02 = v0 * r0[4] - */ - "smlal2 v9.4s, %[v1].8h, v0.h[5]\n" /* outr00 = v0 * r0[0] - */ - "smlal v10.4s, %[v1].4h, v0.h[7]\n" /* outr03 = v0 * r0[6] - */ - "smlal2 v11.4s, %[v1].8h, v0.h[7]\n" /* outr00 = v0 * r0[0] - */ - - "smlal v12.4s, %[v1].4h, v1.h[1]\n" /* outr10 = v0 * r2[0] - */ - "smlal2 v13.4s, %[v1].8h, v1.h[1]\n" /* outr11 = v0 * r2[2] - */ - "smlal v14.4s, %[v1].4h, v1.h[3]\n" /* outr12 = v0 * r2[4] - */ - "smlal2 v15.4s, %[v1].8h, v1.h[3]\n" /* outr13 = v0 * r2[6] - */ - "smlal v16.4s, %[v1].4h, v1.h[5]\n" /* outr10 = v0 * r2[0] - */ - "smlal2 v17.4s, %[v1].8h, v1.h[5]\n" /* outr11 = v0 * r2[2] - */ - "smlal v18.4s, %[v1].4h, v1.h[7]\n" /* outr12 = v0 * r2[4] - */ - "smlal2 v19.4s, %[v1].8h, v1.h[7]\n" /* outr13 = v0 * r2[6] - */ - - /* r2, mul w07 */ - "smlal v4.4s, %[v7].4h, v1.h[1]\n" /* outr00 = v6 * r2[1] - */ - "smlal2 v5.4s, %[v7].8h, v1.h[1]\n" /* outr01 = v6 * r2[3] - */ - "smlal v6.4s, %[v7].4h, v1.h[3]\n" /* outr02 = v6 * r2[5] - */ - "smlal2 v7.4s, %[v7].8h, v1.h[3]\n" /* outr03 = v6 * r2[7] - */ - "smlal v8.4s, %[v7].4h, v1.h[5]\n" /* outr00 = v6 * r2[1] - */ - "smlal2 v9.4s, %[v7].8h, v1.h[5]\n" /* outr01 = v6 * r2[3] - */ - "smlal v10.4s, %[v7].4h, v1.h[7]\n" /* outr02 = v6 * r2[5] - */ - "smlal2 v11.4s, %[v7].8h, v1.h[7]\n" /* outr03 = v6 * r2[7] - */ - - "ldr q3, [%[r2]] \n" /* load r2, 9th - data,v11.s[0] */ - - /* r0, r2, mul w02 */ - "smlal v4.4s, %[v2].4h, v0.h[2]\n" /* outr00 = v0 * r0[0] - */ - "smlal2 v5.4s, %[v2].8h, v0.h[2]\n" /* outr00 = v0 * r0[0] - */ - "smlal v6.4s, %[v2].4h, v0.h[4]\n" /* outr01 = v0 * r0[2] - */ - "smlal2 v7.4s, %[v2].8h, v0.h[4]\n" /* outr00 = v0 * r0[0] - */ - "sshll v3.8h, v3.8b, #0 \n" /* r2: int8 -> int16*/ - "smlal v8.4s, %[v2].4h, v0.h[6]\n" /* outr02 = v0 * r0[4] - */ - "smlal2 v9.4s, %[v2].8h, v0.h[6]\n" /* outr00 = v0 * r0[0] - */ - "smlal v10.4s, 
%[v2].4h, v2.h[0]\n" /* outr03 = v0 * r0[6] - */ - "smlal2 v11.4s, %[v2].8h, v2.h[0]\n" /* outr00 = v0 * r0[0] - */ - - "ldr q0, [%[r1]], #8 \n" /* load input r1 */ - - "smlal v12.4s, %[v2].4h, v1.h[2]\n" /* outr10 = v0 * r2[0] - */ - "smlal2 v13.4s, %[v2].8h, v1.h[2]\n" /* outr11 = v0 * r2[2] - */ - "smlal v14.4s, %[v2].4h, v1.h[4]\n" /* outr12 = v0 * r2[4] - */ - "smlal2 v15.4s, %[v2].8h, v1.h[4]\n" /* outr13 = v0 * r2[6] - */ - "sshll v0.8h, v0.8b, #0 \n" /* r1 : int8 -> int16 */ - "smlal v16.4s, %[v2].4h, v1.h[6]\n" /* outr10 = v0 * r2[0] - */ - "smlal2 v17.4s, %[v2].8h, v1.h[6]\n" /* outr11 = v0 * r2[2] - */ - "smlal v18.4s, %[v2].4h, v3.h[0]\n" /* outr12 = v0 * r2[4] - */ - "smlal2 v19.4s, %[v2].8h, v3.h[0]\n" /* outr13 = v0 * r2[6] - */ - - /* r2, mul w08 */ - "smlal v4.4s, %[v8].4h, v1.h[2]\n" /* outr00 = v6 * r2[1] - */ - "smlal2 v5.4s, %[v8].8h, v1.h[2]\n" /* outr01 = v6 * r2[3] - */ - "smlal v6.4s, %[v8].4h, v1.h[4]\n" /* outr02 = v6 * r2[5] - */ - "smlal2 v7.4s, %[v8].8h, v1.h[4]\n" /* outr03 = v6 * r2[7] - */ - "smlal v8.4s, %[v8].4h, v1.h[6]\n" /* outr00 = v6 * r2[1] - */ - "smlal2 v9.4s, %[v8].8h, v1.h[6]\n" /* outr01 = v6 * r2[3] - */ - "smlal v10.4s, %[v8].4h, v3.h[0]\n" /* outr02 = v6 * r2[5] - */ - "smlal2 v11.4s, %[v8].8h, v3.h[0]\n" /* outr03 = v6 * r2[7] - */ - - "ldr q1, [%[r3]], #8 \n" /* load input r3 */ - - /* r1, r3, mul w03 */ - "smlal v4.4s, %[v3].4h, v0.h[0]\n" /* outr00 = v0 * r0[0] - */ - "smlal2 v5.4s, %[v3].8h, v0.h[0]\n" /* outr00 = v0 * r0[0] - */ - "smlal v6.4s, %[v3].4h, v0.h[2]\n" /* outr01 = v0 * r0[2] - */ - "smlal2 v7.4s, %[v3].8h, v0.h[2]\n" /* outr00 = v0 * r0[0] - */ - "sshll v1.8h, v1.8b, #0 \n" /* r3: int8 -> int16 */ - "smlal v8.4s, %[v3].4h, v0.h[4]\n" /* outr02 = v0 * r0[4] - */ - "smlal2 v9.4s, %[v3].8h, v0.h[4]\n" /* outr00 = v0 * r0[0] - */ - "smlal v10.4s, %[v3].4h, v0.h[6]\n" /* outr03 = v0 * r0[6] - */ - "smlal2 v11.4s, %[v3].8h, v0.h[6]\n" /* outr00 = v0 * r0[0] - */ - "ldr q2, [%[r1]] \n" /* load r1, 9th - data,v10.s[0] */ - - "smlal v12.4s, %[v3].4h, v1.h[0]\n" /* outr10 = v0 * r2[0] - */ - "smlal2 v13.4s, %[v3].8h, v1.h[0]\n" /* outr11 = v0 * r2[2] - */ - "smlal v14.4s, %[v3].4h, v1.h[2]\n" /* outr12 = v0 * r2[4] - */ - "smlal2 v15.4s, %[v3].8h, v1.h[2]\n" /* outr13 = v0 * r2[6] - */ - "ldr q3, [%[r3]] \n" /* load r3, 9th - data,v11.s[0] */ - "smlal v16.4s, %[v3].4h, v1.h[4]\n" /* outr10 = v0 * r2[0] - */ - "smlal2 v17.4s, %[v3].8h, v1.h[4]\n" /* outr11 = v0 * r2[2] - */ - "smlal v18.4s, %[v3].4h, v1.h[6]\n" /* outr12 = v0 * r2[4] - */ - "smlal2 v19.4s, %[v3].8h, v1.h[6]\n" /* outr13 = v0 * r2[6] - */ - "sshll v2.8h, v2.8b, #0 \n" /* r1 : int8 -> int16 */ - - /* r1, r3, mul w05 */ - "smlal v4.4s, %[v5].4h, v0.h[2]\n" /* outr00 = v0 * r0[0] - */ - "smlal2 v5.4s, %[v5].8h, v0.h[2]\n" /* outr00 = v0 * r0[0] - */ - "smlal v6.4s, %[v5].4h, v0.h[4]\n" /* outr01 = v0 * r0[2] - */ - "smlal2 v7.4s, %[v5].8h, v0.h[4]\n" /* outr00 = v0 * r0[0] - */ - "sshll v3.8h, v3.8b, #0 \n" /* r3 : int8 -> int16 */ - "smlal v8.4s, %[v5].4h, v0.h[6]\n" /* outr02 = v0 * r0[4] - */ - "smlal2 v9.4s, %[v5].8h, v0.h[6]\n" /* outr00 = v0 * r0[0] - */ - "smlal v10.4s, %[v5].4h, v2.h[0]\n" /* outr03 = v0 * r0[6] - */ - "smlal2 v11.4s, %[v5].8h, v2.h[0]\n" /* outr00 = v0 * r0[0] - */ - - "smlal v12.4s, %[v5].4h, v1.h[2]\n" /* outr10 = v0 * r2[0] - */ - "smlal2 v13.4s, %[v5].8h, v1.h[2]\n" /* outr11 = v0 * r2[2] - */ - "smlal v14.4s, %[v5].4h, v1.h[4]\n" /* outr12 = v0 * r2[4] - */ - "smlal2 v15.4s, %[v5].8h, v1.h[4]\n" /* outr13 = v0 * r2[6] - */ - "smlal v16.4s, 
%[v5].4h, v1.h[6]\n" /* outr10 = v0 * r2[0] - */ - "smlal2 v17.4s, %[v5].8h, v1.h[6]\n" /* outr11 = v0 * r2[2] - */ - "smlal v18.4s, %[v5].4h, v3.h[0]\n" /* outr12 = v0 * r2[4] - */ - "smlal2 v19.4s, %[v5].8h, v3.h[0]\n" /* outr13 = v0 * r2[6] - */ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1 */ - - /* r1, r3, mul w04 */ - "smlal v4.4s, %[v4].4h, v0.h[1]\n" /* outr00 = v0 * r0[0] - */ - "smlal2 v5.4s, %[v4].8h, v0.h[1]\n" /* outr00 = v0 * r0[0] - */ - "smlal v6.4s, %[v4].4h, v0.h[3]\n" /* outr01 = v0 * r0[2] - */ - "smlal2 v7.4s, %[v4].8h, v0.h[3]\n" /* outr00 = v0 * r0[0] - */ - "smlal v8.4s, %[v4].4h, v0.h[5]\n" /* outr02 = v0 * r0[4] - */ - "smlal2 v9.4s, %[v4].8h, v0.h[5]\n" /* outr00 = v0 * r0[0] - */ - "smlal v10.4s, %[v4].4h, v0.h[7]\n" /* outr03 = v0 * r0[6] - */ - "smlal2 v11.4s, %[v4].8h, v0.h[7]\n" /* outr00 = v0 * r0[0] - */ - - "ldr q0, [%[r4]], #8 \n" /* load input r4 */ - - "smlal v12.4s, %[v4].4h, v1.h[1]\n" /* outr10 = v0 * r2[0] - */ - "smlal2 v13.4s, %[v4].8h, v1.h[1]\n" /* outr11 = v0 * r2[2] - */ - "smlal v14.4s, %[v4].4h, v1.h[3]\n" /* outr12 = v0 * r2[4] - */ - "smlal2 v15.4s, %[v4].8h, v1.h[3]\n" /* outr13 = v0 * r2[6] - */ - "sshll v0.8h, v0.8b, #0 \n" /* r4 : int8 -> int16 */ - "smlal v16.4s, %[v4].4h, v1.h[5]\n" /* outr10 = v0 * r2[0] - */ - "smlal2 v17.4s, %[v4].8h, v1.h[5]\n" /* outr11 = v0 * r2[2] - */ - "smlal v18.4s, %[v4].4h, v1.h[7]\n" /* outr12 = v0 * r2[4] - */ - "smlal2 v19.4s, %[v4].8h, v1.h[7]\n" /* outr13 = v0 * r2[6] - */ - - "ldr q2, [%[r4]] \n" /* load r4, 9th - data,v10.s[0] */ - "sshll v2.8h, v2.8b, #0 \n" /* r4 : int8 -> int16 */ - - "ldp q1, q3, [%[ptr_out0]] \n" /* load ptr_out + 0 -> - q2, q3 */ - "ldp q20, q21, [%[ptr_out0], #32]\n" /* load ptr_out + 32 -> - q4, q5 */ - - "add v4.4s, v1.4s , v4.4s \n" /* v10 = outr00[0].low - + q2 */ - "add v5.4s, v3.4s , v5.4s \n" /* v11 = outr00[0].high - + q3 */ - "add v6.4s, v20.4s, v6.4s \n" /* v12 = outr01[0].low - + q4 */ - "add v7.4s, v21.4s, v7.4s \n" /* v13 = outr01[0].high - + q5 */ - - "ldp q1 , q3 , [%[ptr_out0], #64]\n" /* load ptr_out + 64 -> - q6, q7 */ - "ldp q20, q21, [%[ptr_out0], #96]\n" /* load ptr_out + 96 -> - q8, q9 */ - - "stp q4, q5 , [%[ptr_out0]], #32\n" /* store q10, q11 -> - ptr_out */ - "stp q6, q7 , [%[ptr_out0]], #32\n" /* store q10, q11 -> - ptr_out */ - - "add v8.4s , v1.4s , v8.4s \n" /* v10 = outr00[0].low - + q2 */ - "add v9.4s , v3.4s , v9.4s \n" /* v11 = outr00[0].high - + q3 */ - "add v10.4s, v20.4s, v10.4s \n" /* v12 = outr01[0].low - + q4 */ - "add v11.4s, v21.4s, v11.4s \n" /* v13 = outr01[0].high - + q5 */ - "stp q8, q9, [%[ptr_out0]], #32\n" /* store q14, q15 -> - ptr_out += 64 */ - "stp q10, q11, [%[ptr_out0]], #32\n" /* store q16, q17 -> - ptr_out += 96 */ - - /* r4, mul w08 */ - "smlal v12.4s, %[v8].4h, v0.h[2]\n" /* outr00 = v0 * r0[0] - */ - "smlal2 v13.4s, %[v8].8h, v0.h[2]\n" /* outr00 = v0 * r0[0] - */ - "smlal v14.4s, %[v8].4h, v0.h[4]\n" /* outr01 = v0 * r0[2] - */ - "smlal2 v15.4s, %[v8].8h, v0.h[4]\n" /* outr00 = v0 * r0[0] - */ - - "smlal v16.4s, %[v8].4h, v0.h[6]\n" /* outr02 = v0 * r0[4] - */ - "smlal2 v17.4s, %[v8].8h, v0.h[6]\n" /* outr00 = v0 * r0[0] - */ - "smlal v18.4s, %[v8].4h, v2.h[0]\n" /* outr03 = v0 * r0[6] - */ - "smlal2 v19.4s, %[v8].8h, v2.h[0]\n" /* outr00 = v0 * r0[0] - */ - - /* r4, mul w07 */ - "smlal v12.4s, %[v7].4h, v0.h[1]\n" /* outr00 = v0 * r0[0] - */ - "smlal2 v13.4s, %[v7].8h, v0.h[1]\n" /* outr00 = v0 * r0[0] - */ - "smlal v14.4s, %[v7].4h, v0.h[3]\n" /* outr01 = v0 * r0[2] - */ - "smlal2 v15.4s, %[v7].8h, 
v0.h[3]\n" /* outr00 = v0 * r0[0] - */ - - "ldr q1, [%[r2]], #8 \n" /* load input r2 */ - - "smlal v16.4s, %[v7].4h, v0.h[5]\n" /* outr02 = v0 * r0[4] - */ - "smlal2 v17.4s, %[v7].8h, v0.h[5]\n" /* outr00 = v0 * r0[0] - */ - "smlal v18.4s, %[v7].4h, v0.h[7]\n" /* outr03 = v0 * r0[6] - */ - "smlal2 v19.4s, %[v7].8h, v0.h[7]\n" /* outr00 = v0 * r0[0] - */ - - "sshll v1.8h, v1.8b, #0 \n" /* r2: int8 -> int16 - */ - - /* r4, mul w06 */ - "ldp q4, q5, [%[ptr_out1]] \n" /* load ptr_out + 0 -> - q2, q3 */ - - "smlal v12.4s, %[v6].4h, v0.h[0]\n" /* outr00 = v0 * r0[0] - */ - "smlal2 v13.4s, %[v6].8h, v0.h[0]\n" /* outr00 = v0 * r0[0] - */ - "smlal v14.4s, %[v6].4h, v0.h[2]\n" /* outr01 = v0 * r0[2] - */ - - "ldp q8, q9, [%[ptr_out1], #64]\n" /* load ptr_out + 64 -> - q6, q7 */ - - "smlal2 v15.4s, %[v6].8h, v0.h[2]\n" /* outr00 = v0 * r0[0] - */ - "smlal v16.4s, %[v6].4h, v0.h[4]\n" /* outr02 = v0 * r0[4] - */ - "smlal2 v17.4s, %[v6].8h, v0.h[4]\n" /* outr00 = v0 * r0[0] - */ - - "ldp q10, q11, [%[ptr_out1], #96]\n" /* load ptr_out + 96 -> - q8, q9 */ - - "smlal v18.4s, %[v6].4h, v0.h[6]\n" /* outr03 = v0 * r0[6] - */ - "smlal2 v19.4s, %[v6].8h, v0.h[6]\n" /* outr00 = v0 * r0[0] - */ - - "ldr q0, [%[r0]], #8 \n" /* load input r2 */ - "ldp q6, q7, [%[ptr_out1], #32]\n" /* load ptr_out + 32 -> - q4, q5 */ - - "sshll v0.8h, v0.8b, #0 \n" /* r0: int8 -> int16 */ - - /* store outr1 */ - "add v12.4s, v4.4s , v12.4s\n" /* v10 = outr10[0].low + q2 */ - "add v13.4s, v5.4s , v13.4s\n" /* v11 = outr10[0].high + q3 */ - "add v14.4s, v6.4s , v14.4s\n" /* v12 = outr11[0].low + q4 */ - "add v15.4s, v7.4s , v15.4s\n" /* v13 = outr11[0].high + q5 */ - - "stp q12, q13, [%[ptr_out1]], #32\n" /* store q10, q11 -> - ptr_out */ - - "add v16.4s, v8.4s , v16.4s\n" /* v14 = outr12[0].low + q6 */ - "add v17.4s, v9.4s , v17.4s\n" /* v15 = outr12[0].high + q7 */ - - "stp q14, q15, [%[ptr_out1]], #32\n" /* store q12, q13 -> - ptr_out += 32 */ - - "add v18.4s, v10.4s, v18.4s\n" /* v16 = outr13[0].low + q8 */ - "add v19.4s, v11.4s, v19.4s\n" /* v17 = outr13[0].high + q9 */ - - "stp q16, q17, [%[ptr_out1]], #32\n" /* store q14, q15 -> - ptr_out += 64 */ - "stp q18, q19, [%[ptr_out1]], #32\n" /* store q16, q17 -> - ptr_out += 96 */ - - "bne 1b \n" /* jump to main loop */ - - : [cnt] "+r"(cnt), - [r0] "+r"(r0), - [r1] "+r"(r1), - [r2] "+r"(r2), - [r3] "+r"(r3), - [r4] "+r"(r4), - [ptr_out0] "+r"(ptr_out0), - [ptr_out1] "+r"(ptr_out1) - : [v0] "w"(v0), - [v1] "w"(v1), - [v2] "w"(v2), - [v3] "w"(v3), - [v4] "w"(v4), - [v5] "w"(v5), - [v6] "w"(v6), - [v7] "w"(v7), - [v8] "w"(v8) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22"); - - wc0 += 9 * hout_c_block; - inr0 += win_round; - inr1 += win_round; - inr2 += win_round; - inr3 += win_round; - inr4 += win_round; - } - block_inr0 = block_inr4; - block_inr1 = block_inr0 + in_len; - block_inr2 = block_inr1 + in_len; - block_inr3 = block_inr2 + in_len; - block_inr4 = block_inr3 + in_len; - } - if (out_type == PRECISION(kFloat)) { - write_to_output_c8_int32_1(pre_out, - reinterpret_cast(dout_batch), - hout_c_block, - 2, - c, - c + hout_c_block, - h, - h + h_kernel, - 0, - wout_round, - chout, - hout, - wout, - flag_relu, - reinterpret_cast(ptr_write), - &scale[c], - out_type); - } else if (out_type == PRECISION(kInt8)) { - write_to_output_c8_int32_1(pre_out, - dout_batch, - hout_c_block, - 2, - c, - c + hout_c_block, - h, - h 
+ h_kernel,
-                                   0,
-                                   wout_round,
-                                   chout,
-                                   hout,
-                                   wout,
-                                   flag_relu,
-                                   reinterpret_cast<int8_t*>(ptr_write),
-                                   &scale[c],
-                                   out_type);
-      } else {
-        write_to_output_c8_int32(pre_out,
-                                 reinterpret_cast<int*>(dout_batch),
-                                 hout_c_block,
-                                 2,
-                                 c,
-                                 c + hout_c_block,
-                                 h,
-                                 h + h_kernel,
-                                 0,
-                                 wout_round,
-                                 chout,
-                                 hout,
-                                 wout,
-                                 flag_relu,
-                                 ptr_write);
-        }
-      }
-    }
-  }
-}
-
-#else  // __aarch64__
-int conv_3x3s2_direct_int8_c_num() { return 4; }
-void conv_3x3s2_direct_int8(const int8_t* din,
-                            int32_t* dout,
-                            int num,
-                            int chout,
-                            int hout,
-                            int wout,
-                            int chin,
-                            int hin,
-                            int win,
-                            const int8_t* weights,
-                            const int32_t* bias,
-                            const operators::ConvParam& param,
-                            Context<TARGET(kARM)>* ctx,
-                            PrecisionType out_type,
-                            const float* scale) {
-  //! 3x3s2 int8 convolution, implemented by direct algorithm
-  //! prepack input to tmp buffer
-  //! write output to tmp buffer
-  int threads = ctx->threads();
-  int stride_w = param.strides[1];
-  int pad_w = param.paddings[1];
-  int pad_h = param.paddings[0];
-  bool flag_relu = param.fuse_relu;
-  bool flag_bias = (param.bias != nullptr);
-
-  //! set 2/3 l2 cache
-  int l2_size = ctx->llc_size() / 3 * 2;
-  const int hout_c_block = 4;
-  const int hout_r_kernel = 1;
-  const int wout_round = ((wout + 3) / 4) * 4;
-  const int win_round = wout_round * stride_w + 1;
-
-  //! get h block
-  //! win_round * chin * hin_r_block * sizeof(int8_t) + wout_round *
-  //! hout_c_block * hout_r_block * threads * sizeof(int32_t) = l2_size
-  //! win_round = 2 * wout_round + 1
-  //! hin_r_block = 2 * hout_r_block + 1
-  int hout_r_block =
-      (l2_size - 2 * wout_round * chin - chin) /
-      ((4 * wout_round + 2) * chin + wout_round * hout_c_block * threads * 4);
-  hout_r_block = hout_r_block > hout ? hout : hout_r_block;
-  hout_r_block = (hout_r_block / hout_r_kernel) * hout_r_kernel;
-  hout_r_block = hout_r_block < hout_r_kernel ? hout_r_kernel : hout_r_block;
-
-  const int hin_r_block = hout_r_block * 2 + 1;
-
-  int8_t* tmp_work_space = ctx->workspace_data<int8_t>();
-  int zero_size = chout > (win_round + 3) / 4 ? chout : (win_round + 3) / 4;
-  const int kZeroSize = zero_size;
-  int32_t ptr_zero[kZeroSize];
-  memset(ptr_zero, 0, sizeof(int32_t) * zero_size);
-  const int kWoutRound = wout_round;
-  int32_t ptr_write[kWoutRound];
-
-  int in_len = win_round * chin;
-  int pre_in_size = hin_r_block * in_len;
-  int pre_out_size = hout_c_block * hout_r_block * wout_round;
-
-  //!
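The hout_r_block expression above is just the cache-budget equation from the comment solved for the block height. A standalone sketch of the same arithmetic (editor's illustration; the names mirror the locals above):

    // Solve for h in:
    //   win_round * chin * hin_r_block * sizeof(int8_t)
    //     + wout_round * hout_c_block * h * threads * sizeof(int32_t) = l2_size
    // with win_round = 2 * wr + 1 and hin_r_block = 2 * h + 1:
    //   (2*wr + 1) * chin * (2*h + 1) + 4 * wr * c_blk * threads * h = l2_size
    //   h * ((4*wr + 2) * chin + 4 * wr * c_blk * threads)
    //     = l2_size - 2 * wr * chin - chin
    static int h_block(int l2_size, int wr, int chin, int c_blk, int threads) {
      int h = (l2_size - 2 * wr * chin - chin) /
              ((4 * wr + 2) * chin + wr * c_blk * threads * 4);
      return h < 1 ? 1 : h;  // the code above also clamps to [1, hout]
    }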
l2_cache start - int8_t* pre_din = tmp_work_space; - - int size_in_channel = win * hin; - int size_out_channel = wout * hout; - int w_stride = chin * 9; - - int ws = -pad_w; - int we = ws + win_round; - int w_loop = wout_round / 4; - - int out_row_stride = hout_c_block * wout_round; - - for (int n = 0; n < num; ++n) { - const int8_t* din_batch = din + n * chin * size_in_channel; - int8_t* dout_batch = - reinterpret_cast(dout) + - n * chout * size_out_channel * PrecisionTypeLength(out_type); - for (int h = 0; h < hout; h += hout_r_block) { - int h_kernel = hout_r_block; - if (h + hout_r_block > hout) { - h_kernel = hout - h; - } - int hs = h * 2 - pad_h; - int he = hs + h_kernel * 2 + 1; - prepack_input_nxw(din_batch, - pre_din, - 0, - chin, - hs, - he, - ws, - we, - chin, - win, - hin, - reinterpret_cast(ptr_zero)); - - const int8_t* cblock_inr0 = pre_din; - const int8_t* cblock_inr1 = cblock_inr0 + in_len; - const int8_t* cblock_inr2 = cblock_inr1 + in_len; -#pragma omp parallel for num_threads(threads) - for (int c = 0; c < chout; c += hout_c_block) { -#ifdef ARM_WITH_OMP - int32_t* pre_out = - reinterpret_cast(pre_din + (pre_in_size + 3) / 4 * 4) + - omp_get_thread_num() * pre_out_size; -#else - int32_t* pre_out = - reinterpret_cast(pre_din + (pre_in_size + 3) / 4 * 4); -#endif - const int8_t* block_inr0 = cblock_inr0; - const int8_t* block_inr1 = cblock_inr1; - const int8_t* block_inr2 = cblock_inr2; - - const int8_t* weight_c = weights + c * w_stride; - const int32_t* bias_ptr = ptr_zero; - if (flag_bias) { - bias_ptr = bias + c; - } - - fill_packed_bias_nxmw_int8(bias_ptr, pre_out, 4, h_kernel, wout_round); - for (int hk = 0; hk < h_kernel; hk += hout_r_kernel) { - const int8_t* wc0 = weight_c; - - const int8_t* inr0 = block_inr0; - const int8_t* inr1 = block_inr1; - const int8_t* inr2 = block_inr2; - - int32_t* pre_out0 = pre_out + hk * out_row_stride; - for (int i = 0; i < chin; ++i) { - const int8_t* r0 = inr0; - const int8_t* r1 = inr1; - const int8_t* r2 = inr2; - - int32_t* ptr_out0 = pre_out0; - const signed char* ptr_wc0 = wc0; - int cnt = w_loop; - asm volatile( - "vld1.s32 {d0-d3}, [%[wc0]]! \n" /* w0-w7 */ - "vld1.s32 {d4}, [%[wc0]]! \n" /* w8 */ - "vmovl.s8 q3, d0 \n" /* q3 = w0, w1 */ - "vmovl.s8 q4, d1 \n" /* q4 = w2 ,w3 */ - "vmovl.s8 q5, d2 \n" /* q5 = w4, w5 */ - "vmovl.s8 q6, d3 \n" /* q6 = w6, w7 */ - "vmovl.s8 q7, d4 \n" /* q7 = w8 */ - "vld1.s32 {d0}, [%[r0]]! \n" /* load input r0 -> d0 */ - "vmovl.s8 q0, d0 \n" /* movl d0 -> q0 */ - "1: \n" /* main loop */ - - /* r0 mul w0 */ - "vmull.s16 q8, d6, d0[0] \n" /* q8 = w0 * r0[0] */ - "vmull.s16 q9, d6, d0[2] \n" /* q9 = w0 * r0[2] */ - "vmull.s16 q10, d6, d1[0] \n" /* q10 = w0 * r0[4] */ - "vmull.s16 q11, d6, d1[2] \n" /* q11 = w0 * r0[6] */ - - "vld1.s32 {d2}, [%[r1]]! 
\n" /* load input r1 -> d2 */ - "vmovl.s8 q1, d2 \n" /* movl d2 -> q1 */ - - /* r0 mul w1 */ - "vmlal.s16 q8, d7, d0[1] \n" /* q8 = w1 * r0[1] */ - "vmlal.s16 q9, d7, d0[3] \n" /* q9 = w1 * r0[3] */ - "vmlal.s16 q10, d7, d1[1] \n" /* q10 = w1 * r0[5] */ - "vmlal.s16 q11, d7, d1[3] \n" /* q11 = w1 * r0[7] */ - - "vld1.s32 {d4}, [%[r0]] \n" /* load r0[8] -> d4 */ - "vmovl.s8 q2 , d4 \n" /* movl d4 -> q2 */ - - /* r0 mul w2 */ - "vmlal.s16 q8, d8, d0[2] \n" /* q8 = w2 * r0[2] */ - "vmlal.s16 q9, d8, d1[0] \n" /* q9 = w2 * r0[4] */ - "vmlal.s16 q10, d8, d1[2] \n" /* q10 = w2 * r0[6] */ - "vmlal.s16 q11, d8, d4[0] \n" /* q11 = w2 * r0[8] */ - - "subs %[cnt], #1 \n" /* loop count -1 */ - - /* r1 mul w3 */ - "vmlal.s16 q8, d9, d2[0] \n" /* q8 = w3 * r1[0] */ - "vmlal.s16 q9, d9, d2[2] \n" /* q9 = w3 * r1[2] */ - "vmlal.s16 q10, d9, d3[0] \n" /* q10 = w3 * r1[4] */ - "vmlal.s16 q11, d9, d3[2] \n" /* q11 = w3 * r1[6] */ - - "vld1.s32 {d4}, [%[r2]]! \n" /* load input r2 -> d4*/ - "vmovl.s8 q2, d4 \n" /* movl d4 -> q2 */ - - /* r1 mul w4 */ - "vmlal.s16 q8, d10, d2[1] \n" /* q8 = w4 * r1[1] */ - "vmlal.s16 q9, d10, d2[3] \n" /* q9 = w4 * r1[3] */ - "vmlal.s16 q10, d10, d3[1] \n" /* q10 = w4 * r1[5] */ - "vmlal.s16 q11, d10, d3[3] \n" /* q11 = w4 * r1[7] */ - - "vld1.s32 {d0}, [%[r1]] \n" /* load r1[8] -> d0 */ - "vmovl.s8 q0, d0 \n" /* movl d0 -> q0 */ - - /* r1 mul w5 */ - "vmlal.s16 q8, d11, d2[2] \n" /* q8 = w5 * r1[2] */ - "vmlal.s16 q9, d11, d3[0] \n" /* q9 = w5 * r1[4] */ - "vmlal.s16 q10, d11, d3[2] \n" /* q10 = w5 * r1[6] */ - "vmlal.s16 q11, d11, d0[0] \n" /* q11 = w5 * r1[8] */ - - /* r2 mul w6 */ - "vmlal.s16 q8, d12, d4[0] \n" /* q8 = w6 * r2[0] */ - "vmlal.s16 q9, d12, d4[2] \n" /* q9 = w6 * r2[2] */ - "vmlal.s16 q10, d12, d5[0] \n" /* q10 = w6 * r2[4] */ - "vmlal.s16 q11, d12, d5[2] \n" /* q11 = w6 * r2[6] */ - - "vld1.s32 {d24-d27}, [%[ptr_out0]] \n" /* load output -> q12, - q13 */ - - /* r2 mul w7 */ - "vmlal.s16 q8, d13, d4[1] \n" /* q8 = w7 * r2[1] */ - "vmlal.s16 q9, d13, d4[3] \n" /* q9 = w7 * r2[3] */ - "vmlal.s16 q10, d13, d5[1] \n" /* q10 = w7 * r2[5] */ - "vmlal.s16 q11, d13, d5[3] \n" /* q11 = w7 * r2[7] */ - - "vld1.s32 {d0}, [%[r2]] \n" /* load r2[8] -> d0 */ - "vmovl.s8 q0, d0 \n" /* movl d0 -> q0 */ - - /* r2 mul w8 */ - "vmlal.s16 q8, d14, d4[2] \n" /* q8 = w8 * r2[2] */ - "vmlal.s16 q9, d14, d5[0] \n" /* q9 = w8 * r2[4] */ - "vmlal.s16 q10, d14, d5[2] \n" /* q10 = w8 * r2[6] */ - "vmlal.s16 q11, d14, d0[0] \n" /* q11 = w8 * r2[8] */ - - "vadd.s32 q12, q8, q12 \n" /* out[0] += q8 */ - "vadd.s32 q13, q9, q13 \n" /* out[1] += q9 */ - "vst1.s32 {d24-d27}, [%[ptr_out0]]! \n" /* store q12, q13 -> - output[0,1] */ - - "vld1.s32 {d0}, [%[r0]]! \n" /* load next input r0 -> d0*/ - "vmovl.s8 q0, d0 \n" /* movl d0 -> q0 */ - - "vld1.s32 {d28-d31}, [%[ptr_out0]] \n" /* load output[0,1] -> - q14, q15 */ - "vadd.s32 q14, q10, q14 \n" /* out[2] += q10 */ - "vadd.s32 q15, q11, q15 \n" /* out[3] += q11 */ - "vst1.s32 {d28-d31}, [%[ptr_out0]]! 
\n" /* store q14, q15 -> - output[2,3] */ - - "bne 1b \n" /* jump to main loop */ - - : [cnt] "+r"(cnt), - [r0] "+r"(r0), - [r1] "+r"(r1), - [r2] "+r"(r2), - [ptr_out0] "+r"(ptr_out0), - [wc0] "+r"(ptr_wc0) - : - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - wc0 += 9 * hout_c_block; - inr0 += win_round; - inr1 += win_round; - inr2 += win_round; - } - block_inr0 = block_inr2; - block_inr1 = block_inr0 + in_len; - block_inr2 = block_inr1 + in_len; - } - if (out_type == PRECISION(kFloat)) { - write_to_output_c4_int32_1(pre_out, - reinterpret_cast(dout_batch), - hout_c_block, - 1, - c, - c + hout_c_block, - h, - h + h_kernel, - 0, - wout_round, - chout, - hout, - wout, - flag_relu, - reinterpret_cast(ptr_write), - &scale[c], - out_type); - } else if (out_type == PRECISION(kInt8)) { - write_to_output_c4_int32_1(pre_out, - dout_batch, - hout_c_block, - 1, - c, - c + hout_c_block, - h, - h + h_kernel, - 0, - wout_round, - chout, - hout, - wout, - flag_relu, - reinterpret_cast(ptr_write), - &scale[c], - out_type); - } else { - write_to_output_c4_int32(pre_out, - reinterpret_cast(dout_batch), - hout_c_block, - 1, - c, - c + hout_c_block, - h, - h + h_kernel, - 0, - wout_round, - chout, - hout, - wout, - flag_relu, - ptr_write); - } - } - } - } -} -#endif // __aarch64__ - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_block_utils.h b/lite/backends/arm/math/conv_block_utils.h deleted file mode 100644 index 3deb6bcb5f..0000000000 --- a/lite/backends/arm/math/conv_block_utils.h +++ /dev/null @@ -1,4292 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include "lite/backends/arm/math/saturate.h" -#include "lite/backends/arm/math/type_trans.h" -#include "lite/core/target_wrapper.h" -#include "lite/utils/cp_logging.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -#define LITEMAX(a, b) ((a) > (b) ? 
(a) : (b))
-
-inline void fill_packed_biasc4(float* dout, const float* bias, int size) {
-  float32x4_t vb = vld1q_f32(bias);
-  int cnt = size / 4;
-  for (int i = 0; i < cnt; ++i) {
-    vst1q_f32(dout, vb);
-    dout += 4;
-  }
-}
-
-/* preprocessing weights
-* input weights: [chout, chin / group, kh, kw] --> output weights:
-* [chout / n, chin / group, kh, kw, n]
-*/
-template <typename dtype>
-static bool conv_trans_weights_numc(const dtype* din,
-                                    dtype* dout,
-                                    int chout,
-                                    int chin,
-                                    int n,
-                                    int kernel_size) {
-  if (n <= 0) {
-    LOG(ERROR) << "ch_n and hei_n must be greater than zero";
-    return false;
-  }
-  int c_loop = chout / n;
-  int chout_round = (chout + n - 1) / n;
-  int win_stride = chin * kernel_size;
-  int wout_stride = n * win_stride;
-  int co = 0;
-  for (; co < c_loop; ++co) {
-    dtype* dout_c = dout + co * wout_stride;
-    const dtype* din_array[n];
-    din_array[0] = din + co * wout_stride;
-    for (int i = 1; i < n; i++) {
-      din_array[i] = din_array[i - 1] + win_stride;
-    }
-    for (int ci = 0; ci < chin; ++ci) {
-      for (int k = 0; k < kernel_size; ++k) {
-        for (int i = 0; i < n; i++) {
-          *(dout_c++) = *(din_array[i]++);
-        }
-      }
-    }
-  }
-  // pad final chout
-  if (chout_round > c_loop) {
-    dtype* dout_c = dout + c_loop * wout_stride;
-    const dtype* din_array[n];
-    din_array[0] = din + c_loop * wout_stride;
-    for (int i = 1; i < n; i++) {
-      din_array[i] = din_array[i - 1] + win_stride;
-    }
-    // remainder: point the tail slots back at the first channel
-    // (their outputs fall beyond chout and are discarded)
-    int cremain = chout_round * n - chout;
-    for (int i = 1; i <= cremain; i++) {
-      din_array[n - i] = din_array[0];
-    }
-    for (int ci = 0; ci < chin; ++ci) {
-      for (int k = 0; k < kernel_size; ++k) {
-        for (int i = 0; i < n; i++) {
-          *(dout_c++) = *(din_array[i]++);
-        }
-      }
-    }
-  }
-  return true;
-}
-/* preprocessing inputs
-* input din: [1, chin, he - hs, we - ws] --> output dout: [n, chin, 1, we - ws]
-* n = he - hs
-*/
-template <typename dtype>
-static bool prepack_input_nxw(const dtype* din,
-                              dtype* dout,
-                              int cs,
-                              int ce,
-                              int hs,
-                              int he,
-                              int ws,
-                              int we,
-                              int channel,
-                              int width,
-                              int height,
-                              dtype* zero_ptr) {
-  int n = he - hs;
-  if (n <= 0) {
-    LOG(ERROR) << "hei_n must be greater than zero";
-    return false;
-  }
-  int w0 = ws < 0 ? 0 : ws;
-  int w1 = we > width ? width : we;
-
-  int size_w = we - ws;
-  int size_wc_len = size_w * channel;
-  int size_c = width * height;
-
-  int valid_w = w1 - w0;
-  size_t valid_w_byte = valid_w * sizeof(dtype);
-
-  dtype* out_array[n];
-  out_array[0] = dout;
-  for (int i = 1; i < n; i++) {
-    out_array[i] = out_array[i - 1] + size_wc_len;
-  }
-
-  for (int c = 0; c < channel; ++c) {
-    int j = 0;
-    // valid height
-    for (int i = hs; i < he; i++) {
-      // get address: rows outside the image read from the zero buffer
-      const dtype* in_array;
-      if (i < 0 || i >= height) {
-        in_array = zero_ptr;
-      } else {
-        in_array = din + i * width;
-      }
-
-      for (int w = ws; w < w0; ++w) {
-        *(out_array[j]++) = 0.f;
-      }
-      memcpy(out_array[j], in_array, valid_w_byte);
-      out_array[j] += valid_w;
-      for (int w = w1; w < we; ++w) {
-        *(out_array[j]++) = 0.f;
-      }
-      j++;
-    }
-    din += size_c;
-  }
-  return true;
-}
-
-/* write result to output
-* input din: [n, c, h, w], output dout: [n, c, h, w]
-*/
-inline bool write_to_output_c1_fp32(const float* din,
-                                    float* dout,
-                                    int cs,
-                                    int ce,
-                                    int hs,
-                                    int he,
-                                    int ws,
-                                    int we,
-                                    int channel,
-                                    int height,
-                                    int width,
-                                    bool flag_relu,
-                                    float* trash_ptr) {
-  if (cs > channel) {
-    return true;
-  }
-
-  const int c1 = 1;
-  const int w4 = 4;
-
-  int size_c_out = width * height;
-
-  float* doutc0r0 = dout + cs * size_c_out + hs * width + ws;
-
-  const float* ptr_din = din;
-
-  int size_h = (he > height ?
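conv_trans_weights_numc above interleaves blocks of n output channels so that a single contiguous weight vector feeds n accumulators in the kernels. Its pointer walk is equivalent to this index form (editor's sketch, illustrative only):

    // [chout][chin][kernel_size] --> [chout/n][chin][kernel_size][n]
    for (int co = 0; co < chout / n; ++co)
      for (int ci = 0; ci < chin; ++ci)
        for (int k = 0; k < kernel_size; ++k)
          for (int i = 0; i < n; ++i)
            dout[((co * chin + ci) * kernel_size + k) * n + i] =
                din[((co * n + i) * chin + ci) * kernel_size + k];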
height : he) - hs; // size_h == hei_n - - int w_round = we - ws; - int cnt = (width - ws) / w4; - - for (int i = 0; i < size_h; i++) { - int size_w = i * width; - float* doutc0_ptr = doutc0r0 + size_w; // doutc0r0 + width; - const float* din_hei_ptr = ptr_din + i * w_round * c1; - if (cnt > 0) { - int cnt_loop = cnt; - if (flag_relu) { -#ifdef __aarch64__ - asm volatile( - "ldr q0, [%[ptr_din]], #16 \n" /* load data, c0r0, c0r1, c0r2, - c0r3 */ - "movi v20.4s, #0 \n" /* for relu */ - "1: \n" /* main loop*/ - "fmax v1.4s, v0.4s, v20.4s \n" /*relu*/ - "ldr q0, [%[ptr_din]], #16 \n" /* load data, c0r0, c0r1, c0r2, - c0r3 */ - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "str q1, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "bne 1b \n" /* jump to main loop*/ - : [doutc0r0] "+r"(doutc0_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v0", "v1", "v20"); -#else - asm volatile( - "vld1.32 {d0-d1}, [%[ptr_din]]! @ load data, c0r0, " - "c1r0, c0r1, c1r1, , c0r2, c1r2, c0r3, c1r3\n" - "vmov.u32 q15, #0 @ dump zero\n" - "1: @ main loop\n" - - "vmax.f32 q1, q0, q15 @ relu\n" - "vld1.32 {d0-d1}, [%[ptr_din]]! @ load data \n" - - "vst1.32 {d2-d3}, [%[doutc0r0]]! @ store result, add " - "pointer\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q15"); -#endif - } else { -#ifdef __aarch64__ - asm volatile( - "ldr q0, [%[ptr_din]], #16 \n" /* load data, c0r0, c0r1, c0r2, - c0r3 */ - "1: \n" /* main loop*/ - "str q0, [%[doutc0r0]], #16 \n" /* store c2r0*/ - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "ldr q0, [%[ptr_din]], #16 \n" /* load data, c0r0, c0r1, c0r2, - c0r3 */ - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v0"); -#else - asm volatile( - "vld1.32 {d0-d1}, [%[ptr_din]]! @ load data, c0r0, " - "c0r1, c0r2, c0r3\n" - "1: @ main loop\n" - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add " - "pointer\n" - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - "vld1.32 {d0-d1}, [%[ptr_din]]! @ load data \n" - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0"); -#endif - } - } - if (we > width) { - int offset = i * w_round * c1 + c1 * w4 * cnt; - din_hei_ptr = ptr_din + offset; - int j = we - w4; - if (flag_relu) { - for (; j < width; ++j) { - *(doutc0_ptr++) = LITEMAX(din_hei_ptr[0], 0.f); - din_hei_ptr++; - } - } else { - for (; j < width; ++j) { - *(doutc0_ptr++) = *(din_hei_ptr++); - } - } - } - } - return true; -} - -/*wirte result in outputs -* input din: [n, c / 4, h, w * 4], output dout: [n, c, h, w] -*/ -inline bool write_to_output_c2_fp32(const float* din, - float* dout, - int cs, - int ce, - int hs, - int he, - int ws, - int we, - int channel, - int height, - int width, - bool flag_relu, - float* trash_ptr) { - if (cs > channel) { - return true; - } - - const int c2 = 2; - const int w4 = 4; - - // float trash_ptr[width]; - - int size_c_out = width * height; - - float* doutc0r0 = dout + cs * size_c_out + hs * width + ws; - float* doutc1r0 = doutc0r0 + size_c_out; - - const float* ptr_din = din; - - int size_h = (he > height ? 
height : he) - hs; // size_h == hei_n - - int w_round = we - ws; - int cnt = (width - ws) / w4; - - for (int i = 0; i < size_h; i++) { - int size_w = i * width; - float* doutc0_ptr = doutc0r0 + size_w; // doutc0r0 + width; - float* doutc1_ptr = doutc1r0 + size_w; - if (ce > channel) { - switch (ce - channel) { - case 1: - doutc1_ptr = trash_ptr; - default: - break; - } - } - const float* din_hei_ptr = ptr_din + i * w_round * c2; - if (cnt > 0) { - int cnt_loop = cnt; - if (flag_relu) { -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load data, c0r0, c1r0, c0r1, - c1r1, , c0r2, c1r2, c0r3, - c1r3 */ - "movi v20.4s, #0 \n" /* for relu */ - "1: \n" /* main loop*/ - "trn1 v2.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "trn2 v3.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load data, c0r0, c1r0, c0r1, - c1r1, , c0r2, c1r2, c0r3, - c1r3 */ - "trn1 v4.2d, v2.2d, v3.2d \n" /* trans q8, q10*/ - "trn2 v5.2d, v2.2d, v3.2d \n" /* trans q8, q10*/ - - "fmax v2.4s, v4.4s, v20.4s \n" /*relu*/ - "fmax v3.4s, v5.4s, v20.4s \n" /*relu*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - - "str q2, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q3, [%[doutc1r0]], #16 \n" /* store c2r0*/ - - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v20"); -#else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @ load data, c0r0, " - "c1r0, c0r1, c1r1, , c0r2, c1r2, c0r3, c1r3\n" - "vmov.u32 q15, #0 @ dump zero\n" - "1: @ main loop\n" - "vtrn.32 d0, d1 @ trans data:c0r0, c0r1, " - "c1r0, c1r1 \n" - "vtrn.32 d2, d3 @ trans data:c0r2, c0r3, " - "c1r2, c1r3 \n" - - "vswp d1, d2 @ swap data\n" - - "vmax.f32 q0, q0, q15 @ relu\n" - "vmax.f32 q1, q1, q15 @ relu\n" - - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d2-d3}, [%[doutc1r0]]! @ store result, add " - "pointer\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! @ load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3", "q15"); -#endif - } else { -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load data, c0r0, c1r0, c0r1, - c1r1, , c0r2, c1r2, c0r3, - c1r3 */ - "1: \n" /* main loop*/ - "trn1 v2.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "trn2 v3.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load data, c0r0, c1r0, c0r1, - c1r1, , c0r2, c1r2, c0r3, - c1r3 */ - "trn1 v4.2d, v2.2d, v3.2d \n" /* trans q8, q10*/ - "trn2 v5.2d, v2.2d, v3.2d \n" /* trans q8, q10*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - - "str q4, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q5, [%[doutc1r0]], #16 \n" /* store c2r0*/ - - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v0", "v1", "v2", "v3", "v4", "v5"); -#else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @ load data, c0r0, " - "c1r0, c0r1, c1r1, , c0r2, c1r2, c0r3, c1r3\n" - "1: @ main loop\n" - "vtrn.32 d0, d1 @ trans data:c0r0, c0r1, " - "c1r0, c1r1 \n" - "vtrn.32 d2, d3 @ trans data:c0r2, c0r3, " - "c1r2, c1r3 \n" - - "vswp d1, d2 @ swap data\n" - - "vst1.32 {d0-d1}, [%[doutc0r0]]! 
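The vtrn/vswp (and trn1/trn2) sequences in write_to_output_c2_fp32 de-interleave the two channels of each quad. With intrinsics the same de-interleave falls out of a single structure load, on both armv7 and aarch64 (editor's sketch, not the file's code):

    #include <arm_neon.h>

    // in holds c0r0, c1r0, c0r1, c1r1, c0r2, c1r2, c0r3, c1r3.
    static inline void deinterleave_c2(const float* in, float* c0, float* c1) {
      float32x4x2_t v = vld2q_f32(in);  // de-interleaving load
      vst1q_f32(c0, v.val[0]);          // channel 0: r0..r3
      vst1q_f32(c1, v.val[1]);          // channel 1: r0..r3
    }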
@ store result, add " - "pointer\n" - "vst1.32 {d2-d3}, [%[doutc1r0]]! @ store result, add " - "pointer\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! @ load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3", "q15"); -#endif - } - } - if (we > width) { - int offset = i * w_round * c2 + c2 * w4 * cnt; - din_hei_ptr = ptr_din + offset; - int j = we - w4; - if (flag_relu) { - for (; j < width; ++j) { - *(doutc0_ptr++) = LITEMAX(din_hei_ptr[0], 0.f); - *(doutc1_ptr++) = LITEMAX(din_hei_ptr[1], 0.f); - din_hei_ptr += 2; - } - } else { - for (; j < width; ++j) { - *(doutc0_ptr++) = *(din_hei_ptr++); - *(doutc1_ptr++) = *(din_hei_ptr++); - } - } - } - } - return true; -} - -/*wirte result in outputs -* input din: [n, c / 4, h, w * 4], output dout: [n, c, h, w] -*/ -inline bool write_to_output_c4_fp32(const float* din, - float* dout, - int cs, - int ce, - int hs, - int he, - int ws, - int we, - int channel, - int height, - int width, - bool flag_relu, - float* trash_ptr) { - const int c4 = 4; - const int w4 = 4; - const int w_round = we - ws; - const int ch_n = ce - cs; - if (ch_n != 4) { - LOG(ERROR) << "write_to_output_c4_fp32 ch_n must be equal 4 and hei_n is " - "more than zero"; - return false; - } - int size_c_out = width * height; - - float* doutc0r0 = dout + cs * size_c_out + hs * width + ws; - float* doutc1r0 = doutc0r0 + size_c_out; - float* doutc2r0 = doutc1r0 + size_c_out; - float* doutc3r0 = doutc2r0 + size_c_out; - - const float* ptr_din = din; - - int size_h = (he > height ? height : he) - hs; // size_h == hei_n - - int cnt = (width - ws) / w4; - - for (int i = 0; i < size_h; i++) { - int size_w = i * width; - float* doutc0_ptr = doutc0r0 + size_w; // doutc0r0 + width; - float* doutc1_ptr = doutc1r0 + size_w; - float* doutc2_ptr = doutc2r0 + size_w; - float* doutc3_ptr = doutc3r0 + size_w; - if (ce > channel) { - switch (ce - channel) { - case 3: - doutc1_ptr = trash_ptr; - case 2: - doutc2_ptr = trash_ptr; - case 1: - doutc3_ptr = trash_ptr; - default: - break; - } - } - const float* din_hei_ptr = ptr_din + i * w_round * ch_n; - if (cnt > 0) { - int cnt_loop = cnt; - if (flag_relu) { -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "movi v20.4s, #0 \n" /* for relu */ - "1: \n" /* main loop*/ - "trn1 v8.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "trn2 v9.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "trn1 v10.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ - "trn2 v11.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "trn1 v16.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn2 v17.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn1 v18.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - "trn2 v19.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - "fmax v16.4s, v16.4s, v20.4s \n" /*relu*/ - "fmax v17.4s, v17.4s, v20.4s \n" /*relu*/ - "fmax v18.4s, v18.4s, v20.4s \n" /*relu*/ - "fmax v19.4s, v19.4s, v20.4s \n" /*relu*/ - "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ - "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ - "str q19, [%[doutc3r0]], #16 \n" /* store c3r0*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "bne 1b \n" /* jump to 
main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v16", - "v17", - "v18", - "v19", - "v20"); -#else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "vmov.u32 q15, #0 @ dump zero\n" - "1: @ main loop\n" - "vtrn.32 q0, q1 @ trans data:c00c01c20c21 " - "\n" - "vtrn.32 q2, q3 @ trans data:c02c03c22c23 " - "\n" - - "vswp d1, d4 @ swap data\n" - "vswp d3, d6 @ swap data\n" - - "vmax.f32 q0, q0, q15 @ relu\n" - "vmax.f32 q1, q1, q15 @ relu\n" - "vmax.f32 q2, q2, q15 @ relu\n" - "vmax.f32 q3, q3, q15 @ relu\n" - - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add pointer\n" - "vst1.32 {d2-d3}, [%[doutc1r0]]! @ store result, add pointer\n" - "vst1.32 {d4-d5}, [%[doutc2r0]]! @ store result, add pointer\n" - "vst1.32 {d6-d7}, [%[doutc3r0]]! @ store result, add pointer\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3", "q15"); -#endif - } else { -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "1: \n" /* main loop*/ - "trn1 v8.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "trn2 v9.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "trn1 v10.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ - "trn2 v11.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "trn1 v16.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn2 v17.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn1 v18.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - "trn2 v19.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ - "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ - "str q19, [%[doutc3r0]], #16 \n" /* store c3r0*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v0", - "v1", - "v2", - "v3", - "v8", - "v9", - "v10", - "v11", - "v16", - "v17", - "v18", - "v19"); -#else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "1: @ main loop\n" - "vtrn.32 q0, q1 @ trans data:c00c01c20c21 " - "\n" - "vtrn.32 q2, q3 @ trans data:c02c03c22c23 " - "\n" - - "vswp d1, d4 @ swap data\n" - "vswp d3, d6 @ swap data\n" - - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add pointer\n" - "vst1.32 {d2-d3}, [%[doutc1r0]]! @ store result, add pointer\n" - "vst1.32 {d4-d5}, [%[doutc2r0]]! @ store result, add pointer\n" - "vst1.32 {d6-d7}, [%[doutc3r0]]! @ store result, add pointer\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! 
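The trn1/trn2 ladder in write_to_output_c4_fp32 above is a full 4x4 transpose: four channel-interleaved quads become four per-channel rows (note the stores land in the order c0, c2, c1, c3). The same dance with aarch64 intrinsics (editor's sketch; transpose4x4 is a hypothetical helper):

    #include <arm_neon.h>

    // Transpose a 4x4 float tile held in r[0..3] (rows a, b, c, d).
    static inline void transpose4x4(float32x4_t r[4]) {
      float32x4_t t0 = vtrn1q_f32(r[0], r[1]);  // a0 b0 a2 b2
      float32x4_t t1 = vtrn2q_f32(r[0], r[1]);  // a1 b1 a3 b3
      float32x4_t t2 = vtrn1q_f32(r[2], r[3]);  // c0 d0 c2 d2
      float32x4_t t3 = vtrn2q_f32(r[2], r[3]);  // c1 d1 c3 d3
      float64x2_t d0 = vreinterpretq_f64_f32(t0);
      float64x2_t d1 = vreinterpretq_f64_f32(t1);
      float64x2_t d2 = vreinterpretq_f64_f32(t2);
      float64x2_t d3 = vreinterpretq_f64_f32(t3);
      r[0] = vreinterpretq_f32_f64(vtrn1q_f64(d0, d2));  // a0 b0 c0 d0
      r[1] = vreinterpretq_f32_f64(vtrn1q_f64(d1, d3));  // a1 b1 c1 d1
      r[2] = vreinterpretq_f32_f64(vtrn2q_f64(d0, d2));  // a2 b2 c2 d2
      r[3] = vreinterpretq_f32_f64(vtrn2q_f64(d1, d3));  // a3 b3 c3 d3
    }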
@load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3"); -#endif - } - } - if (we > width) { - int offset = i * w_round * c4 + c4 * w4 * cnt; - din_hei_ptr = ptr_din + offset; - int j = we - w4; - if (flag_relu) { - for (; j < width; ++j) { - *(doutc0_ptr++) = LITEMAX(din_hei_ptr[0], 0.f); - *(doutc1_ptr++) = LITEMAX(din_hei_ptr[1], 0.f); - *(doutc2_ptr++) = LITEMAX(din_hei_ptr[2], 0.f); - *(doutc3_ptr++) = LITEMAX(din_hei_ptr[3], 0.f); - din_hei_ptr += w4; - } - } else { - for (; j < width; ++j) { - *(doutc0_ptr++) = din_hei_ptr[0]; - *(doutc1_ptr++) = din_hei_ptr[1]; - *(doutc2_ptr++) = din_hei_ptr[2]; - *(doutc3_ptr++) = din_hei_ptr[3]; - din_hei_ptr += w4; - } - } - } - } - return true; -} - -/*wirte result in outputs -* input din: [n, c / 8, h, w * 8], output dout: [n, c, h, w] -*/ -inline bool write_to_output_c8_fp32(const float* din, - float* dout, - int ch_n, - int hei_n, - int cs, - int ce, - int hs, - int he, - int ws, - int we, - int channel, - int height, - int width, - bool flag_relu, - float* trash_ptr) { - if (ch_n != 8 || hei_n <= 0) { - LOG(ERROR) << "ch_n must be equal 8 and hei_n is more than zero"; - return false; - } - int size_c_out = width * height; - - float* doutc0r0 = dout + cs * size_c_out + hs * width + ws; - float* doutc1r0 = doutc0r0 + size_c_out; - float* doutc2r0 = doutc1r0 + size_c_out; - float* doutc3r0 = doutc2r0 + size_c_out; - float* doutc4r0 = doutc3r0 + size_c_out; - float* doutc5r0 = doutc4r0 + size_c_out; - float* doutc6r0 = doutc5r0 + size_c_out; - float* doutc7r0 = doutc6r0 + size_c_out; - - const float* ptr_din = din; - - int size_h = (he > height ? 
height : he) - hs; // size_h == hei_n - - int valid_w = we - ws; - int cnt = valid_w / 4; - - if (we > width) { - cnt--; - } - if (flag_relu) { - for (int i = 0; i < size_h; i++) { - int size_w = i * width; - float* doutc0_ptr = doutc0r0 + size_w; // doutc0r0 + width; - float* doutc1_ptr = doutc1r0 + size_w; - float* doutc2_ptr = doutc2r0 + size_w; - float* doutc3_ptr = doutc3r0 + size_w; - float* doutc4_ptr = doutc4r0 + size_w; - float* doutc5_ptr = doutc5r0 + size_w; - float* doutc6_ptr = doutc6r0 + size_w; - float* doutc7_ptr = doutc7r0 + size_w; - if (ce > channel) { - switch (ce - channel) { - case 7: - doutc1_ptr = trash_ptr; - case 6: - doutc2_ptr = trash_ptr; - case 5: - doutc3_ptr = trash_ptr; - case 4: - doutc4_ptr = trash_ptr; - case 3: - doutc5_ptr = trash_ptr; - case 2: - doutc6_ptr = trash_ptr; - case 1: - doutc7_ptr = trash_ptr; - default: - break; - } - } - ptr_din = din + i * valid_w * ch_n; - const float* din_hei_ptr = ptr_din; - if (cnt > 0) { - int cnt_loop = cnt; -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "movi v20.4s, #0 \n" /* for relu */ - "1: \n" /* main loop*/ - "trn1 v8.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ - "trn2 v9.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ - "trn1 v10.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ - "trn2 v11.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - - "trn1 v12.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ - "trn2 v13.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ - "trn1 v14.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ - "trn2 v15.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - - "trn1 v16.2d, v8.2d, v12.2d \n" /* trans q8, q10 00 01 02 03*/ - "trn2 v17.2d, v8.2d, v12.2d \n" /* trans q8, q10 20 21 22 23*/ - "trn1 v18.2d, v9.2d, v13.2d \n" /* trans q9, q11 10 11 12 13*/ - "trn2 v19.2d, v9.2d, v13.2d \n" /* trans q9, q11 30 31 32 33*/ - "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - - "trn1 v8.2d, v10.2d, v14.2d \n" /* trans q8, q10 40 41 42 43*/ - "trn2 v9.2d, v10.2d, v14.2d \n" /* trans q8, q10 60 61 62 63*/ - "trn1 v12.2d, v11.2d, v15.2d \n" /* trans q9, q11 50 51 52 53*/ - "trn2 v13.2d, v11.2d, v15.2d \n" /* trans q9, q11 70 71 72 73*/ - "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - - "fmax v16.4s, v16.4s, v20.4s \n" /*relu*/ - "fmax v17.4s, v17.4s, v20.4s \n" /*relu*/ - "fmax v18.4s, v18.4s, v20.4s \n" /*relu*/ - "fmax v19.4s, v19.4s, v20.4s \n" /*relu*/ - - "fmax v8.4s, v8.4s, v20.4s \n" /*relu*/ - "fmax v9.4s, v9.4s, v20.4s \n" /*relu*/ - "fmax v12.4s, v12.4s, v20.4s \n" /*relu*/ - "fmax v13.4s, v13.4s, v20.4s \n" /*relu*/ - - "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ - "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ - "str q19, [%[doutc3r0]], #16 \n" /* store c3r0*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "str q8, [%[doutc4r0]], #16 \n" /* store c0r0*/ - "str q9, [%[doutc6r0]], #16 \n" /* store c2r0*/ - "str q12, [%[doutc5r0]], #16 \n" /* store c1r0*/ - "str q13, [%[doutc7r0]], #16 \n" /* store c3r0*/ - - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - 
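The fall-through switch on `ce - channel` (used by every write_to_output_* variant here) redirects the channel rows that fall past `channel` to `trash_ptr`, so the vectorized stores never need a tail mask. The deliberate fall-through is equivalent to this loop (editor's sketch; `ptrs` is a hypothetical array of the doutc*_ptr row pointers for the current block):

    for (int ch = channel; ch < ce; ++ch) {
      ptrs[ch - cs] = trash_ptr;  // rows past `channel` become dummy writes
    }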
[doutc4r0] "+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20"); -#else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! @load data \n" - "vmov.u32 q15, #0 @ dump zero\n" - "1: @ main loop\n" - "vtrn.32 q0, q2 @ trans q0, q2 \n" - "vtrn.32 q4, q6 @ trans q4, q6 \n" - "vswp.32 d1, d8 @ swap d1, d8 \n" - "vswp.32 d5, d12 @ swap d5, d12\n" - - "vtrn.32 q1, q3 @ trans q1, q3 \n" - "vtrn.32 q5, q7 @ trans q5, q7 \n" - "vswp.32 d3, d10 @ swap d3, d10\n" - "vswp.32 d7, d14 @ swap d7, d14\n" - - "vmax.f32 q0, q0, q15 @ relu\n" - "vmax.f32 q1, q1, q15 @ relu\n" - "vmax.f32 q2, q2, q15 @ relu\n" - "vmax.f32 q3, q3, q15 @ relu\n" - - "vmax.f32 q4, q4, q15 @ relu\n" - "vmax.f32 q5, q5, q15 @ relu\n" - "vmax.f32 q6, q6, q15 @ relu\n" - "vmax.f32 q7, q7, q15 @ relu\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d2-d3}, [%[doutc4r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d4-d5}, [%[doutc1r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d6-d7}, [%[doutc5r0]]! @ store result, add " - "pointer\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - - "vst1.32 {d8-d9}, [%[doutc2r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d10-d11}, [%[doutc6r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d12-d13}, [%[doutc3r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d14-d15}, [%[doutc7r0]]! @ store result, add " - "pointer\n" - - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! 
@load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [doutc4r0] "+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3", "q4", "q15"); -#endif - } - if (we > width) { - int offset = 32 * (valid_w / 4 - 1); - din_hei_ptr = ptr_din + offset; - int i = we - 4; - for (; i < width; ++i) { - *(doutc0_ptr++) = LITEMAX(din_hei_ptr[0], 0.f); - *(doutc1_ptr++) = LITEMAX(din_hei_ptr[1], 0.f); - *(doutc2_ptr++) = LITEMAX(din_hei_ptr[2], 0.f); - *(doutc3_ptr++) = LITEMAX(din_hei_ptr[3], 0.f); - *(doutc4_ptr++) = LITEMAX(din_hei_ptr[4], 0.f); - *(doutc5_ptr++) = LITEMAX(din_hei_ptr[5], 0.f); - *(doutc6_ptr++) = LITEMAX(din_hei_ptr[6], 0.f); - *(doutc7_ptr++) = LITEMAX(din_hei_ptr[7], 0.f); - din_hei_ptr += 8; - } - } - } - } else { - for (int i = 0; i < size_h; i++) { - int size_w = i * width; - float* doutc0_ptr = doutc0r0 + size_w; // doutc0r0 + width; - float* doutc1_ptr = doutc1r0 + size_w; - float* doutc2_ptr = doutc2r0 + size_w; - float* doutc3_ptr = doutc3r0 + size_w; - float* doutc4_ptr = doutc4r0 + size_w; - float* doutc5_ptr = doutc5r0 + size_w; - float* doutc6_ptr = doutc6r0 + size_w; - float* doutc7_ptr = doutc7r0 + size_w; - if (ce > channel) { - switch (ce - channel) { - case 7: - doutc1_ptr = trash_ptr; - case 6: - doutc2_ptr = trash_ptr; - case 5: - doutc3_ptr = trash_ptr; - case 4: - doutc4_ptr = trash_ptr; - case 3: - doutc5_ptr = trash_ptr; - case 2: - doutc6_ptr = trash_ptr; - case 1: - doutc7_ptr = trash_ptr; - default: - break; - } - } - ptr_din = din + i * valid_w * ch_n; - const float* din_hei_ptr = ptr_din; - if (cnt > 0) { - int cnt_loop = cnt; -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "1: \n" /* main loop*/ - "trn1 v8.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ - "trn2 v9.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ - "trn1 v10.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ - "trn2 v11.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - - "trn1 v12.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ - "trn2 v13.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ - "trn1 v14.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ - "trn2 v15.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - - "trn1 v16.2d, v8.2d, v12.2d \n" /* trans q8, q10 00 01 02 03*/ - "trn2 v17.2d, v8.2d, v12.2d \n" /* trans q8, q10 20 21 22 23*/ - "trn1 v18.2d, v9.2d, v13.2d \n" /* trans q9, q11 10 11 12 13*/ - "trn2 v19.2d, v9.2d, v13.2d \n" /* trans q9, q11 30 31 32 33*/ - "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - - "trn1 v8.2d, v10.2d, v14.2d \n" /* trans q8, q10 40 41 42 43*/ - "trn2 v9.2d, v10.2d, v14.2d \n" /* trans q8, q10 60 61 62 63*/ - "trn1 v12.2d, v11.2d, v15.2d \n" /* trans q9, q11 50 51 52 53*/ - "trn2 v13.2d, v11.2d, v15.2d \n" /* trans q9, q11 70 71 72 73*/ - "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - - "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ - "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ - "str q19, 
[%[doutc3r0]], #16 \n" /* store c3r0*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "str q8, [%[doutc4r0]], #16 \n" /* store c0r0*/ - "str q9, [%[doutc6r0]], #16 \n" /* store c2r0*/ - "str q12, [%[doutc5r0]], #16 \n" /* store c1r0*/ - "str q13, [%[doutc7r0]], #16 \n" /* store c3r0*/ - - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [doutc4r0] "+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20"); -#else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! @load data \n" - "1: @ main loop\n" - "vtrn.32 q0, q2 @ trans q0, q2 \n" - "vtrn.32 q4, q6 @ trans q4, q6 \n" - "vswp.32 d1, d8 @ swap d1, d8 \n" - "vswp.32 d5, d12 @ swap d5, d12\n" - - "vtrn.32 q1, q3 @ trans q1, q3 \n" - "vtrn.32 q5, q7 @ trans q5, q7 \n" - "vswp.32 d3, d10 @ swap d3, d10\n" - "vswp.32 d7, d14 @ swap d7, d14\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d2-d3}, [%[doutc4r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d4-d5}, [%[doutc1r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d6-d7}, [%[doutc5r0]]! @ store result, add " - "pointer\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - - "vst1.32 {d8-d9}, [%[doutc2r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d10-d11}, [%[doutc6r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d12-d13}, [%[doutc3r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d14-d15}, [%[doutc7r0]]! @ store result, add " - "pointer\n" - - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! 
@load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [doutc4r0] "+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3", "q4"); -#endif - } - if (we > width) { - int offset = 32 * (valid_w / 4 - 1); - din_hei_ptr = ptr_din + offset; - int i = we - 4; - for (; i < width; ++i) { - *(doutc0_ptr++) = din_hei_ptr[0]; - *(doutc1_ptr++) = din_hei_ptr[1]; - *(doutc2_ptr++) = din_hei_ptr[2]; - *(doutc3_ptr++) = din_hei_ptr[3]; - *(doutc4_ptr++) = din_hei_ptr[4]; - *(doutc5_ptr++) = din_hei_ptr[5]; - *(doutc6_ptr++) = din_hei_ptr[6]; - *(doutc7_ptr++) = din_hei_ptr[7]; - din_hei_ptr += 8; - } - } - } - } - return true; -} - -/*wirte result in outputs -* input din: [n, c / 4, h, w * 4], output dout: [n, c, h, w] -*/ -inline bool write_to_output_c4_int32(const int* din, - int* dout, - int ch_n, - int hei_n, - int cs, - int ce, - int hs, - int he, - int ws, - int we, - int channel, - int height, - int width, - bool flag_relu, - int* trash_ptr) { - if (ch_n != 4 || hei_n <= 0) { - LOG(ERROR) << "ch_n must be equal 4 and hei_n is more than zero"; - return false; - } - int size_c_out = width * height; - - int* doutc0r0 = dout + cs * size_c_out + hs * width + ws; - int* doutc1r0 = doutc0r0 + size_c_out; - int* doutc2r0 = doutc1r0 + size_c_out; - int* doutc3r0 = doutc2r0 + size_c_out; - - const int* ptr_din = din; - - int size_h = (he > height ? height : he) - hs; // size_h == hei_n - - int valid_w = we - ws; - int cnt = valid_w / 4; - - if (we > width) { - cnt--; - } - if (flag_relu) { - for (int i = 0; i < size_h; i++) { - int size_w = i * width; - int* doutc0_ptr = doutc0r0 + size_w; // doutc0r0 + width; - int* doutc1_ptr = doutc1r0 + size_w; - int* doutc2_ptr = doutc2r0 + size_w; - int* doutc3_ptr = doutc3r0 + size_w; - if (ce > channel) { - switch (ce - channel) { - case 3: - doutc1_ptr = trash_ptr; - case 2: - doutc2_ptr = trash_ptr; - case 1: - doutc3_ptr = trash_ptr; - default: - break; - } - } - ptr_din = din + i * valid_w * ch_n; - const int* din_hei_ptr = ptr_din; - if (cnt > 0) { - int cnt_loop = cnt; -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "movi v20.4s, #0 \n" /* for relu */ - "1: \n" /* main loop*/ - "trn1 v8.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "trn2 v9.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "trn1 v10.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ - "trn2 v11.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "trn1 v16.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn2 v17.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn1 v18.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - "trn2 v19.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - "smax v16.4s, v16.4s, v20.4s \n" /* relu */ - "smax v17.4s, v17.4s, v20.4s \n" /* relu */ - "smax v18.4s, v18.4s, v20.4s \n" /* relu */ - "smax v19.4s, v19.4s, v20.4s \n" /* relu */ - "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ - "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ - "str q19, [%[doutc3r0]], #16 \n" /* store c3r0*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "bne 1b \n" /* jump to main 
loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20"); -#else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "vmov.u32 q15, #0 @ dump zero\n" - "1: @ main loop\n" - "vtrn.32 q0, q1 @ trans q0, q1 \n" - "vtrn.32 q2, q3 @ trans q2, q3 \n" - "vswp.32 d1, d4 @ swap d1, d4 \n" - "vswp.32 d3, d6 @ swap d3, d6 \n" - - "vmax.s32 q0, q0, q15 @ relu\n" - "vmax.s32 q1, q1, q15 @ relu\n" - "vmax.s32 q2, q2, q15 @ relu\n" - "vmax.s32 q3, q3, q15 @ relu\n" - - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add pointer\n" - "vst1.32 {d2-d3}, [%[doutc1r0]]! @ store result, add pointer\n" - "vst1.32 {d4-d5}, [%[doutc2r0]]! @ store result, add pointer\n" - "vst1.32 {d6-d7}, [%[doutc3r0]]! @ store result, add pointer\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3", "q4", "q15"); -#endif - } - if (we > width) { - int offset = 16 * (valid_w / 4 - 1); - din_hei_ptr = ptr_din + offset; - int i = we - 4; - for (; i < width; ++i) { - *(doutc0_ptr++) = LITEMAX(din_hei_ptr[0], 0); - *(doutc1_ptr++) = LITEMAX(din_hei_ptr[1], 0); - *(doutc2_ptr++) = LITEMAX(din_hei_ptr[2], 0); - *(doutc3_ptr++) = LITEMAX(din_hei_ptr[3], 0); - din_hei_ptr += 4; - } - } - } - } else { - for (int i = 0; i < size_h; i++) { - int size_w = i * width; - int* doutc0_ptr = doutc0r0 + size_w; // doutc0r0 + width; - int* doutc1_ptr = doutc1r0 + size_w; - int* doutc2_ptr = doutc2r0 + size_w; - int* doutc3_ptr = doutc3r0 + size_w; - if (ce > channel) { - switch (ce - channel) { - case 3: - doutc1_ptr = trash_ptr; - case 2: - doutc2_ptr = trash_ptr; - case 1: - doutc3_ptr = trash_ptr; - default: - break; - } - } - ptr_din = din + i * valid_w * ch_n; - const int* din_hei_ptr = ptr_din; - if (cnt > 0) { - int cnt_loop = cnt; -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "1: \n" /* main loop*/ - "trn1 v8.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "trn2 v9.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "trn1 v10.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ - "trn2 v11.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "trn1 v16.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn2 v17.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn1 v18.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - "trn2 v19.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - - "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ - "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ - "str q19, [%[doutc3r0]], #16 \n" /* store c3r0*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] 
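In the int32 write-back above, ReLU is applied with "smax"/"vmax.s32" directly on the accumulators, since max(x, 0) commutes with the later dequantization. As a one-line intrinsics sketch (editor's illustration):

    #include <arm_neon.h>

    static inline int32x4_t relu_s32(int32x4_t v) {
      return vmaxq_s32(v, vdupq_n_s32(0));  // per-lane max(v, 0)
    }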
"+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20"); -#else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "1: @ main loop\n" - "vtrn.32 q0, q1 @ trans q0, q1\n" - "vtrn.32 q2, q3 @ trans q2, q3\n" - "vswp.32 d1, d4 @ swap d1, d4 \n" - "vswp.32 d3, d6 @ swap d3, d6 \n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d2-d3}, [%[doutc1r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d4-d5}, [%[doutc2r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d6-d7}, [%[doutc3r0]]! @ store result, add " - "pointer\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3", "q4", "q15"); -#endif - } - if (we > width) { - int offset = 16 * (valid_w / 4 - 1); - din_hei_ptr = ptr_din + offset; - int i = we - 4; - for (; i < width; ++i) { - *(doutc0_ptr++) = din_hei_ptr[0]; - *(doutc1_ptr++) = din_hei_ptr[1]; - *(doutc2_ptr++) = din_hei_ptr[2]; - *(doutc3_ptr++) = din_hei_ptr[3]; - din_hei_ptr += 4; - } - } - } - } - return true; -} - -/*wirte result in outputs --int8, fp32 -* input din: [n, c / 4, h, w * 4], output dout: [n, c, h, w] -*/ -template -inline bool write_to_output_c4_int32_1(const int* din, - dtype* dout, - int ch_n, - int hei_n, - int cs, - int ce, - int hs, - int he, - int ws, - int we, - int channel, - int height, - int width, - bool flag_relu, - dtype* trash_ptr, - const float* scale, - PrecisionType out_dtype) { - if (ch_n != 4 || hei_n <= 0) { - LOG(ERROR) << "ch_n must be equal 4 and hei_n is more than zero"; - return false; - } - int size_c_out = width * height; - - dtype* doutc0r0 = dout + cs * size_c_out + hs * width + ws; - dtype* doutc1r0 = doutc0r0 + size_c_out; - dtype* doutc2r0 = doutc1r0 + size_c_out; - dtype* doutc3r0 = doutc2r0 + size_c_out; - - const int* ptr_din = din; - - int size_h = (he > height ? 
height : he) - hs; // size_h == hei_n - - int valid_w = we - ws; - int cnt = valid_w / 4; - - float32x4_t w_scale = vld1q_f32(scale); - // float32x4_t vzero = vdupq_n_f32(0.f); - - if (we > width) { - cnt--; - } - if (out_dtype == PRECISION(kFloat)) { - // int32_to_fp32 - if (flag_relu) { - for (int i = 0; i < size_h; i++) { - int size_w = i * width; - dtype* doutc0_ptr = doutc0r0 + size_w; // doutc0r0 + width; - dtype* doutc1_ptr = doutc1r0 + size_w; - dtype* doutc2_ptr = doutc2r0 + size_w; - dtype* doutc3_ptr = doutc3r0 + size_w; - if (ce > channel) { - switch (ce - channel) { - case 3: - doutc1_ptr = trash_ptr; - case 2: - doutc2_ptr = trash_ptr; - case 1: - doutc3_ptr = trash_ptr; - default: - break; - } - } - ptr_din = din + i * valid_w * ch_n; - const int* din_hei_ptr = ptr_din; - if (cnt > 0) { - int cnt_loop = cnt; -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "movi v20.4s, #0 \n" /* for relu */ - "1: \n" /* main loop*/ - "trn1 v8.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "trn2 v9.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "trn1 v10.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ - "trn2 v11.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "trn1 v16.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn2 v17.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn1 v18.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - "trn2 v19.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - "smax v16.4s, v16.4s, v20.4s \n" /* relu */ - "smax v17.4s, v17.4s, v20.4s \n" /* relu */ - "smax v18.4s, v18.4s, v20.4s \n" /* relu */ - "smax v19.4s, v19.4s, v20.4s \n" /* relu */ - // int32 --> fp32 - "scvtf v4.4s, v16.4s \n" - "scvtf v5.4s, v17.4s \n" - "scvtf v6.4s, v18.4s \n" - "scvtf v7.4s, v19.4s \n" - // mul - "fmul v16.4s, v4.4s, %[scale].s[0] \n" - "fmul v17.4s, v5.4s, %[scale].s[2] \n" - "fmul v18.4s, v6.4s, %[scale].s[1] \n" - "fmul v19.4s, v7.4s, %[scale].s[3] \n" - // res - "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ - "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ - "str q19, [%[doutc3r0]], #16 \n" /* store c3r0*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : [scale] "w"(w_scale) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20"); -#else - asm volatile( - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "vld1.32 {d8-d11}, [%[ptr_din]]! 
@load data \n" - "vmov.u32 q15, #0 @ dump zero\n" - "1: @ main loop\n" - "vtrn.32 q2, q3 @ trans q0, q1 \n" - "vtrn.32 q4, q5 @ trans q2, q3 \n" - "vswp.32 d5, d8 @ swap d1, d4 \n" - "vswp.32 d7, d10 @ swap d3, d6 \n" - - "vmax.s32 q2, q2, q15 @ relu\n" - "vmax.s32 q3, q3, q15 @ relu\n" - "vmax.s32 q4, q4, q15 @ relu\n" - "vmax.s32 q5, q5, q15 @ relu\n" - - // int32-> fp32 - "vcvt.f32.s32 q6, q2 \n" - "vcvt.f32.s32 q7, q3 \n" - "vcvt.f32.s32 q8, q4 \n" - "vcvt.f32.s32 q9, q5 \n" - - // mul - "vmul.f32 q2, q6, %e[scale][0] \n" - "vmul.f32 q3, q7, %e[scale][1] \n" - "vmul.f32 q4, q8, %f[scale][0] \n" - "vmul.f32 q5, q9, %f[scale][1] \n" - - "vst1.32 {d4-d5}, [%[doutc0r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d6-d7}, [%[doutc1r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d8-d9}, [%[doutc2r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d10-d11}, [%[doutc3r0]]! @ store result, add " - "pointer\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : [scale] "w"(w_scale) - : "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif - } - if (we > width) { - int offset = 16 * (valid_w / 4 - 1); - din_hei_ptr = ptr_din + offset; - int j = we - 4; - for (; j < width; ++j) { - *(doutc0_ptr++) = LITEMAX(din_hei_ptr[0] * scale[0], 0); - *(doutc1_ptr++) = LITEMAX(din_hei_ptr[1] * scale[1], 0); - *(doutc2_ptr++) = LITEMAX(din_hei_ptr[2] * scale[2], 0); - *(doutc3_ptr++) = LITEMAX(din_hei_ptr[3] * scale[3], 0); - din_hei_ptr += 4; - } - } - } - } else { - for (int i = 0; i < size_h; i++) { - int size_w = i * width; - dtype* doutc0_ptr = doutc0r0 + size_w; // doutc0r0 + width; - dtype* doutc1_ptr = doutc1r0 + size_w; - dtype* doutc2_ptr = doutc2r0 + size_w; - dtype* doutc3_ptr = doutc3r0 + size_w; - if (ce > channel) { - switch (ce - channel) { - case 3: - doutc1_ptr = trash_ptr; - case 2: - doutc2_ptr = trash_ptr; - case 1: - doutc3_ptr = trash_ptr; - default: - break; - } - } - ptr_din = din + i * valid_w * ch_n; - const int* din_hei_ptr = ptr_din; - if (cnt > 0) { - int cnt_loop = cnt; -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "movi v20.4s, #0 \n" /* for relu */ - "1: \n" /* main loop*/ - "trn1 v8.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "trn2 v9.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "trn1 v10.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ - "trn2 v11.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "trn1 v16.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn2 v17.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn1 v18.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - "trn2 v19.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - // int32 --> fp32 - "scvtf v4.4s, v16.4s \n" - "scvtf v5.4s, v17.4s \n" - "scvtf v6.4s, v18.4s \n" - "scvtf v7.4s, v19.4s \n" - // mul - "fmul v16.4s, v4.4s, %[scale].s[0] \n" - "fmul v17.4s, v5.4s, %[scale].s[2] \n" - "fmul v18.4s, v6.4s, %[scale].s[1] \n" - "fmul v19.4s, v7.4s, %[scale].s[3] \n" - // res - "str q16, [%[doutc0r0]], #16 \n" /* store 
c0r0*/ - "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ - "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ - "str q19, [%[doutc3r0]], #16 \n" /* store c3r0*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : [scale] "w"(w_scale) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20"); -#else - asm volatile( - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vmov.u32 q15, #0 @ dump zero\n" - "1: @ main loop\n" - "vtrn.32 q2, q3 @ trans q0, q1 \n" - "vtrn.32 q4, q5 @ trans q2, q3 \n" - "vswp.32 d5, d8 @ swap d1, d4 \n" - "vswp.32 d7, d10 @ swap d3, d6 \n" - - // int32-> fp32 - "vcvt.f32.s32 q6, q2 \n" - "vcvt.f32.s32 q7, q3 \n" - "vcvt.f32.s32 q8, q4 \n" - "vcvt.f32.s32 q9, q5 \n" - - // mul - "vmul.f32 q2, q6, %e[scale][0] \n" - "vmul.f32 q3, q7, %e[scale][1] \n" - "vmul.f32 q4, q8, %f[scale][0] \n" - "vmul.f32 q5, q9, %f[scale][1] \n" - - "vst1.32 {d4-d5}, [%[doutc0r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d6-d7}, [%[doutc1r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d8-d9}, [%[doutc2r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d10-d11}, [%[doutc3r0]]! @ store result, add " - "pointer\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : [scale] "w"(w_scale) - : "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif - } - if (we > width) { - int offset = 16 * (valid_w / 4 - 1); - din_hei_ptr = ptr_din + offset; - int j = we - 4; - for (; j < width; ++j) { - *(doutc0_ptr++) = din_hei_ptr[0] * scale[0]; - *(doutc1_ptr++) = din_hei_ptr[1] * scale[1]; - *(doutc2_ptr++) = din_hei_ptr[2] * scale[2]; - *(doutc3_ptr++) = din_hei_ptr[3] * scale[3]; - din_hei_ptr += 4; - } - } - } - } - - } else if (out_dtype == PRECISION(kInt8)) { - // int32_to_int8 - if (flag_relu) { - for (int i = 0; i < size_h; i++) { - int size_w = i * width; - dtype* doutc0_ptr = doutc0r0 + size_w; // doutc0r0 + width; - dtype* doutc1_ptr = doutc1r0 + size_w; - dtype* doutc2_ptr = doutc2r0 + size_w; - dtype* doutc3_ptr = doutc3r0 + size_w; - if (ce > channel) { - switch (ce - channel) { - case 3: - doutc1_ptr = trash_ptr; - case 2: - doutc2_ptr = trash_ptr; - case 1: - doutc3_ptr = trash_ptr; - default: - break; - } - } - ptr_din = din + i * valid_w * ch_n; - const int* din_hei_ptr = ptr_din; - if (cnt > 0) { - int cnt_loop = cnt; -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "movi v20.4s, #0 \n" /* for relu */ - "1: \n" /* main loop*/ - "trn1 v8.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "trn2 v9.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "trn1 v10.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ - "trn2 v11.4s, v2.4s, v3.4s \n" /* trans q2, 
q3*/ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "trn1 v16.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn2 v17.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn1 v18.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - "trn2 v19.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - "smax v16.4s, v16.4s, v20.4s \n" /* relu */ - "smax v17.4s, v17.4s, v20.4s \n" /* relu */ - "smax v18.4s, v18.4s, v20.4s \n" /* relu */ - "smax v19.4s, v19.4s, v20.4s \n" /* relu */ - // int32 --> fp32 - "scvtf v4.4s, v16.4s \n" - "scvtf v5.4s, v17.4s \n" - "scvtf v6.4s, v18.4s \n" - "scvtf v7.4s, v19.4s \n" - - // mul - "fmul v16.4s, v4.4s, %[scale].s[0] \n" - "fmul v17.4s, v5.4s, %[scale].s[2] \n" - "fmul v18.4s, v6.4s, %[scale].s[1] \n" - "fmul v19.4s, v7.4s, %[scale].s[3] \n" - - // fp32-int32 - "fcvtas v4.4s, v16.4s \n" - "fcvtas v5.4s, v17.4s \n" - "fcvtas v6.4s, v18.4s \n" - "fcvtas v7.4s, v19.4s \n" - - // int32-int16 - "sqxtn v8.4h, v4.4s \n" - "sqxtn v9.4h, v5.4s \n" - "sqxtn v10.4h, v6.4s \n" - "sqxtn v11.4h, v7.4s \n" - - "sqxtn v16.8b, v8.8h \n" - "sqxtn v17.8b, v9.8h \n" - "sqxtn v18.8b, v10.8h \n" - "sqxtn v19.8b, v11.8h \n" - // res - "str s16, [%[doutc0r0]], #4 \n" - "str s17, [%[doutc2r0]], #4 \n" - "str s18, [%[doutc1r0]], #4 \n" - "str s19, [%[doutc3r0]], #4 \n" - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : [scale] "w"(w_scale) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20"); -#else - asm volatile( - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "vld1.32 {d8-d11}, [%[ptr_din]]! 
@load data \n" - "vmov.u32 q15, #0 @ dump zero\n" - "1: @ main loop\n" - "vtrn.32 q2, q3 @ trans q0, q1 \n" - "vtrn.32 q4, q5 @ trans q2, q3 \n" - "vswp.32 d5, d8 @ swap d1, d4 \n" - "vswp.32 d7, d10 @ swap d3, d6 \n" - - "vmax.s32 q2, q2, q15 @ relu\n" - "vmax.s32 q3, q3, q15 @ relu\n" - "vmax.s32 q4, q4, q15 @ relu\n" - "vmax.s32 q5, q5, q15 @ relu\n" - - // int32-> fp32 - "vcvt.f32.s32 q6, q2 \n" - "vcvt.f32.s32 q7, q3 \n" - "vcvt.f32.s32 q8, q4 \n" - "vcvt.f32.s32 q9, q5 \n" - - "vmov.f32 q2, #0.5 \n" - - // "vand.i32 q0, %q[vpoff], %q[vpoff] @ set offset, 0.5\n" - "vand.i32 q3, q2, q2 @ set offset, 0.5\n" - "vand.i32 q4, q2, q2 @ set offset, 0.5\n" - "vand.i32 q5, q2, q2 @ set offset, 0.5\n" - - "vcgt.f32 q10, q6, q15 @ get mask > 0, in0\n" - "vcgt.f32 q11, q7, q15 @ get mask > 0, in1\n" - "vcgt.f32 q12, q8, q15 @ get mask > 0, in2\n" - "vcgt.f32 q13, q9, q15 @ get mask > 0, in3\n" - - "vmov.f32 q15, #-0.5 \n" - - "vbif.f32 q2, q15, q10 @ get right offset\n" - "vbif.f32 q3, q15, q11 @ get right offset\n" - "vbif.f32 q4, q15, q12 @ get right offset\n" - "vbif.f32 q5, q15, q13 @ get right offset\n" - - "vmla.f32 q2, q6, %e[scale][0] @ mul scale\n" - "vmla.f32 q3, q7, %e[scale][1] @ mul scale\n" - "vmla.f32 q4, q8, %f[scale][0] @ mul scale\n" - "vmla.f32 q5, q9, %f[scale][1] @ mul scale\n" - - "vcvt.s32.f32 q6, q2 @ cvt to int32\n" - "vcvt.s32.f32 q7, q3 @ cvt to int32\n" - "vcvt.s32.f32 q8, q4 @ cvt to int32\n" - "vcvt.s32.f32 q9, q5 @ cvt to int32\n" - - "vqmovn.s32 d20, q6 @ cnt to int16\n" - "vqmovn.s32 d22, q7 @ cnt to int16\n" - "vqmovn.s32 d24, q8 @ cnt to int16\n" - "vqmovn.s32 d26, q9 @ cnt to int16\n" - - "vqmovn.s16 d8, q10 @ cnt to int8\n" - "vqmovn.s16 d9, q11 @ cnt to int8\n" - "vqmovn.s16 d10, q12 @ cnt to int8\n" - "vqmovn.s16 d11, q13 @ cnt to int8\n" - - "vst1.32 {d8[0]}, [%[doutc0r0]] @ write to output\n" - "vst1.32 {d9[0]}, [%[doutc1r0]] @ write to output\n" - "vst1.32 {d10[0]}, [%[doutc2r0]] @ write to output\n" - "vst1.32 {d11[0]}, [%[doutc3r0]] @ write to output\n" - - "add %[doutc0r0], #4 \n" - "add %[doutc1r0], #4 \n" - "add %[doutc2r0], #4 \n" - "add %[doutc3r0], #4 \n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - "vmov.u32 q15, #0 @ dump zero\n" - - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "vld1.32 {d8-d11}, [%[ptr_din]]! 
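// AArch64 rounds with a single fcvtas (to nearest, ties away from zero);
// ARMv7 NEON has no such convert, so the block above emulates it: select
// +0.5 or -0.5 by sign (vcgt + vbif), fold in the scale with vmla, then
// truncate (vcvt.s32.f32). A scalar sketch of that rounding, assuming a
// positive scale and the usual int8 saturation bounds:
static inline signed char quantize_round_away_ref(float acc, float scale) {
  float v = acc * scale + (acc > 0.f ? 0.5f : -0.5f);  // signed half offset
  int q = static_cast<int>(v);  // truncation now rounds half away from zero
  if (q > 127) q = 127;         // saturate like sqxtn
  if (q < -128) q = -128;
  return static_cast<signed char>(q);
}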
@load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : [scale] "w"(w_scale) - : "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif - } - if (we > width) { - int offset = 16 * (valid_w / 4 - 1); - din_hei_ptr = ptr_din + offset; - int j = we - 4; - for (; j < width; ++j) { - *(doutc0_ptr++) = saturate_cast( - roundf(LITEMAX(din_hei_ptr[0], 0) * scale[0])); - *(doutc1_ptr++) = saturate_cast( - roundf(LITEMAX(din_hei_ptr[1], 0) * scale[1])); - *(doutc2_ptr++) = saturate_cast( - roundf(LITEMAX(din_hei_ptr[2], 0) * scale[2])); - *(doutc3_ptr++) = saturate_cast( - roundf(LITEMAX(din_hei_ptr[3], 0) * scale[3])); - din_hei_ptr += 4; - } - } - } - } else { - for (int i = 0; i < size_h; i++) { // size_h - int size_w = i * width; - dtype* doutc0_ptr = doutc0r0 + size_w; // doutc0r0 + width; - dtype* doutc1_ptr = doutc1r0 + size_w; - dtype* doutc2_ptr = doutc2r0 + size_w; - dtype* doutc3_ptr = doutc3r0 + size_w; - if (ce > channel) { - switch (ce - channel) { - case 3: - doutc1_ptr = trash_ptr; - case 2: - doutc2_ptr = trash_ptr; - case 1: - doutc3_ptr = trash_ptr; - default: - break; - } - } - ptr_din = din + i * valid_w * ch_n; - const int* din_hei_ptr = ptr_din; - if (cnt > 0) { - int cnt_loop = cnt; -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "movi v20.4s, #0 \n" /* for relu */ - "1: \n" /* main loop*/ - "trn1 v8.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "trn2 v9.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "trn1 v10.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ - "trn2 v11.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "trn1 v16.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn2 v17.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn1 v18.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - "trn2 v19.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - // int32 --> fp32 - "scvtf v4.4s, v16.4s \n" - "scvtf v5.4s, v17.4s \n" - "scvtf v6.4s, v18.4s \n" - "scvtf v7.4s, v19.4s \n" - - // mul - "fmul v16.4s, v4.4s, %[scale].s[0] \n" - "fmul v17.4s, v5.4s, %[scale].s[2] \n" - "fmul v18.4s, v6.4s, %[scale].s[1] \n" - "fmul v19.4s, v7.4s, %[scale].s[3] \n" - - // fp32-int32 - "fcvtas v4.4s, v16.4s \n" - "fcvtas v5.4s, v17.4s \n" - "fcvtas v6.4s, v18.4s \n" - "fcvtas v7.4s, v19.4s \n" - - // int32-int16 - "sqxtn v8.4h, v4.4s \n" - "sqxtn v9.4h, v5.4s \n" - "sqxtn v10.4h, v6.4s \n" - "sqxtn v11.4h, v7.4s \n" - - "sqxtn v16.8b, v8.8h \n" - "sqxtn v17.8b, v9.8h \n" - "sqxtn v18.8b, v10.8h \n" - "sqxtn v19.8b, v11.8h \n" - // res - "str s16, [%[doutc0r0]], #4 \n" - "str s17, [%[doutc2r0]], #4 \n" - "str s18, [%[doutc1r0]], #4 \n" - "str s19, [%[doutc3r0]], #4 \n" - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : [scale] "w"(w_scale) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20"); -#else - asm 
volatile( - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vmov.u32 q15, #0 @ dump zero\n" - "1: @ main loop\n" - "vtrn.32 q2, q3 @ trans q0, q1 \n" - "vtrn.32 q4, q5 @ trans q2, q3 \n" - "vswp.32 d5, d8 @ swap d1, d4 \n" - "vswp.32 d7, d10 @ swap d3, d6 \n" - - // int32-> fp32 - "vcvt.f32.s32 q6, q2 \n" - "vcvt.f32.s32 q7, q3 \n" - "vcvt.f32.s32 q8, q4 \n" - "vcvt.f32.s32 q9, q5 \n" - - "vmov.f32 q2, #0.5 \n" - - // "vand.i32 q0, %q[vpoff], %q[vpoff] @ set offset, 0.5\n" - "vand.i32 q3, q2, q2 @ set offset, 0.5\n" - "vand.i32 q4, q2, q2 @ set offset, 0.5\n" - "vand.i32 q5, q2, q2 @ set offset, 0.5\n" - - "vcgt.f32 q10, q6, q15 @ get mask > 0, in0\n" - "vcgt.f32 q11, q7, q15 @ get mask > 0, in1\n" - "vcgt.f32 q12, q8, q15 @ get mask > 0, in2\n" - "vcgt.f32 q13, q9, q15 @ get mask > 0, in3\n" - - "vmov.f32 q15, #-0.5 \n" - - "vbif.f32 q2, q15, q10 @ get right offset\n" - "vbif.f32 q3, q15, q11 @ get right offset\n" - "vbif.f32 q4, q15, q12 @ get right offset\n" - "vbif.f32 q5, q15, q13 @ get right offset\n" - - "vmla.f32 q2, q6, %e[scale][0] @ mul scale\n" - "vmla.f32 q3, q7, %e[scale][1] @ mul scale\n" - "vmla.f32 q4, q8, %f[scale][0] @ mul scale\n" - "vmla.f32 q5, q9, %f[scale][1] @ mul scale\n" - - "vcvt.s32.f32 q6, q2 @ cvt to int32\n" - "vcvt.s32.f32 q7, q3 @ cvt to int32\n" - "vcvt.s32.f32 q8, q4 @ cvt to int32\n" - "vcvt.s32.f32 q9, q5 @ cvt to int32\n" - - "vqmovn.s32 d20, q6 @ cnt to int16\n" - "vqmovn.s32 d22, q7 @ cnt to int16\n" - "vqmovn.s32 d24, q8 @ cnt to int16\n" - "vqmovn.s32 d26, q9 @ cnt to int16\n" - - "vqmovn.s16 d8, q10 @ cnt to int8\n" - "vqmovn.s16 d9, q11 @ cnt to int8\n" - "vqmovn.s16 d10, q12 @ cnt to int8\n" - "vqmovn.s16 d11, q13 @ cnt to int8\n" - - "vst1.32 {d8[0]}, [%[doutc0r0]] @ write to output\n" - "vst1.32 {d9[0]}, [%[doutc1r0]] @ write to output\n" - "vst1.32 {d10[0]}, [%[doutc2r0]] @ write to output\n" - "vst1.32 {d11[0]}, [%[doutc3r0]] @ write to output\n" - - "add %[doutc0r0], #4 \n" - "add %[doutc1r0], #4 \n" - "add %[doutc2r0], #4 \n" - "add %[doutc3r0], #4 \n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "vld1.32 {d8-d11}, [%[ptr_din]]! 
@load data \n" - "vmov.u32 q15, #0 @ dump zero\n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : [scale] "w"(w_scale) - : "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif - } - if (we > width) { - int offset = 16 * (valid_w / 4 - 1); - din_hei_ptr = ptr_din + offset; - int j = we - 4; - for (; j < width; ++j) { - *(doutc0_ptr++) = - saturate_cast(roundf(din_hei_ptr[0] * scale[0])); - *(doutc1_ptr++) = - saturate_cast(roundf(din_hei_ptr[1] * scale[1])); - *(doutc2_ptr++) = - saturate_cast(roundf(din_hei_ptr[2] * scale[2])); - *(doutc3_ptr++) = - saturate_cast(roundf(din_hei_ptr[3] * scale[3])); - din_hei_ptr += 4; - } - } - } - } - } else { - LOG(ERROR) << "ERROR: unsupported input data type!!"; - return false; - } - return true; -} - -/*wirte result in outputs -* input din: [n, c / 8, h, w * 8], output dout: [n, c, h, w] -*/ -inline bool write_to_output_c8_int32(const int* din, - int* dout, - int ch_n, - int hei_n, - int cs, - int ce, - int hs, - int he, - int ws, - int we, - int channel, - int height, - int width, - bool flag_relu, - int* trash_ptr) { - if (ch_n != 8 || hei_n <= 0) { - LOG(ERROR) << "ch_n must be equal 8 and hei_n is more than zero"; - return false; - } - int size_c_out = width * height; - - int* doutc0r0 = dout + cs * size_c_out + hs * width + ws; - int* doutc1r0 = doutc0r0 + size_c_out; - int* doutc2r0 = doutc1r0 + size_c_out; - int* doutc3r0 = doutc2r0 + size_c_out; - int* doutc4r0 = doutc3r0 + size_c_out; - int* doutc5r0 = doutc4r0 + size_c_out; - int* doutc6r0 = doutc5r0 + size_c_out; - int* doutc7r0 = doutc6r0 + size_c_out; - - const int* ptr_din = din; - - int size_h = (he > height ? 
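// The switch statements in these writers intentionally fall through: when
// the last channel group overruns the real channel count (ce > channel),
// every surplus output row is pointed at a scratch buffer so the vector
// stores stay unconditional. The same idea with an explicit loop
// (illustrative only, not the shipped code):
static inline void redirect_surplus_channels(float* dout_ptr[],
                                             float* trash_ptr,
                                             int ch_n,
                                             int surplus /* ce - channel */) {
  // e.g. ch_n == 4, surplus == 3 -> rows 1..3 become dummies;
  //      surplus == 1 -> only row 3 does.
  for (int c = ch_n - surplus; c < ch_n; ++c) {
    dout_ptr[c] = trash_ptr;  // results are written, then simply ignored
  }
}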
height : he) - hs; // size_h == hei_n - - int valid_w = we - ws; - int cnt = valid_w / 4; - - if (we > width) { - cnt--; - } - if (flag_relu) { - for (int i = 0; i < size_h; i++) { - int size_w = i * width; - int* doutc0_ptr = doutc0r0 + size_w; // doutc0r0 + width; - int* doutc1_ptr = doutc1r0 + size_w; - int* doutc2_ptr = doutc2r0 + size_w; - int* doutc3_ptr = doutc3r0 + size_w; - int* doutc4_ptr = doutc4r0 + size_w; - int* doutc5_ptr = doutc5r0 + size_w; - int* doutc6_ptr = doutc6r0 + size_w; - int* doutc7_ptr = doutc7r0 + size_w; - if (ce > channel) { - switch (ce - channel) { - case 7: - doutc1_ptr = trash_ptr; - case 6: - doutc2_ptr = trash_ptr; - case 5: - doutc3_ptr = trash_ptr; - case 4: - doutc4_ptr = trash_ptr; - case 3: - doutc5_ptr = trash_ptr; - case 2: - doutc6_ptr = trash_ptr; - case 1: - doutc7_ptr = trash_ptr; - default: - break; - } - } - ptr_din = din + i * valid_w * ch_n; - const int* din_hei_ptr = ptr_din; - if (cnt > 0) { - int cnt_loop = cnt; -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "movi v20.4s, #0 \n" /* for relu */ - "1: \n" /* main loop*/ - "trn1 v8.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ - "trn2 v9.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ - "trn1 v10.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ - "trn2 v11.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - - "trn1 v12.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ - "trn2 v13.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ - "trn1 v14.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ - "trn2 v15.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - - "trn1 v16.2d, v8.2d, v12.2d \n" /* trans q8, q10 00 01 02 03*/ - "trn2 v17.2d, v8.2d, v12.2d \n" /* trans q8, q10 20 21 22 23*/ - "trn1 v18.2d, v9.2d, v13.2d \n" /* trans q9, q11 10 11 12 13*/ - "trn2 v19.2d, v9.2d, v13.2d \n" /* trans q9, q11 30 31 32 33*/ - "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - - "trn1 v8.2d, v10.2d, v14.2d \n" /* trans q8, q10 40 41 42 43*/ - "trn2 v9.2d, v10.2d, v14.2d \n" /* trans q8, q10 60 61 62 63*/ - "trn1 v12.2d, v11.2d, v15.2d \n" /* trans q9, q11 50 51 52 53*/ - "trn2 v13.2d, v11.2d, v15.2d \n" /* trans q9, q11 70 71 72 73*/ - "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - - "smax v16.4s, v16.4s, v20.4s \n" /*relu*/ - "smax v17.4s, v17.4s, v20.4s \n" /*relu*/ - "smax v18.4s, v18.4s, v20.4s \n" /*relu*/ - "smax v19.4s, v19.4s, v20.4s \n" /*relu*/ - - "smax v8.4s, v8.4s, v20.4s \n" /*relu*/ - "smax v9.4s, v9.4s, v20.4s \n" /*relu*/ - "smax v12.4s, v12.4s, v20.4s \n" /*relu*/ - "smax v13.4s, v13.4s, v20.4s \n" /*relu*/ - - "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ - "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ - "str q19, [%[doutc3r0]], #16 \n" /* store c3r0*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "str q8, [%[doutc4r0]], #16 \n" /* store c0r0*/ - "str q9, [%[doutc6r0]], #16 \n" /* store c2r0*/ - "str q12, [%[doutc5r0]], #16 \n" /* store c1r0*/ - "str q13, [%[doutc7r0]], #16 \n" /* store c3r0*/ - - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [doutc4r0] 
"+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20"); -#else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! @load data \n" - "vmov.s32 q15, #0 @ dump zero\n" - "1: @ main loop\n" - "vtrn.32 q0, q2 @ trans q0, q2 \n" - "vtrn.32 q4, q6 @ trans q4, q6 \n" - "vswp.32 d1, d8 @ swap d1, d8 \n" - "vswp.32 d5, d12 @ swap d5, d12\n" - - "vtrn.32 q1, q3 @ trans q1, q3 \n" - "vtrn.32 q5, q7 @ trans q5, q7 \n" - "vswp.32 d3, d10 @ swap d3, d10\n" - "vswp.32 d7, d14 @ swap d7, d14\n" - - "vmax.s32 q0, q0, q15 @ relu\n" - "vmax.s32 q1, q1, q15 @ relu\n" - "vmax.s32 q2, q2, q15 @ relu\n" - "vmax.s32 q3, q3, q15 @ relu\n" - - "vmax.s32 q4, q4, q15 @ relu\n" - "vmax.s32 q5, q5, q15 @ relu\n" - "vmax.s32 q6, q6, q15 @ relu\n" - "vmax.s32 q7, q7, q15 @ relu\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add pointer\n" - "vst1.32 {d2-d3}, [%[doutc4r0]]! @ store result, add pointer\n" - "vst1.32 {d4-d5}, [%[doutc1r0]]! @ store result, add pointer\n" - "vst1.32 {d6-d7}, [%[doutc5r0]]! @ store result, add pointer\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - - "vst1.32 {d8-d9}, [%[doutc2r0]]! @ store result, add pointer\n" - "vst1.32 {d10-d11}, [%[doutc6r0]]! @ store result, add pointer\n" - "vst1.32 {d12-d13}, [%[doutc3r0]]! @ store result, add pointer\n" - "vst1.32 {d14-d15}, [%[doutc7r0]]! @ store result, add pointer\n" - - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! 
@load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [doutc4r0] "+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3", "q4", "q15"); -#endif - } - if (we > width) { - int offset = 32 * (valid_w / 4 - 1); - din_hei_ptr = ptr_din + offset; - int i = we - 4; - for (; i < width; ++i) { - *(doutc0_ptr++) = LITEMAX(din_hei_ptr[0], 0); - *(doutc1_ptr++) = LITEMAX(din_hei_ptr[1], 0); - *(doutc2_ptr++) = LITEMAX(din_hei_ptr[2], 0); - *(doutc3_ptr++) = LITEMAX(din_hei_ptr[3], 0); - *(doutc4_ptr++) = LITEMAX(din_hei_ptr[4], 0); - *(doutc5_ptr++) = LITEMAX(din_hei_ptr[5], 0); - *(doutc6_ptr++) = LITEMAX(din_hei_ptr[6], 0); - *(doutc7_ptr++) = LITEMAX(din_hei_ptr[7], 0); - din_hei_ptr += 8; - } - } - } - } else { - for (int i = 0; i < size_h; i++) { - int size_w = i * width; - int* doutc0_ptr = doutc0r0 + size_w; // doutc0r0 + width; - int* doutc1_ptr = doutc1r0 + size_w; - int* doutc2_ptr = doutc2r0 + size_w; - int* doutc3_ptr = doutc3r0 + size_w; - int* doutc4_ptr = doutc4r0 + size_w; - int* doutc5_ptr = doutc5r0 + size_w; - int* doutc6_ptr = doutc6r0 + size_w; - int* doutc7_ptr = doutc7r0 + size_w; - if (ce > channel) { - switch (ce - channel) { - case 7: - doutc1_ptr = trash_ptr; - case 6: - doutc2_ptr = trash_ptr; - case 5: - doutc3_ptr = trash_ptr; - case 4: - doutc4_ptr = trash_ptr; - case 3: - doutc5_ptr = trash_ptr; - case 2: - doutc6_ptr = trash_ptr; - case 1: - doutc7_ptr = trash_ptr; - default: - break; - } - } - ptr_din = din + i * valid_w * ch_n; - const int* din_hei_ptr = ptr_din; - if (cnt > 0) { - int cnt_loop = cnt; -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "1: \n" /* main loop*/ - "trn1 v8.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ - "trn2 v9.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ - "trn1 v10.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ - "trn2 v11.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - - "trn1 v12.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ - "trn2 v13.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ - "trn1 v14.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ - "trn2 v15.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - - "trn1 v16.2d, v8.2d, v12.2d \n" /* trans q8, q10 00 01 02 03*/ - "trn2 v17.2d, v8.2d, v12.2d \n" /* trans q8, q10 20 21 22 23*/ - "trn1 v18.2d, v9.2d, v13.2d \n" /* trans q9, q11 10 11 12 13*/ - "trn2 v19.2d, v9.2d, v13.2d \n" /* trans q9, q11 30 31 32 33*/ - "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - - "trn1 v8.2d, v10.2d, v14.2d \n" /* trans q8, q10 40 41 42 43*/ - "trn2 v9.2d, v10.2d, v14.2d \n" /* trans q8, q10 60 61 62 63*/ - "trn1 v12.2d, v11.2d, v15.2d \n" /* trans q9, q11 50 51 52 53*/ - "trn2 v13.2d, v11.2d, v15.2d \n" /* trans q9, q11 70 71 72 73*/ - "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - - "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ - "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ - "str q19, [%[doutc3r0]], #16 \n" /* store c3r0*/ - 
- "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "str q8, [%[doutc4r0]], #16 \n" /* store c0r0*/ - "str q9, [%[doutc6r0]], #16 \n" /* store c2r0*/ - "str q12, [%[doutc5r0]], #16 \n" /* store c1r0*/ - "str q13, [%[doutc7r0]], #16 \n" /* store c3r0*/ - - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [doutc4r0] "+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20"); -#else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! @load data \n" - "1: @ main loop\n" - "vtrn.32 q0, q2 @ trans q0, q2 \n" - "vtrn.32 q4, q6 @ trans q4, q6 \n" - "vswp.32 d1, d8 @ swap d1, d8 \n" - "vswp.32 d5, d12 @ swap d5, d12\n" - - "vtrn.32 q1, q3 @ trans q1, q3 \n" - "vtrn.32 q5, q7 @ trans q5, q7 \n" - "vswp.32 d3, d10 @ swap d3, d10\n" - "vswp.32 d7, d14 @ swap d7, d14\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add pointer\n" - "vst1.32 {d2-d3}, [%[doutc4r0]]! @ store result, add pointer\n" - "vst1.32 {d4-d5}, [%[doutc1r0]]! @ store result, add pointer\n" - "vst1.32 {d6-d7}, [%[doutc5r0]]! @ store result, add pointer\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - - "vst1.32 {d8-d9}, [%[doutc2r0]]! @ store result, add pointer\n" - "vst1.32 {d10-d11}, [%[doutc6r0]]! @ store result, add pointer\n" - "vst1.32 {d12-d13}, [%[doutc3r0]]! @ store result, add pointer\n" - "vst1.32 {d14-d15}, [%[doutc7r0]]! @ store result, add pointer\n" - - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! 
@load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [doutc4r0] "+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3", "q4", "q15"); -#endif - } - if (we > width) { - int offset = 32 * (valid_w / 4 - 1); - din_hei_ptr = ptr_din + offset; - int i = we - 4; - for (; i < width; ++i) { - *(doutc0_ptr++) = din_hei_ptr[0]; - *(doutc1_ptr++) = din_hei_ptr[1]; - *(doutc2_ptr++) = din_hei_ptr[2]; - *(doutc3_ptr++) = din_hei_ptr[3]; - *(doutc4_ptr++) = din_hei_ptr[4]; - *(doutc5_ptr++) = din_hei_ptr[5]; - *(doutc6_ptr++) = din_hei_ptr[6]; - *(doutc7_ptr++) = din_hei_ptr[7]; - din_hei_ptr += 8; - } - } - } - } - return true; -} - -/*wirte result in outputs--int8, fp32 -* input din: [n, c / 8, h, w * 8], output dout: [n, c, h, w] -*/ -template -static bool write_to_output_c8_int32_1(const int* din, - dtype* dout, - int ch_n, - int hei_n, - int cs, - int ce, - int hs, - int he, - int ws, - int we, - int channel, - int height, - int width, - bool flag_relu, - dtype* trash_ptr, - const float* scale, - PrecisionType out_dtype) { - if (ch_n != 8 || hei_n <= 0) { - LOG(ERROR) << "ch_n must be equal 8 and hei_n is more than zero"; - return false; - } - int size_c_out = width * height; - - dtype* doutc0r0 = dout + cs * size_c_out + hs * width + ws; - dtype* doutc1r0 = doutc0r0 + size_c_out; - dtype* doutc2r0 = doutc1r0 + size_c_out; - dtype* doutc3r0 = doutc2r0 + size_c_out; - dtype* doutc4r0 = doutc3r0 + size_c_out; - dtype* doutc5r0 = doutc4r0 + size_c_out; - dtype* doutc6r0 = doutc5r0 + size_c_out; - dtype* doutc7r0 = doutc6r0 + size_c_out; - - const int* ptr_din = din; - - int size_h = (he > height ? 
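// The trn1/trn2 (AArch64) and vtrn/vswp (ARMv7) pairs in these kernels
// implement a 4x4 transpose: four registers holding {c0,c1,c2,c3} per
// pixel become four registers each holding one channel row. A sketch of
// the same shuffle with NEON intrinsics (assumes <arm_neon.h>):
#include <arm_neon.h>
static inline void transpose4x4_s32_ref(int32x4_t v[4]) {
  int32x4x2_t t01 = vtrnq_s32(v[0], v[1]);  // pairwise 32-bit interleave
  int32x4x2_t t23 = vtrnq_s32(v[2], v[3]);
  v[0] = vcombine_s32(vget_low_s32(t01.val[0]), vget_low_s32(t23.val[0]));
  v[1] = vcombine_s32(vget_low_s32(t01.val[1]), vget_low_s32(t23.val[1]));
  v[2] = vcombine_s32(vget_high_s32(t01.val[0]), vget_high_s32(t23.val[0]));
  v[3] = vcombine_s32(vget_high_s32(t01.val[1]), vget_high_s32(t23.val[1]));
}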
height : he) - hs; // size_h == hei_n - - int valid_w = we - ws; - int cnt = valid_w / 4; - - float32x4_t w_scale0 = vld1q_f32(scale); - float32x4_t w_scale1 = vld1q_f32(scale + 4); - - float32x4_t vzero = vdupq_n_f32(0.f); - - if (we > width) { - cnt--; - } - if (out_dtype == PRECISION(kFloat)) { - if (flag_relu) { - for (int i = 0; i < size_h; i++) { - int size_w = i * width; - dtype* doutc0_ptr = doutc0r0 + size_w; // doutc0r0 + width; - dtype* doutc1_ptr = doutc1r0 + size_w; - dtype* doutc2_ptr = doutc2r0 + size_w; - dtype* doutc3_ptr = doutc3r0 + size_w; - dtype* doutc4_ptr = doutc4r0 + size_w; - dtype* doutc5_ptr = doutc5r0 + size_w; - dtype* doutc6_ptr = doutc6r0 + size_w; - dtype* doutc7_ptr = doutc7r0 + size_w; - if (ce > channel) { - switch (ce - channel) { - case 7: - doutc1_ptr = trash_ptr; - case 6: - doutc2_ptr = trash_ptr; - case 5: - doutc3_ptr = trash_ptr; - case 4: - doutc4_ptr = trash_ptr; - case 3: - doutc5_ptr = trash_ptr; - case 2: - doutc6_ptr = trash_ptr; - case 1: - doutc7_ptr = trash_ptr; - default: - break; - } - } - ptr_din = din + i * valid_w * ch_n; - const int* din_hei_ptr = ptr_din; - if (cnt > 0) { - int cnt_loop = cnt; -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "movi v20.4s, #0 \n" /* for relu */ - "1: \n" /* main loop*/ - "trn1 v8.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ - "trn2 v9.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ - "trn1 v10.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ - "trn2 v11.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - - "trn1 v12.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ - "trn2 v13.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ - "trn1 v14.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ - "trn2 v15.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - - "trn1 v16.2d, v8.2d, v12.2d \n" /* trans q8, q10 00 01 02 03*/ - "trn2 v17.2d, v8.2d, v12.2d \n" /* trans q8, q10 20 21 22 23*/ - "trn1 v18.2d, v9.2d, v13.2d \n" /* trans q9, q11 10 11 12 13*/ - "trn2 v19.2d, v9.2d, v13.2d \n" /* trans q9, q11 30 31 32 33*/ - "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - - "trn1 v8.2d, v10.2d, v14.2d \n" /* trans q8, q10 40 41 42 43*/ - "trn2 v9.2d, v10.2d, v14.2d \n" /* trans q8, q10 60 61 62 63*/ - "trn1 v12.2d, v11.2d, v15.2d \n" /* trans q9, q11 50 51 52 53*/ - "trn2 v13.2d, v11.2d, v15.2d \n" /* trans q9, q11 70 71 72 73*/ - "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - - "smax v16.4s, v16.4s, v20.4s \n" /*relu*/ - "smax v17.4s, v17.4s, v20.4s \n" /*relu*/ - "smax v18.4s, v18.4s, v20.4s \n" /*relu*/ - "smax v19.4s, v19.4s, v20.4s \n" /*relu*/ - - "smax v8.4s, v8.4s, v20.4s \n" /*relu*/ - "smax v9.4s, v9.4s, v20.4s \n" /*relu*/ - "smax v12.4s, v12.4s, v20.4s \n" /*relu*/ - "smax v13.4s, v13.4s, v20.4s \n" /*relu*/ - - // int32->fp32 - "scvtf v10.4s, v16.4s \n" - "scvtf v11.4s, v17.4s \n" - "scvtf v14.4s, v18.4s \n" - "scvtf v15.4s, v19.4s \n" - // mul - "fmul v16.4s, v10.4s, %[scale0].s[0] \n" - "fmul v17.4s, v11.4s, %[scale0].s[2] \n" - "fmul v18.4s, v14.4s, %[scale0].s[1] \n" - "fmul v19.4s, v15.4s, %[scale0].s[3] \n" - - "scvtf v10.4s, v8.4s \n" - "scvtf v11.4s, v9.4s \n" - "scvtf v14.4s, v12.4s \n" - "scvtf v15.4s, v13.4s \n" - - "str q16, [%[doutc0r0]], #16 \n" 
/* store c0r0*/ - "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ - "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ - "str q19, [%[doutc3r0]], #16 \n" /* store c3r0*/ - - // mul - "fmul v8.4s, v10.4s, %[scale1].s[0] \n" - "fmul v9.4s, v11.4s, %[scale1].s[2] \n" - "fmul v12.4s, v14.4s, %[scale1].s[1] \n" - "fmul v13.4s, v15.4s, %[scale1].s[3] \n" - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "str q8, [%[doutc4r0]], #16 \n" /* store c0r0*/ - "str q9, [%[doutc6r0]], #16 \n" /* store c2r0*/ - "str q12, [%[doutc5r0]], #16 \n" /* store c1r0*/ - "str q13, [%[doutc7r0]], #16 \n" /* store c3r0*/ - - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [doutc4r0] "+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : [scale0] "w"(w_scale0), [scale1] "w"(w_scale1) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20"); -#else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! @load data \n" - "vmov.s32 q15, #0 @ dump zero\n" - "1: @ main loop\n" - "vmax.s32 q0, q0, q15 @ relu\n" - "vmax.s32 q1, q1, q15 @ relu\n" - "vmax.s32 q2, q2, q15 @ relu\n" - "vmax.s32 q3, q3, q15 @ relu\n" - - "vmax.s32 q4, q4, q15 @ relu\n" - "vmax.s32 q5, q5, q15 @ relu\n" - "vmax.s32 q6, q6, q15 @ relu\n" - "vmax.s32 q7, q7, q15 @ relu\n" - - // int32-> fp32 - "vcvt.f32.s32 q8, q0 \n" - "vcvt.f32.s32 q9, q1 \n" - "vcvt.f32.s32 q10, q2 \n" - "vcvt.f32.s32 q11, q3 \n" - - // mul - "vmul.f32 q0, q8, %q[scale0] \n" - "vmul.f32 q1, q9, %q[scale1] \n" - "vmul.f32 q2, q10, %q[scale0] \n" - "vmul.f32 q3, q11, %q[scale1] \n" - - // int32-> fp32 - "vcvt.f32.s32 q8, q4 \n" - "vcvt.f32.s32 q9, q5 \n" - "vcvt.f32.s32 q10, q6 \n" - "vcvt.f32.s32 q11, q7 \n" - - // mul - "vmul.f32 q4, q8, %q[scale0] \n" - "vmul.f32 q5, q9, %q[scale1] \n" - "vmul.f32 q6, q10, %q[scale0] \n" - "vmul.f32 q7, q11, %q[scale1] \n" - - "vtrn.32 q0, q2 @ trans q0, q2 \n" - "vtrn.32 q4, q6 @ trans q4, q6 \n" - "vswp.32 d1, d8 @ swap d1, d8 \n" - "vswp.32 d5, d12 @ swap d5, d12\n" - - "vtrn.32 q1, q3 @ trans q1, q3 \n" - "vtrn.32 q5, q7 @ trans q5, q7 \n" - "vswp.32 d3, d10 @ swap d3, d10\n" - "vswp.32 d7, d14 @ swap d7, d14\n" - - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add pointer\n" - "vst1.32 {d4-d5}, [%[doutc1r0]]! @ store result, add pointer\n" - "vst1.32 {d8-d9}, [%[doutc2r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d12-d13}, [%[doutc3r0]]! @ store result, add " - "pointer\n" - - "vst1.32 {d2-d3}, [%[doutc4r0]]! @ store result, add pointer\n" - "vst1.32 {d6-d7}, [%[doutc5r0]]! @ store result, add pointer\n" - "vst1.32 {d10-d11}, [%[doutc6r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d14-d15}, [%[doutc7r0]]! @ store result, add " - "pointer\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! 
@load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [doutc4r0] "+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : [scale0] "w"(w_scale0), [scale1] "w"(w_scale1) - : "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q15"); -#endif - } - if (we > width) { - int offset = 32 * (valid_w / 4 - 1); - din_hei_ptr = ptr_din + offset; - int i = we - 4; - for (; i < width; ++i) { - *(doutc0_ptr++) = LITEMAX(din_hei_ptr[0] * scale[0], 0); - *(doutc1_ptr++) = LITEMAX(din_hei_ptr[1] * scale[1], 0); - *(doutc2_ptr++) = LITEMAX(din_hei_ptr[2] * scale[2], 0); - *(doutc3_ptr++) = LITEMAX(din_hei_ptr[3] * scale[3], 0); - *(doutc4_ptr++) = LITEMAX(din_hei_ptr[4] * scale[4], 0); - *(doutc5_ptr++) = LITEMAX(din_hei_ptr[5] * scale[5], 0); - *(doutc6_ptr++) = LITEMAX(din_hei_ptr[6] * scale[6], 0); - *(doutc7_ptr++) = LITEMAX(din_hei_ptr[7] * scale[7], 0); - din_hei_ptr += 8; - } - } - } - } else { - for (int i = 0; i < size_h; i++) { - int size_w = i * width; - dtype* doutc0_ptr = doutc0r0 + size_w; // doutc0r0 + width; - dtype* doutc1_ptr = doutc1r0 + size_w; - dtype* doutc2_ptr = doutc2r0 + size_w; - dtype* doutc3_ptr = doutc3r0 + size_w; - dtype* doutc4_ptr = doutc4r0 + size_w; - dtype* doutc5_ptr = doutc5r0 + size_w; - dtype* doutc6_ptr = doutc6r0 + size_w; - dtype* doutc7_ptr = doutc7r0 + size_w; - if (ce > channel) { - switch (ce - channel) { - case 7: - doutc1_ptr = trash_ptr; - case 6: - doutc2_ptr = trash_ptr; - case 5: - doutc3_ptr = trash_ptr; - case 4: - doutc4_ptr = trash_ptr; - case 3: - doutc5_ptr = trash_ptr; - case 2: - doutc6_ptr = trash_ptr; - case 1: - doutc7_ptr = trash_ptr; - default: - break; - } - } - ptr_din = din + i * valid_w * ch_n; - const int* din_hei_ptr = ptr_din; - if (cnt > 0) { - int cnt_loop = cnt; -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "movi v20.4s, #0 \n" /* for relu */ - "1: \n" /* main loop*/ - "trn1 v8.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ - "trn2 v9.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ - "trn1 v10.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ - "trn2 v11.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - - "trn1 v12.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ - "trn2 v13.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ - "trn1 v14.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ - "trn2 v15.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - - "trn1 v16.2d, v8.2d, v12.2d \n" /* trans q8, q10 00 01 02 03*/ - "trn2 v17.2d, v8.2d, v12.2d \n" /* trans q8, q10 20 21 22 23*/ - "trn1 v18.2d, v9.2d, v13.2d \n" /* trans q9, q11 10 11 12 13*/ - "trn2 v19.2d, v9.2d, v13.2d \n" /* trans q9, q11 30 31 32 33*/ - "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - - "trn1 v8.2d, v10.2d, v14.2d \n" /* trans q8, q10 40 41 42 43*/ - "trn2 v9.2d, v10.2d, v14.2d \n" /* trans q8, q10 60 61 62 63*/ - "trn1 v12.2d, v11.2d, v15.2d \n" /* trans q9, q11 50 51 52 53*/ - "trn2 v13.2d, v11.2d, v15.2d \n" /* trans q9, q11 70 71 72 73*/ - "ldp q6, q7, 
[%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - - // int32->fp32 - "scvtf v10.4s, v16.4s \n" - "scvtf v11.4s, v17.4s \n" - "scvtf v14.4s, v18.4s \n" - "scvtf v15.4s, v19.4s \n" - // mul - "fmul v16.4s, v10.4s, %[scale0].s[0] \n" - "fmul v17.4s, v11.4s, %[scale0].s[2] \n" - "fmul v18.4s, v14.4s, %[scale0].s[1] \n" - "fmul v19.4s, v15.4s, %[scale0].s[3] \n" - - "scvtf v10.4s, v8.4s \n" - "scvtf v11.4s, v9.4s \n" - "scvtf v14.4s, v12.4s \n" - "scvtf v15.4s, v13.4s \n" - - "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ - "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ - "str q19, [%[doutc3r0]], #16 \n" /* store c3r0*/ - - // mul - "fmul v8.4s, v10.4s, %[scale1].s[0] \n" - "fmul v9.4s, v11.4s, %[scale1].s[2] \n" - "fmul v12.4s, v14.4s, %[scale1].s[1] \n" - "fmul v13.4s, v15.4s, %[scale1].s[3] \n" - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "str q8, [%[doutc4r0]], #16 \n" /* store c0r0*/ - "str q9, [%[doutc6r0]], #16 \n" /* store c2r0*/ - "str q12, [%[doutc5r0]], #16 \n" /* store c1r0*/ - "str q13, [%[doutc7r0]], #16 \n" /* store c3r0*/ - - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [doutc4r0] "+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : [scale0] "w"(w_scale0), [scale1] "w"(w_scale1) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20"); -#else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! @load data \n" - "vmov.s32 q15, #0 @ dump zero\n" - "1: @ main loop\n" - // int32-> fp32 - "vcvt.f32.s32 q8, q0 \n" - "vcvt.f32.s32 q9, q1 \n" - "vcvt.f32.s32 q10, q2 \n" - "vcvt.f32.s32 q11, q3 \n" - - // mul - "vmul.f32 q0, q8, %q[scale0] \n" - "vmul.f32 q1, q9, %q[scale1] \n" - "vmul.f32 q2, q10, %q[scale0] \n" - "vmul.f32 q3, q11, %q[scale1] \n" - - // int32-> fp32 - "vcvt.f32.s32 q8, q4 \n" - "vcvt.f32.s32 q9, q5 \n" - "vcvt.f32.s32 q10, q6 \n" - "vcvt.f32.s32 q11, q7 \n" - - // mul - "vmul.f32 q4, q8, %q[scale0] \n" - "vmul.f32 q5, q9, %q[scale1] \n" - "vmul.f32 q6, q10, %q[scale0] \n" - "vmul.f32 q7, q11, %q[scale1] \n" - - "vtrn.32 q0, q2 @ trans q0, q2 \n" - "vtrn.32 q4, q6 @ trans q4, q6 \n" - "vswp.32 d1, d8 @ swap d1, d8 \n" - "vswp.32 d5, d12 @ swap d5, d12\n" - - "vtrn.32 q1, q3 @ trans q1, q3 \n" - "vtrn.32 q5, q7 @ trans q5, q7 \n" - "vswp.32 d3, d10 @ swap d3, d10\n" - "vswp.32 d7, d14 @ swap d7, d14\n" - - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add pointer\n" - "vst1.32 {d4-d5}, [%[doutc1r0]]! @ store result, add pointer\n" - "vst1.32 {d8-d9}, [%[doutc2r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d12-d13}, [%[doutc3r0]]! @ store result, add " - "pointer\n" - - "vst1.32 {d2-d3}, [%[doutc4r0]]! @ store result, add pointer\n" - "vst1.32 {d6-d7}, [%[doutc5r0]]! @ store result, add pointer\n" - "vst1.32 {d10-d11}, [%[doutc6r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d14-d15}, [%[doutc7r0]]! @ store result, add " - "pointer\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! 
@load data \n" - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! @load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [doutc4r0] "+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : [scale0] "w"(w_scale0), [scale1] "w"(w_scale1) - : "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q15"); -#endif - } - if (we > width) { - int offset = 32 * (valid_w / 4 - 1); - din_hei_ptr = ptr_din + offset; - int i = we - 4; - for (; i < width; ++i) { - *(doutc0_ptr++) = din_hei_ptr[0] * scale[0]; - *(doutc1_ptr++) = din_hei_ptr[1] * scale[1]; - *(doutc2_ptr++) = din_hei_ptr[2] * scale[2]; - *(doutc3_ptr++) = din_hei_ptr[3] * scale[3]; - *(doutc4_ptr++) = din_hei_ptr[4] * scale[4]; - *(doutc5_ptr++) = din_hei_ptr[5] * scale[5]; - *(doutc6_ptr++) = din_hei_ptr[6] * scale[6]; - *(doutc7_ptr++) = din_hei_ptr[7] * scale[7]; - din_hei_ptr += 8; - } - } - } - } - } else if (out_dtype == PRECISION(kInt8)) { - // int32_to_int8 - float32x4_t vpoff = vdupq_n_f32(0.5f); - float32x4_t vnoff = vdupq_n_f32(-0.5f); - if (flag_relu) { - for (int i = 0; i < size_h; i++) { - int size_w = i * width; - dtype* doutc0_ptr = doutc0r0 + size_w; // doutc0r0 + width; - dtype* doutc1_ptr = doutc1r0 + size_w; - dtype* doutc2_ptr = doutc2r0 + size_w; - dtype* doutc3_ptr = doutc3r0 + size_w; - dtype* doutc4_ptr = doutc4r0 + size_w; - dtype* doutc5_ptr = doutc5r0 + size_w; - dtype* doutc6_ptr = doutc6r0 + size_w; - dtype* doutc7_ptr = doutc7r0 + size_w; - if (ce > channel) { - switch (ce - channel) { - case 7: - doutc1_ptr = trash_ptr; - case 6: - doutc2_ptr = trash_ptr; - case 5: - doutc3_ptr = trash_ptr; - case 4: - doutc4_ptr = trash_ptr; - case 3: - doutc5_ptr = trash_ptr; - case 2: - doutc6_ptr = trash_ptr; - case 1: - doutc7_ptr = trash_ptr; - default: - break; - } - } - ptr_din = din + i * valid_w * ch_n; - const int* din_hei_ptr = ptr_din; - if (cnt > 0) { - int cnt_loop = cnt; -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - // "movi v20.4s, #0 \n" /* for relu */ - "1: \n" /* main loop*/ - "trn1 v8.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ - "trn2 v9.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ - "trn1 v10.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ - "trn2 v11.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - - "trn1 v12.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ - "trn2 v13.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ - "trn1 v14.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ - "trn2 v15.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - - "trn1 v16.2d, v8.2d, v12.2d \n" /* trans q8, q10 00 01 02 03*/ - "trn2 v17.2d, v8.2d, v12.2d \n" /* trans q8, q10 20 21 22 23*/ - "trn1 v18.2d, v9.2d, v13.2d \n" /* trans q9, q11 10 11 12 13*/ - "trn2 v19.2d, v9.2d, v13.2d \n" /* trans q9, q11 30 31 32 33*/ - "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - - "trn1 v8.2d, v10.2d, v14.2d \n" /* trans q8, q10 40 41 42 43*/ - "trn2 v9.2d, v10.2d, v14.2d \n" /* trans q8, 
q10 60 61 62 63*/ - "trn1 v12.2d, v11.2d, v15.2d \n" /* trans q9, q11 50 51 52 53*/ - "trn2 v13.2d, v11.2d, v15.2d \n" /* trans q9, q11 70 71 72 73*/ - "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - - "smax v16.4s, v16.4s, %[vzero].4s \n" /*relu*/ - "smax v17.4s, v17.4s, %[vzero].4s \n" /*relu*/ - "smax v18.4s, v18.4s, %[vzero].4s \n" /*relu*/ - "smax v19.4s, v19.4s, %[vzero].4s \n" /*relu*/ - - "smax v8.4s, v8.4s, %[vzero].4s \n" /*relu*/ - "smax v9.4s, v9.4s, %[vzero].4s \n" /*relu*/ - "smax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ - "smax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ - - // int32 --> fp32 - "scvtf v10.4s, v16.4s \n" - "scvtf v11.4s, v17.4s \n" - "scvtf v14.4s, v18.4s \n" - "scvtf v15.4s, v19.4s \n" - - "scvtf v20.4s, v8.4s \n" - "scvtf v21.4s, v9.4s \n" - "scvtf v22.4s, v12.4s \n" - "scvtf v23.4s, v13.4s \n" - - // mul - "fmul v16.4s, v10.4s, %[scale0].s[0] \n" - "fmul v17.4s, v11.4s, %[scale0].s[2] \n" - "fmul v18.4s, v14.4s, %[scale0].s[1] \n" - "fmul v19.4s, v15.4s, %[scale0].s[3] \n" - - "fmul v8.4s, v20.4s, %[scale1].s[0] \n" - "fmul v9.4s, v21.4s, %[scale1].s[2] \n" - "fmul v12.4s, v22.4s, %[scale1].s[1] \n" - "fmul v13.4s, v23.4s, %[scale1].s[3] \n" - - // fp32-int32 - "fcvtas v10.4s, v16.4s \n" - "fcvtas v11.4s, v17.4s \n" - "fcvtas v14.4s, v18.4s \n" - "fcvtas v15.4s, v19.4s \n" - - "fcvtas v20.4s, v8.4s \n" - "fcvtas v21.4s, v9.4s \n" - "fcvtas v22.4s, v12.4s \n" - "fcvtas v23.4s, v13.4s \n" - - // int32-int16 - "sqxtn v16.4h, v10.4s \n" - "sqxtn v17.4h, v11.4s \n" - "sqxtn v18.4h, v14.4s \n" - "sqxtn v19.4h, v15.4s \n" - - "sqxtn v8.4h, v20.4s \n" - "sqxtn v9.4h, v21.4s \n" - "sqxtn v12.4h, v22.4s \n" - "sqxtn v13.4h, v23.4s \n" - - // int16-int8 - "sqxtn v10.8b, v16.8h \n" - "sqxtn v11.8b, v17.8h \n" - "sqxtn v14.8b, v18.8h \n" - "sqxtn v15.8b, v19.8h \n" - - "sqxtn v20.8b, v8.8h \n" - "sqxtn v21.8b, v9.8h \n" - "sqxtn v22.8b, v12.8h \n" - "sqxtn v23.8b, v13.8h \n" - - "str s10, [%[doutc0r0]], #4 \n" /* store c0r0*/ - "str s11, [%[doutc2r0]], #4 \n" /* store c2r0*/ - "str s14, [%[doutc1r0]], #4 \n" /* store c1r0*/ - "str s15, [%[doutc3r0]], #4 \n" /* store c3r0*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "str s20, [%[doutc4r0]], #4 \n" /* store c0r0*/ - "str s21, [%[doutc6r0]], #4 \n" /* store c2r0*/ - "str s22, [%[doutc5r0]], #4 \n" /* store c1r0*/ - "str s23, [%[doutc7r0]], #4 \n" /* store c3r0*/ - - "bne 1b \n" /* jump to main loop*/ - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [doutc4r0] "+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - [scale0] "w"(w_scale0), [scale1] "w"(w_scale1), [vzero] "w"(vzero) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23"); -#else - asm volatile( - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! 
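// The sqxtn chains above narrow in two saturating steps,
// int32 -> int16 -> int8, so out-of-range accumulators clamp instead of
// wrapping. A scalar sketch of the same clamp chain:
static inline signed char narrow_s32_to_s8_ref(int v) {
  short s = v > 32767 ? 32767  // sqxtn .4h: s32 -> s16
                      : (v < -32768 ? -32768 : static_cast<short>(v));
  return s > 127 ? 127         // sqxtn .8b: s16 -> s8
                 : (s < -128 ? -128 : static_cast<signed char>(s));
}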
@load data \n" - - "1: @ main loop\n" - "vmax.s32 q4, q4, %q[vzero] @ relu\n" - "vmax.s32 q5, q5, %q[vzero] @ relu\n" - "vmax.s32 q6, q6, %q[vzero] @ relu\n" - "vmax.s32 q7, q7, %q[vzero] @ relu\n" - - // int32-> fp32 - "vmov.f32 q15, #0.5 \n" - "vcvt.f32.s32 q8, q4 \n" - "vcvt.f32.s32 q9, q5 \n" - "vcvt.f32.s32 q10, q6 \n" - "vcvt.f32.s32 q11, q7 \n" - - "vand.i32 q4, q15, q15 @ set offset, 0.5\n" - "vand.i32 q5, q15, q15 @ set offset, 0.5\n" - "vand.i32 q6, q15, q15 @ set offset, 0.5\n" - "vand.i32 q7, q15, q15 @ set offset, 0.5\n" - - "vmov.f32 q15, #-0.5 \n" - - "vcgt.f32 q12, q8, %q[vzero] @ get mask > 0, in0\n" - "vcgt.f32 q13, q9, %q[vzero] @ get mask > 0, in0\n" - "vcgt.f32 q14, q10, %q[vzero] @ get mask > 0, in0\n" - "vcgt.f32 q3, q11, %q[vzero] @ get mask > 0, in0\n" - - "vbif.f32 q4, q15, q12 @ get right offset\n" - "vbif.f32 q5, q15, q13 @ get right offset\n" - "vbif.f32 q6, q15, q14 @ get right offset\n" - "vbif.f32 q7, q15, q3 @ get right offset\n" - - "vld1.32 {d24-d27}, [%[ptr_din]]! @load data \n" - "vld1.32 {d28-d29}, [%[ptr_din]]! @load data \n" - "vld1.32 {d6-d7}, [%[ptr_din]]! @load data \n" - - "vmla.f32 q4, q8, %q[scale0] @ mul scale\n" - "vmla.f32 q5, q9, %q[scale1] @ mul scale\n" - "vmla.f32 q6, q10, %q[scale0] @ mul scale\n" - "vmla.f32 q7, q11, %q[scale1] @ mul scale\n" - - "vmax.s32 q12, q12, %q[vzero] @ relu\n" - "vmax.s32 q13, q13, %q[vzero] @ relu\n" - "vmax.s32 q14, q14, %q[vzero] @ relu\n" - "vmax.s32 q3, q3, %q[vzero] @ relu\n" - - "vcvt.s32.f32 q8, q4 @ cvt to int32\n" - "vcvt.s32.f32 q9, q5 @ cvt to int32\n" - "vcvt.s32.f32 q10, q6 @ cvt to int32\n" - "vcvt.s32.f32 q11, q7 @ cvt to int32\n" - - "vqmovn.s32 d8, q8 @ cnt to int16\n" - "vqmovn.s32 d10, q9 @ cnt to int16\n" - "vqmovn.s32 d12, q10 @ cnt to int16\n" - "vqmovn.s32 d14, q11 @ cnt to int16\n" - - "vqmovn.s16 d16, q4 @ cnt to int8\n" - "vqmovn.s16 d17, q5 @ cnt to int8\n" - "vqmovn.s16 d18, q6 @ cnt to int8\n" - "vqmovn.s16 d19, q7 @ cnt to int8\n" - - "vmov.f32 q15, #0.5 \n" - - "vcvt.f32.s32 q4, q12 \n" - "vcvt.f32.s32 q5, q13 \n" - "vcvt.f32.s32 q6, q14 \n" - "vcvt.f32.s32 q7, q3 \n" - - "vand.i32 q12, q15, q15 @ set offset, 0.5\n" - "vand.i32 q13, q15, q15 @ set offset, 0.5\n" - "vand.i32 q14, q15, q15 @ set offset, 0.5\n" - "vand.i32 q3, q15, q15 @ set offset, 0.5\n" - - "vmov.f32 q15, #-0.5 \n" - - "vcgt.f32 q10, q4, %q[vzero] @ get mask > 0, in0\n" - "vcgt.f32 q11, q5, %q[vzero] @ get mask > 0, in0\n" - - "vbif.f32 q12, q15, q10 @ get right offset\n" - "vbif.f32 q13, q15, q11 @ get right offset\n" - - "vcgt.f32 q10, q6, %q[vzero] @ get mask > 0, in0\n" - "vcgt.f32 q11, q7, %q[vzero] @ get mask > 0, in0\n" - - "vbif.f32 q14, q15, q10 @ get right offset\n" - "vbif.f32 q3, q15, q11 @ get right offset\n" - - "vmla.f32 q12, q4, %q[scale0] @ mul scale\n" - "vmla.f32 q13, q5, %q[scale1] @ mul scale\n" - "vmla.f32 q14, q6, %q[scale0] @ mul scale\n" - "vmla.f32 q3, q7, %q[scale1] @ mul scale\n" - - "vcvt.s32.f32 q4, q12 @ cvt to int32\n" - "vcvt.s32.f32 q5, q13 @ cvt to int32\n" - "vcvt.s32.f32 q6, q14 @ cvt to int32\n" - "vcvt.s32.f32 q7, q3 @ cvt to int32\n" - - "vqmovn.s32 d24, q4 @ cnt to int16\n" - "vqmovn.s32 d26, q5 @ cnt to int16\n" - "vqmovn.s32 d28, q6 @ cnt to int16\n" - "vqmovn.s32 d6, q7 @ cnt to int16\n" - - "vqmovn.s16 d20, q12 @ cnt to int8\n" - "vqmovn.s16 d21, q13 @ cnt to int8\n" - "vqmovn.s16 d22, q14 @ cnt to int8\n" - "vqmovn.s16 d23, q3 @ cnt to int8\n" - - "vtrn.8 d16, d18 @ trans q0, q2 \n" - "vtrn.8 d20, d22 @ trans q4, q6 \n" - "vtrn.16 d16, d20 @ trans q0, q2 \n" - 
"vtrn.16 d18, d22 @ trans q4, q6 \n" - - "vtrn.8 d17, d19 @ trans q0, q2 \n" - "vtrn.8 d21, d23 @ trans q4, q6 \n" - "vtrn.16 d17, d21 @ trans q0, q2 \n" - "vtrn.16 d19, d23 @ trans q4, q6 \n" - - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! @load data \n" - - "vst1.32 {d16[0]}, [%[doutc0r0]] @ store result, add " - "pointer\n" - "vst1.32 {d18[0]}, [%[doutc1r0]] @ store result, add " - "pointer\n" - "vst1.32 {d20[0]}, [%[doutc2r0]] @ store result, add " - "pointer\n" - "vst1.32 {d22[0]}, [%[doutc3r0]] @ store result, add " - "pointer\n" - - "vst1.32 {d17[0]}, [%[doutc4r0]] @ store result, add " - "pointer\n" - "vst1.32 {d19[0]}, [%[doutc5r0]] @ store result, add " - "pointer\n" - "vst1.32 {d21[0]}, [%[doutc6r0]] @ store result, add " - "pointer\n" - "vst1.32 {d23[0]}, [%[doutc7r0]] @ store result, add " - "pointer\n" - - "add %[doutc0r0], #4 @ add \n" - "add %[doutc1r0], #4 @ add \n" - "add %[doutc2r0], #4 @ add \n" - "add %[doutc3r0], #4 @ add \n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "add %[doutc4r0], #4 @ add \n" - "add %[doutc5r0], #4 @ add \n" - "add %[doutc6r0], #4 @ add \n" - "add %[doutc7r0], #4 @ add \n" - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [doutc4r0] "+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - [scale0] "w"(w_scale0), [scale1] "w"(w_scale1), [vzero] "w"(vzero) - : "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif - } - if (we > width) { - int offset = 32 * (valid_w / 4 - 1); - din_hei_ptr = ptr_din + offset; - int i = we - 4; - for (; i < width; ++i) { - *(doutc0_ptr++) = saturate_cast( - roundf(LITEMAX(din_hei_ptr[0] * scale[0], 0))); - *(doutc1_ptr++) = saturate_cast( - roundf(LITEMAX(din_hei_ptr[1] * scale[1], 0))); - *(doutc2_ptr++) = saturate_cast( - roundf(LITEMAX(din_hei_ptr[2] * scale[2], 0))); - *(doutc3_ptr++) = saturate_cast( - roundf(LITEMAX(din_hei_ptr[3] * scale[3], 0))); - *(doutc4_ptr++) = saturate_cast( - roundf(LITEMAX(din_hei_ptr[4] * scale[4], 0))); - *(doutc5_ptr++) = saturate_cast( - roundf(LITEMAX(din_hei_ptr[5] * scale[5], 0))); - *(doutc6_ptr++) = saturate_cast( - roundf(LITEMAX(din_hei_ptr[6] * scale[6], 0))); - *(doutc7_ptr++) = saturate_cast( - roundf(LITEMAX(din_hei_ptr[7] * scale[7], 0))); - din_hei_ptr += 8; - } - } - } - } else { - for (int i = 0; i < size_h; i++) { - int size_w = i * width; - dtype* doutc0_ptr = doutc0r0 + size_w; // doutc0r0 + width; - dtype* doutc1_ptr = doutc1r0 + size_w; - dtype* doutc2_ptr = doutc2r0 + size_w; - dtype* doutc3_ptr = doutc3r0 + size_w; - dtype* doutc4_ptr = doutc4r0 + size_w; - dtype* doutc5_ptr = doutc5r0 + size_w; - dtype* doutc6_ptr = doutc6r0 + size_w; - dtype* doutc7_ptr = doutc7r0 + size_w; - if (ce > channel) { - switch (ce - channel) { - case 7: - doutc1_ptr = trash_ptr; - case 6: - doutc2_ptr = trash_ptr; - case 5: - doutc3_ptr = trash_ptr; - case 4: - doutc4_ptr = trash_ptr; - case 3: - doutc5_ptr = trash_ptr; - case 2: - doutc6_ptr = trash_ptr; - case 1: - doutc7_ptr = trash_ptr; - default: - break; - } - } - ptr_din = din + i * valid_w * ch_n; - const int* din_hei_ptr = ptr_din; - if (cnt > 0) { - int cnt_loop = cnt; -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, 
[%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - // "movi v20.4s, #0 \n" /* for relu */ - "1: \n" /* main loop*/ - "trn1 v8.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ - "trn2 v9.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ - "trn1 v10.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ - "trn2 v11.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - - "trn1 v12.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ - "trn2 v13.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ - "trn1 v14.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ - "trn2 v15.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - - "trn1 v16.2d, v8.2d, v12.2d \n" /* trans q8, q10 00 01 02 03*/ - "trn2 v17.2d, v8.2d, v12.2d \n" /* trans q8, q10 20 21 22 23*/ - "trn1 v18.2d, v9.2d, v13.2d \n" /* trans q9, q11 10 11 12 13*/ - "trn2 v19.2d, v9.2d, v13.2d \n" /* trans q9, q11 30 31 32 33*/ - "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - - "trn1 v8.2d, v10.2d, v14.2d \n" /* trans q8, q10 40 41 42 43*/ - "trn2 v9.2d, v10.2d, v14.2d \n" /* trans q8, q10 60 61 62 63*/ - "trn1 v12.2d, v11.2d, v15.2d \n" /* trans q9, q11 50 51 52 53*/ - "trn2 v13.2d, v11.2d, v15.2d \n" /* trans q9, q11 70 71 72 73*/ - "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - - // int32 --> fp32 - "scvtf v10.4s, v16.4s \n" - "scvtf v11.4s, v17.4s \n" - "scvtf v14.4s, v18.4s \n" - "scvtf v15.4s, v19.4s \n" - - "scvtf v20.4s, v8.4s \n" - "scvtf v21.4s, v9.4s \n" - "scvtf v22.4s, v12.4s \n" - "scvtf v23.4s, v13.4s \n" - - // mul - "fmul v16.4s, v10.4s, %[scale0].s[0] \n" - "fmul v17.4s, v11.4s, %[scale0].s[2] \n" - "fmul v18.4s, v14.4s, %[scale0].s[1] \n" - "fmul v19.4s, v15.4s, %[scale0].s[3] \n" - - "fmul v8.4s, v20.4s, %[scale1].s[0] \n" - "fmul v9.4s, v21.4s, %[scale1].s[2] \n" - "fmul v12.4s, v22.4s, %[scale1].s[1] \n" - "fmul v13.4s, v23.4s, %[scale1].s[3] \n" - - // fp32-int32 - "fcvtas v10.4s, v16.4s \n" - "fcvtas v11.4s, v17.4s \n" - "fcvtas v14.4s, v18.4s \n" - "fcvtas v15.4s, v19.4s \n" - - "fcvtas v20.4s, v8.4s \n" - "fcvtas v21.4s, v9.4s \n" - "fcvtas v22.4s, v12.4s \n" - "fcvtas v23.4s, v13.4s \n" - - // int32-int16 - "sqxtn v16.4h, v10.4s \n" - "sqxtn v17.4h, v11.4s \n" - "sqxtn v18.4h, v14.4s \n" - "sqxtn v19.4h, v15.4s \n" - - "sqxtn v8.4h, v20.4s \n" - "sqxtn v9.4h, v21.4s \n" - "sqxtn v12.4h, v22.4s \n" - "sqxtn v13.4h, v23.4s \n" - - // int16-int8 - "sqxtn v10.8b, v16.8h \n" - "sqxtn v11.8b, v17.8h \n" - "sqxtn v14.8b, v18.8h \n" - "sqxtn v15.8b, v19.8h \n" - - "sqxtn v20.8b, v8.8h \n" - "sqxtn v21.8b, v9.8h \n" - "sqxtn v22.8b, v12.8h \n" - "sqxtn v23.8b, v13.8h \n" - - "str s10, [%[doutc0r0]], #4 \n" /* store c0r0*/ - "str s11, [%[doutc2r0]], #4 \n" /* store c2r0*/ - "str s14, [%[doutc1r0]], #4 \n" /* store c1r0*/ - "str s15, [%[doutc3r0]], #4 \n" /* store c3r0*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "str s20, [%[doutc4r0]], #4 \n" /* store c0r0*/ - "str s21, [%[doutc6r0]], #4 \n" /* store c2r0*/ - "str s22, [%[doutc5r0]], #4 \n" /* store c1r0*/ - "str s23, [%[doutc7r0]], #4 \n" /* store c3r0*/ - - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [doutc4r0] "+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [cnt] 
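/* armv8 write-out pipeline above: trn1/trn2 transpose the 8x8 channel
 * tile, scvtf widens int32 -> fp32, fmul applies one per-channel scale
 * lane, fcvtas converts back with round-to-nearest-ties-away, and the
 * two sqxtn stages narrow int32 -> int16 -> int8 with saturation before
 * each channel row is stored four bytes at a time. */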
"+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : [scale0] "w"(w_scale0), [scale1] "w"(w_scale1) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23"); -#else - asm volatile( - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! @load data \n" - - "1: @ main loop\n" - // int32-> fp32 - "vmov.f32 q15, #0.5 \n" - "vcvt.f32.s32 q8, q4 \n" - "vcvt.f32.s32 q9, q5 \n" - "vcvt.f32.s32 q10, q6 \n" - "vcvt.f32.s32 q11, q7 \n" - - "vand.i32 q4, q15, q15 @ set offset, 0.5\n" - "vand.i32 q5, q4, q4 @ set offset, 0.5\n" - "vand.i32 q6, q4, q4 @ set offset, 0.5\n" - "vand.i32 q7, q4, q4 @ set offset, 0.5\n" - - "vmov.f32 q15, #-0.5 \n" - - "vcgt.f32 q12, q8, %q[vzero] @ get mask > 0, in0\n" - "vcgt.f32 q13, q9, %q[vzero] @ get mask > 0, in0\n" - "vcgt.f32 q14, q10, %q[vzero] @ get mask > 0, in0\n" - "vcgt.f32 q3, q11, %q[vzero] @ get mask > 0, in0\n" - - "vbif.f32 q4, q15, q12 @ get right offset\n" - "vbif.f32 q5, q15, q13 @ get right offset\n" - "vbif.f32 q6, q15, q14 @ get right offset\n" - "vbif.f32 q7, q15, q3 @ get right offset\n" - - "vld1.32 {d24-d27}, [%[ptr_din]]! @load data \n" - "vld1.32 {d28-d29}, [%[ptr_din]]! @load data \n" - "vld1.32 {d6-d7}, [%[ptr_din]]! @load data \n" - - "vmla.f32 q4, q8, %q[scale0] @ mul scale\n" - "vmla.f32 q5, q9, %q[scale1] @ mul scale\n" - "vmla.f32 q6, q10, %q[scale0] @ mul scale\n" - "vmla.f32 q7, q11, %q[scale1] @ mul scale\n" - - "vcvt.s32.f32 q8, q4 @ cvt to int32\n" - "vcvt.s32.f32 q9, q5 @ cvt to int32\n" - "vcvt.s32.f32 q10, q6 @ cvt to int32\n" - "vcvt.s32.f32 q11, q7 @ cvt to int32\n" - - "vqmovn.s32 d8, q8 @ cnt to int16\n" - "vqmovn.s32 d10, q9 @ cnt to int16\n" - "vqmovn.s32 d12, q10 @ cnt to int16\n" - "vqmovn.s32 d14, q11 @ cnt to int16\n" - - "vqmovn.s16 d16, q4 @ cnt to int8\n" - "vqmovn.s16 d17, q5 @ cnt to int8\n" - "vqmovn.s16 d18, q6 @ cnt to int8\n" - "vqmovn.s16 d19, q7 @ cnt to int8\n" - - "vmov.f32 q15, #0.5 \n" - - "vcvt.f32.s32 q4, q12 \n" - "vcvt.f32.s32 q5, q13 \n" - "vcvt.f32.s32 q6, q14 \n" - "vcvt.f32.s32 q7, q3 \n" - - "vand.i32 q12, q15, q15 @ set offset, 0.5\n" - "vand.i32 q13, q12, q12 @ set offset, 0.5\n" - "vand.i32 q14, q12, q12 @ set offset, 0.5\n" - "vand.i32 q3, q12, q12 @ set offset, 0.5\n" - - "vmov.f32 q15, #-0.5 \n" - - "vcgt.f32 q10, q4, %q[vzero] @ get mask > 0, in0\n" - "vcgt.f32 q11, q5, %q[vzero] @ get mask > 0, in0\n" - - "vbif.f32 q12, q15, q10 @ get right offset\n" - "vbif.f32 q13, q15, q11 @ get right offset\n" - - "vcgt.f32 q10, q6, %q[vzero] @ get mask > 0, in0\n" - "vcgt.f32 q11, q7, %q[vzero] @ get mask > 0, in0\n" - - "vbif.f32 q14, q15, q10 @ get right offset\n" - "vbif.f32 q3, q15, q11 @ get right offset\n" - - "vmla.f32 q12, q4, %q[scale0] @ mul scale\n" - "vmla.f32 q13, q5, %q[scale1] @ mul scale\n" - "vmla.f32 q14, q6, %q[scale0] @ mul scale\n" - "vmla.f32 q3, q7, %q[scale1] @ mul scale\n" - - "vcvt.s32.f32 q4, q12 @ cvt to int32\n" - "vcvt.s32.f32 q5, q13 @ cvt to int32\n" - "vcvt.s32.f32 q6, q14 @ cvt to int32\n" - "vcvt.s32.f32 q7, q3 @ cvt to int32\n" - - "vqmovn.s32 d24, q4 @ cnt to int16\n" - "vqmovn.s32 d26, q5 @ cnt to int16\n" - "vqmovn.s32 d28, q6 @ cnt to int16\n" - "vqmovn.s32 d6, q7 @ cnt to int16\n" - - "vqmovn.s16 d20, q12 @ cnt to int8\n" - "vqmovn.s16 d21, q13 @ cnt to int8\n" - "vqmovn.s16 d22, q14 @ cnt to int8\n" - "vqmovn.s16 d23, q3 @ cnt to int8\n" - - "vtrn.8 d16, d18 @ trans q0, q2 \n" - 
"vtrn.8 d20, d22 @ trans q4, q6 \n" - "vtrn.16 d16, d20 @ trans q0, q2 \n" - "vtrn.16 d18, d22 @ trans q4, q6 \n" - - "vtrn.8 d17, d19 @ trans q0, q2 \n" - "vtrn.8 d21, d23 @ trans q4, q6 \n" - "vtrn.16 d17, d21 @ trans q0, q2 \n" - "vtrn.16 d19, d23 @ trans q4, q6 \n" - - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! @load data \n" - - "vst1.32 {d16[0]}, [%[doutc0r0]] @ store result, add " - "pointer\n" - "vst1.32 {d18[0]}, [%[doutc1r0]] @ store result, add " - "pointer\n" - "vst1.32 {d20[0]}, [%[doutc2r0]] @ store result, add " - "pointer\n" - "vst1.32 {d22[0]}, [%[doutc3r0]] @ store result, add " - "pointer\n" - - "vst1.32 {d17[0]}, [%[doutc4r0]] @ store result, add " - "pointer\n" - "vst1.32 {d19[0]}, [%[doutc5r0]] @ store result, add " - "pointer\n" - "vst1.32 {d21[0]}, [%[doutc6r0]] @ store result, add " - "pointer\n" - "vst1.32 {d23[0]}, [%[doutc7r0]] @ store result, add " - "pointer\n" - - "add %[doutc0r0], #4 @ add \n" - "add %[doutc1r0], #4 @ add \n" - "add %[doutc2r0], #4 @ add \n" - "add %[doutc3r0], #4 @ add \n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "add %[doutc4r0], #4 @ add \n" - "add %[doutc5r0], #4 @ add \n" - "add %[doutc6r0], #4 @ add \n" - "add %[doutc7r0], #4 @ add \n" - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [doutc4r0] "+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - [scale0] "w"(w_scale0), [scale1] "w"(w_scale1), [vzero] "w"(vzero) - : "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif - } - if (we > width) { - int offset = 32 * (valid_w / 4 - 1); - din_hei_ptr = ptr_din + offset; - int i = we - 4; - for (; i < width; ++i) { - *(doutc0_ptr++) = - saturate_cast(roundf(din_hei_ptr[0] * scale[0])); - *(doutc1_ptr++) = - saturate_cast(roundf(din_hei_ptr[1] * scale[1])); - *(doutc2_ptr++) = - saturate_cast(roundf(din_hei_ptr[2] * scale[2])); - *(doutc3_ptr++) = - saturate_cast(roundf(din_hei_ptr[3] * scale[3])); - *(doutc4_ptr++) = - saturate_cast(roundf(din_hei_ptr[4] * scale[4])); - *(doutc5_ptr++) = - saturate_cast(roundf(din_hei_ptr[5] * scale[5])); - *(doutc6_ptr++) = - saturate_cast(roundf(din_hei_ptr[6] * scale[6])); - *(doutc7_ptr++) = - saturate_cast(roundf(din_hei_ptr[7] * scale[7])); - din_hei_ptr += 8; - } - } - } - } - } else { - LOG(ERROR) << "ERROR: unsupported input data type!!"; - return false; - } - return true; -} - -/* -* din [n, hei_n, ch_n, w] -* dout [n, ch_n, hei_n, w] -*/ -template -static bool write_to_output_numc(const dtype* din, - dtype* dout, - int ch_n, - int hei_n, - int cs, - int ce, - int hs, - int he, - int ws, - int we, - int channel, - int height, - int width, - bool flag_relu, - dtype* trash_ptr) { - if (ch_n <= 0 || hei_n <= 0) { - LOG(ERROR) << "ch_n and hei_n are more than zero"; - return false; - } - int size_c_out = width * height; - - dtype* out_array[ch_n]; - out_array[0] = dout + cs * size_c_out + hs * width + ws; - - for (int i = 1; i < ch_n; i++) { - out_array[i] = out_array[i - 1] + size_c_out; - } - - const dtype* ptr_din = din; - - int cremain = ce - channel; - for (int i = 1; i <= cremain; i++) { - out_array[ch_n - i] = trash_ptr; - } - - int size_h = (he > height ? 
height : he) - hs; // size_h == hei_n - - int size_w = we - ws; - - int size_c_in = ch_n * size_w; - - size_t valid_w_byte = width * sizeof(dtype); - - if (flag_relu) { - for (int h = 0; h < size_h; h++) { - const dtype* din_ptr = din + h * size_c_in; - for (int i = 0; i < ch_n; i++) { - dtype* dout_ptr = out_array[i] + h * width; - for (int k = 0; k < width; k++) { - *(dout_ptr++) = LITEMAX(din_ptr[k], 0); - } - din_ptr += size_w; - } - } - } else { - for (int h = 0; h < size_h; h++) { - const dtype* din_ptr = din + h * size_c_in; - for (int i = 0; i < ch_n; i++) { - dtype* dout_ptr = out_array[i] + h * width; - memcpy(dout_ptr, din_ptr, valid_w_byte); - din_ptr += size_w; - } - } - } - return true; -} - -/// ch_n == ce - cs ?? -/// hei_n == he - hs ?? -/// channel height width ? -> output -template -static bool write2_to_output_numc(const ditype* din, - dotype* dout, - int ch_n, - int hei_n, - int cs, - int ce, - int hs, - int he, - int ws, - int we, - int channel, - int height, - int width, - bool flag_relu, - dotype* trash_ptr, - float const* scales) { - // static_assert(std::is_same::value, "just support float"); - - if (ch_n <= 0 || hei_n <= 0) { - LOG(ERROR) << "ch_n and hei_n are more than zero"; - return false; - } - - int size_c_out = width * height; - - dotype* out_array[ch_n]; - out_array[0] = dout + cs * size_c_out + hs * width + ws; - - for (int i = 1; i < ch_n; i++) { - out_array[i] = out_array[i - 1] + size_c_out; - } - - const ditype* ptr_din = din; - - int cremain = ce - channel; - for (int i = 1; i <= cremain; i++) { - out_array[ch_n - i] = trash_ptr; - } - - int size_h = (he > height ? height : he) - hs; // size_h == hei_n - - int size_w = we - ws; - - int size_c_in = ch_n * size_w; - - size_t valid_w_byte = width * sizeof(ditype); - - if (flag_relu) { - for (int h = 0; h < size_h; h++) { - ditype const* din_ptr = din + h * size_c_in; - for (int i = 0; i < ch_n; i++) { - float const ws = scales[(i + cs) % ch_n]; - dotype* dout_ptr = out_array[i] + h * width; - for (int k = 0; k < width; k++) { - *(dout_ptr++) = LITEMAX(din_ptr[k] * ws, 0); - } - din_ptr += size_w; - } - } - } else { - for (int h = 0; h < size_h; h++) { - ditype const* din_ptr = din + h * size_c_in; - for (int i = 0; i < ch_n; i++) { - dotype* dout_ptr = out_array[i] + h * width; - - float const* ws = &scales[(i + cs) % ch_n]; - int32_to_dtype(din_ptr, dout_ptr, ws, 1, 1, width); - - din_ptr += size_w; - } - } - } - return true; -} -/** -* innput din: nchwc(num) -*/ -inline bool fill_packed_bias_nxmw_fp32( - const float* bias, float* dout, int ch_n, int hei_n, int wround) { - if (ch_n <= 0 || hei_n <= 0) { - LOG(ERROR) << "ch_n and hei_n are more than zero"; - return false; - } - int cnt_ch = ch_n / 4; - int size = wround * ch_n; - for (int h = 0; h < hei_n; h++) { - float* dout_ptr = dout + h * size; - for (int i = 0; i < wround; i++) { - const float* bias_ptr = bias; - int j = 0; - for (; j < cnt_ch; j++) { - float32x4_t vb = vld1q_f32(bias_ptr); - bias_ptr += 4; - - vst1q_f32(dout_ptr, vb); - dout_ptr += 4; - } - j = j * 4; - for (; j < ch_n; j++) { - *dout_ptr = *bias_ptr; - dout_ptr++; - bias_ptr++; - } - } - } -} - -inline bool fill_packed_bias_nxmw_int8( - const int* bias, int* dout, int ch_n, int hei_n, int wround) { - if (ch_n <= 0 || hei_n <= 0) { - LOG(ERROR) << "ch_n and hei_n are more than zero"; - return false; - } - int cnt_ch = ch_n / 4; - int size = wround * ch_n; - for (int h = 0; h < hei_n; h++) { - int* dout_ptr = dout + h * size; - for (int i = 0; i < wround; i++) { - const int* 
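/* write2_to_output_numc above is the scaled variant of the same remap:
 * each channel row is converted through int32_to_dtype (or LITEMAX for
 * the fused-relu path) using scales[(i + cs) % ch_n]. The
 * fill_packed_bias_nxmw_{fp32,int8} helpers around this point broadcast
 * the per-channel bias into a packed [hei_n, wround, ch_n] buffer,
 *   dout[h][w][c] = bias[c]   for every (h, w),
 * four channels per vld1q/vst1q with a scalar tail for ch_n % 4; note the
 * fp32 variant is declared bool but reaches its closing brace without a
 * return statement. */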
bias_ptr = bias; - int j = 0; - for (; j < cnt_ch; j++) { - int32x4_t vb = vld1q_s32(bias_ptr); - bias_ptr += 4; - - vst1q_s32(dout_ptr, vb); - dout_ptr += 4; - } - j = j * 4; - for (; j < ch_n; j++) { - *dout_ptr = *bias_ptr; - dout_ptr++; - bias_ptr++; - } - } - } - return true; -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_depthwise.cc b/lite/backends/arm/math/conv_depthwise.cc deleted file mode 100644 index 79b8cec571..0000000000 --- a/lite/backends/arm/math/conv_depthwise.cc +++ /dev/null @@ -1,239 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/arm/math/conv_depthwise.h" -#include "lite/backends/arm/math/conv_block_utils.h" -#include "lite/backends/arm/math/conv_impl.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template <> -bool DepthwiseConv::create(const operators::ConvParam& param, - ARMContext* ctx) { - this->ctx_ = ctx; - auto x_dims = param.x->dims(); - auto w_dims = param.filter->dims(); - auto o_dims = param.output->dims(); - - int iw = x_dims[3]; // nchw - int ic = x_dims[1]; - int ow = o_dims[3]; - int oc = o_dims[1]; - int kw = w_dims[3]; - int sw = param.strides[1]; - // select dw conv kernel - if (kw == 3) { - VLOG(5) << "invoke 3x3 dw conv"; - impl_ = conv_depthwise_3x3; - } else if (kw == 5) { - VLOG(5) << "invoke 5x5 dw conv"; - this->ctx_->ExtendWorkspace((iw + ow) * sizeof(float)); - impl_ = conv_depthwise_5x5; - } else { - LOG(ERROR) << "this type dw conv not impl"; - return false; - } - return true; -} - -template <> -bool DepthwiseConv::init(const operators::ConvParam& param, - Context* ctx) { - this->ctx_ = ctx; - return create(param, ctx); -} - -template <> -bool DepthwiseConv::run(const operators::ConvParam& param) { - // start timer - const auto* i_data = param.x->data(); - const auto* w_data = param.filter->data(); - const auto* b_data = param.bias ? param.bias->data() : nullptr; - auto* o_data = param.output->mutable_data(); - - auto x_dims = param.x->dims(); - auto w_dims = param.filter->dims(); - auto o_dims = param.output->dims(); - - int iw = x_dims[3]; // nchw - int ih = x_dims[2]; - int ic = x_dims[1]; - int bs = x_dims[0]; - int oh = o_dims[2]; - int ow = o_dims[3]; - int oc = o_dims[1]; - - impl_(i_data, - o_data, - bs, - oc, - oh, - ow, - ic, - ih, - iw, - w_data, - b_data, - param, - this->ctx_); - - // timer end - return true; -} - -template -bool DepthwiseConvInt8::create(const operators::ConvParam& param, - ARMContext* ctx) { - this->ctx_ = ctx; - auto x_dims = param.x->dims(); - auto w_dims = param.filter->dims(); - auto o_dims = param.output->dims(); - - int ic = x_dims[1]; - int ih = x_dims[2]; - int iw = x_dims[3]; // nchw - int oc = o_dims[1]; - int oh = o_dims[2]; - int ow = o_dims[3]; - int kw = w_dims[3]; - int sw = param.strides[1]; - w_scale_ = param.weight_scale; - - //! 
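/* Typical call sequence for these strategy classes (template arguments
 * were elided in this patch text; they are assumed to carry the
 * precision type):
 *   DepthwiseConv<...> conv;
 *   conv.init(param, ctx);   // forwards to create(): picks impl_ by kw
 *   conv.run(param);         // one indirect call through impl_
 * create() also pre-extends the ARM workspace for the 5x5 kernel, so
 * run() itself never allocates. */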
select dw conv kernel - if (kw == 3) { - tmp_int32_out_.Resize(o_dims); - VLOG(5) << "invoke 3x3 depthwise int8 conv"; - impl_ = conv_depthwise_3x3_int8; - } else if (kw == 5) { - // update w_data scale - if (Ptype_out == PRECISION(kFloat) || Ptype_out == PRECISION(kInt8)) { - CHECK_EQ(w_scale_.size(), oc) << "w_data scale size must be oc"; - float input_scale = param.input_scale; - float output_scale = param.output_scale; - for (auto& ws : w_scale_) { - ws *= input_scale; - if (Ptype_out == PRECISION(kInt8)) { - ws /= output_scale; - } - } - } - - const int wout_round = ((ow + 7) / 8) * 8; - const int win_round = wout_round * sw + 5 - 1; - const int hout_round = ((oh + 2) / 3) * 3; - const int hin_round = hout_round * sw + 5 - 1; - const int tmp_size_out = wout_round * hout_round; - const int tmp_size_in = win_round * hin_round; - const int tmp_size_io_bytes = tmp_size_in + tmp_size_out * sizeof(int); - const int tmp_row_io_bytes = win_round + wout_round * sizeof(int); - const int tmp_size_io_float = - (tmp_size_io_bytes + sizeof(float) - 1) / sizeof(float); - const int tmp_row_io_float = - (tmp_row_io_bytes + sizeof(float) - 1) / sizeof(float); - ctx_->ExtendWorkspace( - (ctx_->threads() * tmp_size_io_float + tmp_row_io_float) * - sizeof(float)); - impl_ = conv_depthwise_5x5_int8; - VLOG(5) << "invoke conv_depthwise_5x5 int8 conv"; - } else { - LOG(ERROR) << "this type depthwise int8 conv not impl"; - return false; - } - return true; -} - -template -bool DepthwiseConvInt8::init(const operators::ConvParam& param, - Context* ctx) { - this->ctx_ = ctx; - return create(param, ctx); -} - -template -bool DepthwiseConvInt8::run(const operators::ConvParam& param) { - const int8_t* i_data = param.x->data(); - int32_t* o_data = nullptr; - const int8_t* w_data = param.filter->data(); - const int32_t* b_data = param.bias ? 
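/* Scale folding above: for kFloat output w_scale_[c] becomes
 * s_w[c] * s_in, and for kInt8 output s_w[c] * s_in / s_out, so the
 * int32 accumulator needs exactly one multiply at write-out:
 *   float_out = acc * s_w[c] * s_in
 *   int8_out  = sat8(round(acc * s_w[c] * s_in / s_out))
 * The workspace arithmetic that follows rounds the 5x5 tile up to
 * multiples of 8 (width) and 3 (height), then converts the mixed
 * int8-input / int32-output byte budget into a float-element count for
 * ExtendWorkspace(). */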
param.bias->data() : nullptr; - - // LOG(INFO) << "input size: " << param.x->memory_size() << " " - // << param.input_scale << " " << w_scale_.size(); - - auto x_dims = param.x->dims(); - auto w_dims = param.filter->dims(); - auto o_dims = param.output->dims(); - int bs = x_dims[0]; - int ic = x_dims[1]; - int ih = x_dims[2]; - int iw = x_dims[3]; // nchw - int oc = o_dims[1]; - int oh = o_dims[2]; - int ow = o_dims[3]; - int kw = w_dims[3]; - int sw = param.strides[1]; - - if (kw == 3 && Ptype_out != PRECISION(kInt32)) { - o_data = tmp_int32_out_.mutable_data(); - } else if (kw == 5 || (kw == 3 && Ptype_out == PRECISION(kInt32))) { - o_data = param.output->mutable_data(); - } else { - LOG(ERROR) << "this type dw int8 conv not impl"; - return false; - } - - impl_(i_data, - o_data, - bs, - oc, - oh, - ow, - ic, - ih, - iw, - w_data, - b_data, - param, - this->ctx_, - Ptype_out, - w_scale_.data()); - - auto i_scale = param.input_scale; - auto o_scale = param.output_scale; - if (kw == 3) { - if (Ptype_out == PRECISION(kInt8)) { - trans_tensor_dtype( - &tmp_int32_out_, param.output, i_scale, o_scale, w_scale_); - } else if (Ptype_out == PRECISION(kFloat)) { - trans_tensor_dtype( - &tmp_int32_out_, param.output, i_scale, 1.f, w_scale_); - } else if (Ptype_out != PRECISION(kInt32)) { - LOG(ERROR) << "unsupported precision type!!"; - return false; - } - } - - return true; -} - -template class DepthwiseConvInt8; -template class DepthwiseConvInt8; -template class DepthwiseConvInt8; - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_depthwise.h b/lite/backends/arm/math/conv_depthwise.h deleted file mode 100644 index cdddda79d1..0000000000 --- a/lite/backends/arm/math/conv_depthwise.h +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include "lite/backends/arm/math/conv_impl.h" -#include "lite/core/context.h" -#include "lite/core/target_wrapper.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template -class DepthwiseConv - : public ImplBase { - public: - typedef void (*conv_dw_impl)(const float* i_data, - float* o_data, - int bs, - int oc, - int oh, - int ow, - int ic, - int ih, - int kw, - const float* w_data, - const float* b_data, - const operators::ConvParam& param, - Context* ctx); - DepthwiseConv() = default; - ~DepthwiseConv() {} - - virtual bool init(const operators::ConvParam& param, - Context* ctx); - - virtual bool create(const operators::ConvParam& param, - Context* ctx); - - virtual bool run(const operators::ConvParam& param); - - private: - conv_dw_impl impl_{nullptr}; -}; - -template -class DepthwiseConvInt8 - : public ImplBase { - public: - typedef void (*conv_dw_int8_impl)(const int8_t* i_data, - int32_t* o_data, - int bs, - int oc, - int oh, - int ow, - int ic, - int ih, - int kw, - const int8_t* w_data, - const int32_t* b_data, - const operators::ConvParam& param, - Context* ctx, - PrecisionType out_type, - const float* scale); - - DepthwiseConvInt8() = default; - ~DepthwiseConvInt8() {} - - virtual bool init(const operators::ConvParam& param, - Context* ctx); - - virtual bool create(const operators::ConvParam& param, - Context* ctx); - - virtual bool run(const operators::ConvParam& param); - - private: - conv_dw_int8_impl impl_{nullptr}; - std::vector w_scale_; - Tensor tmp_int32_out_; -}; - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_depthwise_3x3_int8.cc b/lite/backends/arm/math/conv_depthwise_3x3_int8.cc deleted file mode 100644 index d1eedd9557..0000000000 --- a/lite/backends/arm/math/conv_depthwise_3x3_int8.cc +++ /dev/null @@ -1,5832 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include "lite/backends/arm/math/conv_impl.h" -#include "lite/core/context.h" -#include "lite/operators/op_params.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void conv_depthwise_3x3s1p1_bias_int8(int* dout, - const signed char* din, - const signed char* weights, - const int* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! 
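/* Kernel naming used throughout this file: 3x3 window, s1/s2 stride,
 * p1 padding, `_s` for the narrow-input fallback, `_relu` for fused
 * relu, always with bias handling; e.g.
 *   conv_depthwise_3x3s2p1_bias_s_relu_int8
 * is stride 2, pad 1, narrow input, fused relu. The dispatcher
 * conv_depthwise_3x3_int8 below selects among the eight variants. */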
for input width <= 8 -void conv_depthwise_3x3s1p1_bias_s_int8(int* dout, - const signed char* din, - const signed char* weights, - const int* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s2p1_bias_int8(int* dout, - const signed char* din, - const signed char* weights, - const int* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! for input width <= 8 -void conv_depthwise_3x3s2p1_bias_s_int8(int* dout, - const signed char* din, - const signed char* weights, - const int* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s1p1_bias_relu_int8(int* dout, - const signed char* din, - const signed char* weights, - const int* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! for input width <= 4 -void conv_depthwise_3x3s1p1_bias_s_relu_int8(int* dout, - const signed char* din, - const signed char* weights, - const int* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s2p1_bias_relu_int8(int* dout, - const signed char* din, - const signed char* weights, - const int* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! for input width <= 4 -void conv_depthwise_3x3s2p1_bias_s_relu_int8(int* dout, - const signed char* din, - const signed char* weights, - const int* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3_int8(const int8_t* din, - int32_t* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const int8_t* weights, - const int32_t* bias, - const operators::ConvParam& param, - ARMContext* ctx, - PrecisionType out_type, - const float* scale) { - int w_in = win; - int h_in = hin; - int ch_in = chin; - - int w_out = wout; - int h_out = hout; - int ch_out = chout; - int stride_h = param.strides[0]; - bool flag_relu = param.fuse_relu; - bool flag_bias = param.bias != nullptr; - // if (param.activation_param.has_active){ - // if (param.activation_param.active == Active_relu || - // fabs(param.activation_param.negative_slope) > 1e-6f){ - // flag_relu = true; - // } - // } - //! only support stride = 1 or 2 - if (stride_h == 1) { - if (flag_relu) { - if (w_in > 8) { - conv_depthwise_3x3s1p1_bias_relu_int8(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s1p1_bias_s_relu_int8(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } else { - if (w_in > 8) { - conv_depthwise_3x3s1p1_bias_int8(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s1p1_bias_s_int8(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } - } else { //! 
stride = 2 - if (flag_relu) { - if (w_in > 16) { - conv_depthwise_3x3s2p1_bias_relu_int8(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s2p1_bias_s_relu_int8(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } else { - if (w_in > 16) { - conv_depthwise_3x3s2p1_bias_int8(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s2p1_bias_s_int8(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } - } -} -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, - * width > 4 - */ - -// 4line w_in > 8 -void conv_depthwise_3x3s1p1_bias_int8(int* dout, - const signed char* din, - const signed char* weights, - const int* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - // printf("3x3s1 mult height \n"); - //! pad is done implicit - const char zero[8] = {0, 0, 0, 0, 0, 0, 0, 0}; - const unsigned char right_pad_idx[16] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; - const unsigned int right_pad_rst[8] = {0, 1, 2, 3, 4, 5, 6, 7}; - - // printf("conv3x3_dw start \n"); - signed char* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(signed char)); - int* write_ptr = - reinterpret_cast(ctx->workspace_data()) + w_in; - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - int w_stride = 9; - - int tile_w = (w_in + 7) >> 3; - int tile_h = (h_out + 1) >> 1; - int cnt_col = tile_w - 2; - - unsigned int size_pad_right = (unsigned int)(w_in - 7 - (cnt_col << 3)); - - int size_pad_bottom = h_out % 2; - - uint8x8_t vmask_rp1 = - vcgt_u8(vdup_n_u8(size_pad_right), vld1_u8(right_pad_idx)); - uint8x8_t vmask_rp2 = - vcgt_u8(vdup_n_u8(size_pad_right), vld1_u8(right_pad_idx + 8)); - - uint8x16_t vmask_rp = - vcgtq_u8(vdupq_n_u8(size_pad_right), vld1q_u8(right_pad_idx)); - // uint8x8_t vmask_rp2 = vcgt_u8(vdup_n_u8(size_pad_right), - // vld1_u8(right_pad_idx + 8)); - unsigned char vmask[16]; - vst1q_u8(vmask, vmask_rp); - - unsigned int rst_remain = (unsigned int)(w_out - ((cnt_col + 1) << 3)); - uint32x4_t vmask_result1 = - vcgtq_u32(vdupq_n_u32(rst_remain), vld1q_u32(right_pad_rst)); - uint32x4_t vmask_result2 = - vcgtq_u32(vdupq_n_u32(rst_remain), vld1q_u32(right_pad_rst + 4)); - - unsigned int rmask[8]; - vst1q_u32(rmask, vmask_result1); - vst1q_u32(rmask + 4, vmask_result2); - - int8x8_t vzero = vdup_n_s8(0); - int32x4_t vzero_32 = vdupq_n_s32(0); - - for (int n = 0; n < num; ++n) { - const signed char* din_batch = din + n * ch_in * size_in_channel; - int* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int c = 0; c < ch_in; c++) { - int* dout_ptr = dout_batch + c * size_out_channel; - - const signed char* din_ch_ptr = din_batch + c * size_in_channel; - - int bias_val = flag_bias ? 
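/* Mask setup above, in scalar terms:
 *   vmask[k] = (k < size_pad_right) ? 0xFF : 0x00   // input right tail
 *   rmask[k] = (k < rst_remain)     ? ~0u  : 0u     // output right tail
 * vbif later uses vmask to zero input lanes past the row end, and rmask
 * to keep the previously stored output values past w_out untouched when
 * the final partial results are merged back. */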
bias[c] : 0; - - const signed char* wei_ptr = weights + c * w_stride; - -#ifdef __aarch64__ - int vbias[4] = {bias_val, bias_val, bias_val, bias_val}; - - int8x8_t wr00 = vdup_n_s8(wei_ptr[0]); - int8x8_t wr10 = vdup_n_s8(wei_ptr[3]); - int8x8_t wr20 = vdup_n_s8(wei_ptr[6]); - - int8x8_t wr01 = vdup_n_s8(wei_ptr[1]); - int8x8_t wr11 = vdup_n_s8(wei_ptr[4]); - int8x8_t wr21 = vdup_n_s8(wei_ptr[7]); - - int8x8_t wr02 = vdup_n_s8(wei_ptr[2]); - int8x8_t wr12 = vdup_n_s8(wei_ptr[5]); - int8x8_t wr22 = vdup_n_s8(wei_ptr[8]); -#endif - int* doutr0 = nullptr; - int* doutr1 = nullptr; - - const signed char* dr0 = din_ch_ptr; - const signed char* dr1 = dr0 + w_in; - const signed char* dr2 = dr1 + w_in; - const signed char* dr3 = dr2 + w_in; - - const signed char* din_ptr0 = nullptr; - const signed char* din_ptr1 = nullptr; - const signed char* din_ptr2 = nullptr; - const signed char* din_ptr3 = nullptr; - - for (int i = 0; i < h_in; i += 2) { - //! process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - din_ptr3 = dr3; - - doutr0 = dout_ptr; - doutr1 = doutr0 + w_out; - unsigned int* rst_mask = rmask; - unsigned char* val_mask = vmask; - - if (i == 0) { - din_ptr0 = zero_ptr; - din_ptr1 = dr0; - din_ptr2 = dr1; - din_ptr3 = dr2; - dr0 = dr1; - dr1 = dr2; - dr2 = dr3; - dr3 = dr2 + w_in; - } else { - dr0 = dr2; - dr1 = dr3; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - } - //! process bottom pad - if (i + 3 > h_in) { - switch (i + 3 - h_in) { - case 3: - din_ptr1 = zero_ptr; - case 2: - din_ptr2 = zero_ptr; - case 1: - din_ptr3 = zero_ptr; - default: - break; - } - } - //! process bottom remain - if (i + 2 > h_out) { - doutr1 = write_ptr; - } - int cnt = cnt_col; -#ifdef __aarch64__ - asm volatile( - "PRFM PLDL1KEEP, [%[din_ptr0]] \n" - "PRFM PLDL1KEEP, [%[din_ptr1]] \n" - "PRFM PLDL1KEEP, [%[din_ptr2]] \n" - "PRFM PLDL1KEEP, [%[din_ptr3]] \n" - "movi v21.4s, #0x0\n" /* out0 = 0 */ - // left - "ld1 {v0.8b}, [%[din_ptr0]], #8 \n" /* load - a00-a015 - to - q0*/ - "ld1 {v2.8b}, [%[din_ptr1]], #8 \n" /* load - a00-a015 - to - q0*/ - "ld1 {v1.8b}, [%[din_ptr0]] \n" /* load - a00-a015 to - q0*/ - "ld1 {v3.8b}, [%[din_ptr1]] \n" /* load - a00-a015 to - q0*/ - - "ld1 {v10.4s}, [%[bias_val]] \n" /* dup v10, bias */ - "ld1 {v11.4s}, [%[bias_val]] \n" /* dup v10, bias */ - "ld1 {v12.4s}, [%[bias_val]] \n" /* dup v10, bias */ - "ld1 {v13.4s}, [%[bias_val]] \n" /* dup v10, bias */ - - // r0 - "smull v18.8h, %[v1].8b, v0.8b \n" /* outr00 = 01234567 * w01 - */ - - "ext v4.8b, v21.8b, v0.8B, #7 \n" /* vext_s8(vzero, vinr0, 7); - 00123456 */ - "ext v5.8b, v0.8b, v1.8B, #1 \n" /* vext_s8(vinr0, vinr0_1, - 1); 12345678 */ - - "ld1 {v6.8b}, [%[din_ptr2]], #8 \n" /* load - a00-a015 - to - q0*/ - "ld1 {v8.8b}, [%[din_ptr3]], #8 \n" /* load - a00-a015 - to - q0*/ - - "smlal v18.8h, %[v0].8b, v4.8b\n" /* outr00 += 00123456 * w00 */ - - "ld1 {v7.8b}, [%[din_ptr2]] \n" /* load - a00-a015 - to q0*/ - "ld1 {v9.8b}, [%[din_ptr3]] \n" /* load - a00-a015 - to q0*/ - - "sub %[din_ptr0], %[din_ptr0], #1 \n" - "sub %[din_ptr1], %[din_ptr1], #1 \n" - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v2].8b, v5.8b\n" /* outr00 += 12345678 * w02 */ - - "ext v4.8b, v21.8b, v2.8b, #7 \n" /* vext_s8(vzero, vinr0, 7); - 00123456 */ - "ext v5.8b, v2.8b, v3.8b, #1 \n" /* vext_s8(vinr0, vinr0_1, - 1); 12345678 */ - - // r1 - "sub %[din_ptr2], %[din_ptr2], #1 \n" - "sub %[din_ptr3], %[din_ptr3], #1 \n" - - "smull v19.8h, %[v1].8b, 
v2.8b \n" /* outr10 += 01234567 * w11 - */ - "smlal v18.8h, %[v4].8b, v2.8b \n" /* outr00 += 01234567 * w11 - */ - - "ext v14.8b, v21.8b, v6.8b, #7 \n" /* vext_s8(vzero, vinr0, - 7); 00123456 */ - "ext v15.8b, v6.8b, v7.8b, #1 \n" /* vext_s8(vinr0, vinr0_1, - 1); 12345678 */ - - "smlal v19.8h, %[v0].8b, v4.8b \n" /* outr00 += 01234567 * w11 - */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v3].8b, v4.8b \n" /* outr00 += 001234567 * w10 - */ - - "ld1 {v0.8b}, [%[din_ptr0]], #8 \n" /* load - a00-a015 - to - q0*/ - "ld1 {v2.8b}, [%[din_ptr1]], #8 \n" /* load - a00-a015 - to - q0*/ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smull v19.8h, %[v2].8b, v5.8b \n" /* outr00 += 01234567 * w11 - */ - "smlal v18.8h, %[v5].8b, v5.8b \n" /* outr00 += 12345678 * w12 - */ - - // r2 - "ld1 {v1.8b}, [%[din_ptr0]] \n" /* load - a00-a015 to - q0*/ - "ld1 {v3.8b}, [%[din_ptr1]] \n" /* load - a00-a015 to - q0*/ - - "smlal v19.8h, %[v4].8b, v6.8b \n" /* outr10 += 01234567 * w11 - */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v7].8b, v6.8b \n" /* outr00 += 01234567 * w11 - */ - - "ext v4.8b, v21.8b, v8.8b, #7 \n" /* vext_s8(vzero, vinr0, 7); - 00123456 */ - "ext v5.8b, v8.8b, v9.8b, #1 \n" /* vext_s8(vinr0, vinr0_1, - 1); 12345678 */ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smull v19.8h, %[v3].8b, v14.8b \n" /* outr10 += 01234567 * w11 - */ - "smlal v18.8h, %[v6].8b, v14.8b \n" /* outr00 += 01234567 * w11 - */ - - "ld1 {v6.8b}, [%[din_ptr2]], #8 \n" /* load - a00-a015 - to - q0*/ - - "smlal v19.8h, %[v5].8b, v15.8b \n" /* outr10 += 01234567 * w11 - */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v8].8b, v15.8b \n" /* outr00 += 01234567 * w11 - */ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - // r3 - "smull v19.8h, %[v7].8b, v8.8b \n" /* outr00 += 01234567 * w11 - */ - - "ld1 {v8.8b}, [%[din_ptr3]], #8 \n" /* load - a00-a015 - to - q0*/ - - "ld1 {v7.8b}, [%[din_ptr2]] \n" /* load - a00-a015 to - q0*/ - "ld1 {v9.8b}, [%[din_ptr3]] \n" /* load - a00-a015 to - q0*/ - - "smlal v19.8h, %[v6].8b, v4.8b \n" /* outr00 += 01234567 * - w11 */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "stp q10, q11, [%[ptr_out0]], #32 \n" /* store q10, q11 -> - ptr_out */ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smull v19.8h, %[v8].8b, v5.8b \n" /* outr00 += 01234567 * - w11 */ - - "ld1 {v10.4s}, [%[bias_val]] \n" /* dup v10, bias */ - "ld1 {v11.4s}, [%[bias_val]] \n" /* dup v10, bias */ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "stp q12, q13, [%[ptr_out1]], #32 \n" /* store q10, q11 -> - ptr_out */ - - "ld1 {v12.4s}, [%[bias_val]] \n" /* dup v10, bias */ - "ld1 {v13.4s}, [%[bias_val]] \n" /* dup v10, bias */ - - "cmp %[cnt], #1 \n" - "blt 3f \n" - // mid - "1: \n" - "ext v4.8b, v0.8B, v1.8b, #1 \n" /*12345678 */ - "ext v5.8b, v0.8b, v1.8B, #2 \n" /*23456789 
*/ - - // r0 - "smull v18.8h, %[v0].8b, v0.8b \n" /* outr00 = 01234567 * w00 - */ - - "ext v14.8b, v2.8B, v3.8b, #1 \n" /*12345678 */ - "ext v15.8b, v2.8b, v3.8B, #2 \n" /*23456789 */ - - "smlal v18.8h, %[v1].8b, v4.8b\n" /* outr00 += 12345678 * w01 */ - - "ext v16.8b, v6.8B, v7.8b, #1 \n" /*12345678 */ - "ext v17.8b, v6.8b, v7.8B, #2 \n" /*23456789 */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v2].8b, v5.8b\n" /* outr00 += 23456789 * w02 */ - - // r1 - "ext v4.8b, v8.8B, v9.8b, #1 \n" /*12345678 */ - "ext v5.8b, v8.8b, v9.8B, #2 \n" /*23456789 */ - - "smull v19.8h, %[v0].8b, v2.8b \n" /* outr00 = 01234567 * w00 - */ - "smlal v18.8h, %[v3].8b, v2.8b \n" /* outr00 = 01234567 * w00 - */ - - "ld1 {v0.8b}, [%[din_ptr0]], #8 \n" /* load - a00-a015 - to - q0*/ - "ld1 {v2.8b}, [%[din_ptr1]], #8 \n" /* load - a00-a015 - to - q0*/ - - "smlal v19.8h, %[v1].8b, v14.8b\n" /* outr00 += 12345678 * w01 */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v4].8b, v14.8b\n" /* outr00 += 12345678 * w01 */ - - "ld1 {v1.8b}, [%[din_ptr0]] \n" /* load - a00-a015 - to q0*/ - "ld1 {v3.8b}, [%[din_ptr1]] \n" /* load - a00-a015 - to q0*/ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smull v19.8h, %[v2].8b, v15.8b\n" /* outr00 += 23456789 * w02 */ - "smlal v18.8h, %[v5].8b, v15.8b\n" /* outr00 += 12345678 * w01 */ - - // r2 - "smlal v19.8h, %[v3].8b, v6.8b \n" /* outr00 = 01234567 * w00 - */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v6].8b, v6.8b \n" /* outr00 = 01234567 * w00 - */ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smull v19.8h, %[v4].8b, v16.8b\n" /* outr00 += 12345678 * w01 */ - "smlal v18.8h, %[v7].8b, v16.8b\n" /* outr00 += 12345678 * w01 */ - - "smlal v19.8h, %[v5].8b, v17.8b\n" /* outr00 += 23456789 * w02 */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v8].8b, v17.8b\n" /* outr00 += 12345678 * w01 */ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - // r3 - "smull v19.8h, %[v6].8b, v8.8b \n" /* outr00 = 01234567 * w00 - */ - - "ld1 {v6.8b}, [%[din_ptr2]], #8 \n" /* load - a00-a015 - to - q0*/ - "ld1 {v8.8b}, [%[din_ptr3]], #8 \n" /* load - a00-a015 - to - q0*/ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smlal v19.8h, %[v7].8b, v4.8b\n" /* outr00 += 12345678 * w01 */ - - "ld1 {v7.8b}, [%[din_ptr2]] \n" /* load - a00-a015 - to q0*/ - "ld1 {v9.8b}, [%[din_ptr3]] \n" /* load - a00-a015 - to q0*/ - - "stp q10, q11, [%[ptr_out0]], #32 \n" /* store q10, q11 -> - ptr_out */ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smull v19.8h, %[v8].8b, v5.8b\n" /* outr00 += 23456789 * w02 */ - - "ld1 {v10.4s}, [%[bias_val]] \n" /* dup v10, bias */ - "ld1 {v11.4s}, [%[bias_val]] \n" /* dup v10, bias */ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "subs 
%[cnt], %[cnt], #1 \n" - - "stp q12, q13, [%[ptr_out1]], #32 \n" /* store q10, q11 -> - ptr_out */ - - "ld1 {v12.4s}, [%[bias_val]] \n" /* dup v10, bias */ - "ld1 {v13.4s}, [%[bias_val]] \n" /* dup v10, bias */ - - "bne 1b \n" - // right - "3: \n" - "ld1 {v14.8b}, [%[vmask]], #8 \n" - "ld1 {v15.8b}, [%[vmask]] \n" - - "bif v0.8b, v21.8b, v14.8b \n" - "bif v1.8b, v21.8b, v15.8b \n" - "bif v2.8b, v21.8b, v14.8b \n" - "bif v3.8b, v21.8b, v15.8b \n" - - "ext v4.8b, v0.8b, v1.8b, #1 \n" - "ext v5.8b, v0.8b, v1.8b, #2 \n" - - // r0 - "smull v18.8h, %[v0].8b, v0.8b \n" /* outr00 = 01234567 * w00 - */ - - "ext v16.8b, v2.8b, v3.8b, #1 \n" - "ext v17.8b, v2.8b, v3.8b, #2 \n" - - "bif v6.8b, v21.8b, v14.8b \n" - "bif v7.8b, v21.8b, v15.8b \n" - - "smlal v18.8h, %[v1].8b, v4.8b \n" /* outr00 = 01234567 * w00 - */ - - "bif v8.8b, v21.8b, v14.8b \n" - "bif v9.8b, v21.8b, v15.8b \n" - - "ext v20.8b, v6.8b, v7.8b, #1 \n" - "ext v22.8b, v6.8b, v7.8b, #2 \n" - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v2].8b, v5.8b \n" /* outr00 = 01234567 * w00 - */ - - // r1 - "ext v4.8b, v8.8b, v9.8b, #1 \n" - "ext v5.8b, v8.8b, v9.8b, #2 \n" - - "smull v19.8h, %[v0].8b, v2.8b \n" /* outr00 = 01234567 * w00 - */ - "smlal v18.8h, %[v3].8b, v2.8b \n" /* outr00 = 01234567 * w00 - */ - - "ld1 {v14.4s}, [%[rmask]], #16 \n" - "ld1 {v15.4s}, [%[rmask]] \n" - - "smlal v19.8h, %[v1].8b, v16.8b \n" /* outr00 = 01234567 * w00 - */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v4].8b, v16.8b \n" /* outr00 = 01234567 * w00 - */ - - "ld1 {v0.4s}, [%[ptr_out0]], #16 \n" - "ld1 {v2.4s}, [%[ptr_out1]], #16 \n" - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smull v19.8h, %[v2].8b, v17.8b \n" /* outr00 = 01234567 * w00 - */ - "smlal v18.8h, %[v5].8b, v17.8b \n" /* outr00 = 01234567 * w00 - */ - - "ld1 {v1.4s}, [%[ptr_out0]] \n" - "ld1 {v3.4s}, [%[ptr_out1]] \n" - - // r2 - "smlal v19.8h, %[v3].8b, v6.8b \n" /* outr00 = 01234567 * w00 - */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v6].8b, v6.8b \n" /* outr00 = 01234567 * w00 - */ - - "sub %[ptr_out0], %[ptr_out0], #16 \n" - "sub %[ptr_out1], %[ptr_out1], #16 \n" - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smull v19.8h, %[v4].8b, v20.8b \n" /* outr00 = 01234567 * w00 - */ - "smlal v18.8h, %[v7].8b, v20.8b \n" /* outr00 = 01234567 * w00 - */ - - "smlal v19.8h, %[v5].8b, v22.8b \n" /* outr00 = 01234567 * w00 - */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v8].8b, v22.8b \n" /* outr00 = 01234567 * w00 - */ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - // r3 - "smull v19.8h, %[v6].8b, v8.8b \n" /* outr00 = 01234567 * w00 - */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smlal v19.8h, %[v7].8b, v4.8b \n" /* outr00 = 01234567 * w00 - */ - - "bif v10.16b, v0.16b, v14.16b \n" - "bif v11.16b, v1.16b, v15.16b \n" - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, 
v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smull v19.8h, %[v8].8b, v5.8b \n" /* outr00 = 01234567 * w00 - */ - - "stp q10, q11, [%[ptr_out0]], #32 \n" /* store q10, q11 -> - ptr_out */ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "bif v12.16b, v2.16b, v14.16b \n" - "bif v13.16b, v3.16b, v15.16b \n" - - "stp q12, q13, [%[ptr_out1]], #32 \n" /* store q10, q11 -> - ptr_out */ - - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [ptr_out0] "+r"(doutr0), - [ptr_out1] "+r"(doutr1), - [vmask] "+r"(val_mask), - [rmask] "+r"(rst_mask) - : [v0] "w"(wr00), - [v1] "w"(wr01), - [v2] "w"(wr02), - [v3] "w"(wr10), - [bias_val] "r"(vbias), - [v4] "w"(wr11), - [v5] "w"(wr12), - [v6] "w"(wr20), - [v7] "w"(wr21), - [v8] "w"(wr22) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22"); -#else - // store weights - asm volatile("vld1.8 {d0-d1}, [%[wei_ptr]] \n" - : - : [wei_ptr] "r"(wei_ptr) - : "memory"); - asm volatile( - // left - "pld [%[din_ptr0]] @ preload data\n" - "pld [%[din_ptr1]] @ preload data\n" - "pld [%[din_ptr2]] @ preload data\n" - "pld [%[din_ptr3]] @ preload data\n" - "vdup.s8 d2, d0[0] @ d2 = w00, w00, w00, w00\n" - "vdup.s8 d3, d0[1] @ d3 = w01, w01, w01, w01\n" - "vdup.s8 d4, d0[2] @ d4 = w02, w02, w02, w02\n" - "vld1.8 {d12-d13}, [%[din_ptr0]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - "vmov.u32 d11, #0 @ zero\n" - // out0 - "vdup.32 q8, %[bias] @ and \n" // q8 = - // vbias - "vdup.32 q9, %[bias] @ and \n" // q9 = - // vbias - // out1 - "vdup.32 q10, %[bias] @ and \n" // q8 = - // vbias - "vdup.32 q11, %[bias] @ and \n" // q9 = - // vbias - - // r0 - "vmull.s8 q12, d12, d3 @ out0 = din0 * w01 \n" // q12 = d12 * w01 - "vext.8 d30, d11, d12, #7 @ ext \n" // d10 = 00123456 - "vext.8 d31, d12, d13, #1 @ ext \n" // d11 = 12345678 - - "vld1.8 {d12-d13}, [%[din_ptr1]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - "vld1.8 {d14-d15}, [%[din_ptr2]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - "vdup.s8 d5, d0[3] @ d5 = w10, w10, w00, w00\n" - "vdup.s8 d6, d0[4] @ d6 = w11, w11, w01, w01\n" - - "vmlal.s8 q12, d30, d2 @ out0 += din0 * w00 \n" // q12 += d10 * w00 - - "vdup.s8 d7, d0[5] @ d7 = w12, w12\n" - "add %[din_ptr0], #7 @add \n" - "add %[din_ptr1], #7 @add \n" - - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmull.s8 q12, d31, d4 @ out0 += din0 * w02 \n" // q12 += d11 * w02 - - // r1 - "vext.8 d30, d11, d12, #7 @ ext \n" // d10 = 00123456 - "vext.8 d31, d12, d13, #1 @ ext \n" // d11 = 12345678 - "vmull.s8 q13, d12, d3 @ out1 = din1 * w01 \n" // q13 = d12 * w01 - - "vmlal.s8 q12, d12, d6 @ out0 = din1 * w11 \n" // q12 = d12 * w11 - - "vld1.8 {d12-d13}, [%[din_ptr3]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - "vdup.s8 d8, d0[6] @ d8 = w20, w00, w00, w00\n" - "vdup.s8 d9, d0[7] @ d9 = w21, w01, w01, w01\n" - "vdup.s8 d10, d1[0] @ d10 = w22, w02, w02, w02\n" - - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmlal.s8 q13, d30, d2 @ out1 += din1 * w00 \n" // q12 += d10 * w00 - "vmull.s8 q12, d30, d5 @ out0 += din1 * w10 \n" // q12 += d10 * w00 - - "add %[din_ptr2], #7 
@add \n" - "add %[din_ptr3], #7 @add \n" - - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmull.s8 q13, d31, d4 @ out1 += din1 * w02 \n" // q12 += d10 * w00 - "vmlal.s8 q12, d31, d7 @ out0 += din1 * w12 \n" // q12 += d10 * w00 - - // r2 - "vext.8 d30, d11, d14, #7 @ ext \n" // d10 = 00123456 - "vext.8 d31, d14, d15, #1 @ ext \n" // d11 = 12345678 - - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmlal.s8 q13, d14, d6 @ out1 = din2 * w11 \n" // q13 = d12 * w01 - "vmull.s8 q12, d14, d9 @ out1 = din2 * w21 \n" // q13 = d12 * w01 - - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmull.s8 q13, d30, d5 @ out1 += din2 * w10 \n" // q12 += d10 * w00 - "vmlal.s8 q12, d30, d8 @ out0 += din2 * w20 \n" // q12 += d10 * w00 - - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmlal.s8 q13, d31, d7 @ out1 += din2 * w12 \n" // q12 += d10 * w00 - "vmull.s8 q12, d31, d10 @ out0 += din2 * w22 \n" // q12 += d10 * w00 - - // r3 - "vext.8 d30, d11, d12, #7 @ ext \n" // d10 = 00123456 - "vext.8 d31, d12, d13, #1 @ ext \n" // d11 = 12345678 - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmull.s8 q13, d12, d9 @ out1 = din3 * w21 \n" // q13 = d12 * w01 - "pld [%[din_ptr0]] @ preload data\n" - "pld [%[din_ptr1]] @ preload data\n" - - "vmlal.s8 q13, d30, d8 @ out1 += din3 * w20 \n" // q13 += d10 * w00 - "pld [%[din_ptr2]] @ preload data\n" - "pld [%[din_ptr3]] @ preload data\n" - - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vst1.32 {d16-d17}, [%[dout_ptr1]]! @ store\n" - - "vmull.s8 q13, d31, d10 @ out1 += din3 * w22 \n" // q12 += d10 * w00 - - "vst1.32 {d18-d19}, [%[dout_ptr1]]! @ store\n" - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - "vst1.32 {d20-d21}, [%[dout_ptr2]]! @ store\n" - "cmp %[cnt], #1 \n" - "vst1.32 {d22-d23}, [%[dout_ptr2]]! 
@ store\n" - "blt 1f \n" - - // mid - "2: \n" - "vld1.8 {d12-d13}, [%[din_ptr0]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - // out0 - "vdup.32 q8, %[bias] @ and \n" // q8 = - // vbias - "vdup.32 q9, %[bias] @ and \n" // q9 = - // vbias - // out1 - "vdup.32 q10, %[bias] @ and \n" // q8 = - // vbias - "vdup.32 q11, %[bias] @ and \n" // q9 = - // vbias - - // r0 - "vmull.s8 q12, d12, d2 @ out0 = din0 * w01 \n" // q12 = d12 * w01 - "vext.8 d30, d12, d13, #1 @ ext \n" // d10 = 12345678 - "vext.8 d31, d12, d13, #2 @ ext \n" // d11 = 23456789 - - "vld1.8 {d12-d13}, [%[din_ptr1]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - "vld1.8 {d14-d15}, [%[din_ptr2]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - - "vmlal.s8 q12, d30, d3 @ out0 += din0 * w00 \n" // q12 += d10 * w00 - - "add %[din_ptr0], #8 @add \n" - "add %[din_ptr1], #8 @add \n" - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmull.s8 q12, d31, d4 @ out0 += din0 * w02 \n" // q12 += d11 * w02 - - // r1 - "vext.8 d30, d12, d13, #1 @ ext \n" // d10 = 00123456 - "vext.8 d31, d12, d13, #2 @ ext \n" // d11 = 12345678 - "vmull.s8 q13, d12, d2 @ out1 = din1 * w01 \n" // q13 = d12 * w01 - - "vmlal.s8 q12, d12, d5 @ out0 = din1 * w11 \n" // q12 = d12 * w11 - - "vld1.8 {d12-d13}, [%[din_ptr3]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - - "vmlal.s8 q13, d30, d3 @ out1 += din1 * w00 \n" // q12 += d10 * w00 - - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmull.s8 q12, d30, d6 @ out0 += din1 * w10 \n" // q12 += d10 * w00 - - "add %[din_ptr2], #8 @add \n" - "add %[din_ptr3], #8 @add \n" - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmull.s8 q13, d31, d4 @ out1 += din1 * w02 \n" // q12 += d10 * w00 - "vmlal.s8 q12, d31, d7 @ out0 += din1 * w12 \n" // q12 += d10 * w00 - - // r2 - "vext.8 d30, d14, d15, #1 @ ext \n" // d10 = 00123456 - "vext.8 d31, d14, d15, #2 @ ext \n" // d11 = 12345678 - - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmlal.s8 q13, d14, d5 @ out1 = din2 * w11 \n" // q13 = d12 * w01 - "vmull.s8 q12, d14, d8 @ out1 = din2 * w21 \n" // q13 = d12 * w01 - - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmull.s8 q13, d30, d6 @ out1 += din2 * w10 \n" // q12 += d10 * w00 - "vmlal.s8 q12, d30, d9 @ out0 += din2 * w20 \n" // q12 += d10 * w00 - - "vmlal.s8 q13, d31, d7 @ out1 += din2 * w12 \n" // q12 += d10 * w00 - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmull.s8 q12, d31, d10 @ out0 += din2 * w22 \n" // q12 += d10 * w00 - - // r3 - "vext.8 d30, d12, d13, #1 @ ext \n" // d10 = 00123456 - "vext.8 d31, d12, d13, #2 @ ext \n" // d11 = 12345678 - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmull.s8 q13, d12, d8 @ out1 = din3 * w21 \n" // q13 = d12 * w01 - "pld [%[din_ptr0]] @ preload 
data\n" - "pld [%[din_ptr1]] @ preload data\n" - - "vmlal.s8 q13, d30, d9 @ out1 += din3 * w20 \n" // q13 += d10 * w00 - "pld [%[din_ptr2]] @ preload data\n" - "pld [%[din_ptr3]] @ preload data\n" - - "vst1.32 {d16-d17}, [%[dout_ptr1]]! @ store\n" - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmull.s8 q13, d31, d10 @ out1 += din3 * w22 \n" // q12 += d10 * w00 - - "vst1.32 {d18-d19}, [%[dout_ptr1]]! @ store\n" - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - "vst1.32 {d20-d21}, [%[dout_ptr2]]! @ store\n" - "subs %[cnt], #1 \n" - "vst1.32 {d22-d23}, [%[dout_ptr2]]! @ store\n" - "bne 2b \n" - // right - "1: \n" - "vld1.8 {d12-d13}, [%[din_ptr0]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - "vld1.8 {d28-d29}, [%[mask]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - // out0 - "vdup.32 q8, %[bias] @ and \n" // q8 = vbias - "vdup.32 q9, %[bias] @ and \n" // q9 = vbias - // out1 - "vdup.32 q10, %[bias] @ and \n" // q8 = vbias - "vdup.32 q11, %[bias] @ and \n" // q9 = vbias - - "vbif.8 d12, d11, d28 @ bit select, deal with right pad\n" - "vbif.8 d13, d11, d29 @ bit select, deal with right pad\n" - "vld1.8 {d14-d15}, [%[din_ptr1]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - - // r0 - "vmull.s8 q12, d12, d2 @ out0 = din0 * w00 \n" // q12 = d12 * w01 - "vext.8 d30, d12, d13, #1 @ ext \n" // d10 = 12345678 - "vext.8 d31, d12, d13, #2 @ ext \n" // d11 = 23456789 - - "vld1.8 {d12-d13}, [%[din_ptr2]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - "vbif.8 d14, d11, d28 @ bit select, deal with right pad\n" - "vbif.8 d15, d11, d29 @ bit select, deal with right pad\n" - - "vmlal.s8 q12, d30, d3 @ out0 += din0 * w01 \n" // q12 += d10 * w00 - - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmull.s8 q12, d31, d4 @ out0 += din0 * w02 \n" // q12 += d11 * w02 - - // r1 - "vext.8 d30, d14, d15, #1 @ ext \n" // d10 = 00123456 - "vext.8 d31, d14, d15, #2 @ ext \n" // d11 = 12345678 - - "vmull.s8 q13, d14, d2 @ out1 = din1 * w00 \n" // q13 = d12 * w01 - - "vmlal.s8 q12, d14, d5 @ out0 = din1 * w10 \n" // q12 = d12 * w11 - - "vld1.8 {d14-d15}, [%[din_ptr3]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - "vbif.8 d12, d11, d28 @ bit select, deal with " - "right pad\n" - "vbif.8 d13, d11, d29 @ bit select, deal with " - "right pad\n" - - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmlal.s8 q13, d30, d3 @ out1 += din1 * w01 \n" // q12 += d10 * w00 - "vmull.s8 q12, d30, d6 @ out0 += din1 * w11 \n" // q12 += d10 * w00 - - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmull.s8 q13, d31, d4 @ out1 += din1 * w02 \n" // q12 += d10 * w00 - "vmlal.s8 q12, d31, d7 @ out0 += din1 * w12 \n" // q12 += d10 * w00 - - // r2 - "vext.8 d30, d12, d13, #1 @ ext \n" // d10 = 00123456 - "vext.8 d31, d12, d13, #2 @ ext \n" // d11 = 12345678 - - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmlal.s8 q13, d12, d5 @ out1 = din2 * w10 \n" // q13 = d12 * w01 - "vmull.s8 q12, d12, d8 @ out1 = din2 * w20 \n" // q13 = d12 * w01 - - "vbif.8 d14, d11, 
d28 @ bit select, deal with " - "right pad\n" - "vbif.8 d15, d11, d29 @ bit select, deal with " - "right pad\n" - - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmull.s8 q13, d30, d6 @ out1 += din2 * w10 \n" // q12 += d10 * w00 - "vmlal.s8 q12, d30, d9 @ out0 += din2 * w20 \n" // q12 += d10 * w00 - - "vld1.32 {d28-d29}, [%[dout_ptr1]]! @ load din00= 0 1 2 3 4 5 6 " - "7 8 9\n" - "vld1.32 {d12-d13}, [%[dout_ptr1]] @ load din00= 0 1 2 3 4 5 6 " - "7 8 9\n" - "vld1.32 {d2-d3}, [%[rs_mask]]! @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - "vld1.32 {d4-d5}, [%[rs_mask]] @ load din00= 0 1 2 3 4 5 6 7 8 " - "9\n" - - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmlal.s8 q13, d31, d7 @ out1 += din2 * w12 \n" // q12 += d10 * w00 - "vmull.s8 q12, d31, d10 @ out0 += din2 * w22 \n" // q12 += d10 * w00 - - // r3 - "vext.8 d30, d14, d15, #1 @ ext \n" // d10 = 00123456 - "vext.8 d31, d14, d15, #2 @ ext \n" // d11 = 12345678 - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmull.s8 q13, d14, d8 @ out1 = din3 * w20 \n" // q13 = d12 * w01 - "sub %[dout_ptr1], #16 @ sub \n" - "vld1.32 {d14-d15}, [%[dout_ptr2]]! @ load din00= 0 1 2 3 4 5 6 " - "7 8 9\n" - "vld1.32 {d24-d25}, [%[dout_ptr2]] @ load din00= 0 1 2 3 4 5 6 " - "7 8 9\n" - - "vmlal.s8 q13, d30, d9 @ out1 += din3 * w21 \n" // q13 += d10 * w00 - "vbif q8, q14, q1 @ bit select, deal with right " - "pad\n" - "vbif q9, q6, q2 @ bit select, deal with right " - "pad\n" - "sub %[dout_ptr2], #16 @ sub \n" - - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmull.s8 q13, d31, d10 @ out1 += din3 * w22 \n" // q12 += d10 * w00 - - "vst1.32 {d16-d17}, [%[dout_ptr1]]! @ store\n" - "vst1.32 {d18-d19}, [%[dout_ptr1]]! @ store\n" - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vbif q10, q7, q1 @ bit select, deal with right pad\n" - "vbif q11, q12, q2 @ bit select, deal with right pad\n" - - "vst1.32 {d20-d21}, [%[dout_ptr2]]! @ store\n" - "vst1.32 {d22-d23}, [%[dout_ptr2]]! @ store\n" - - : [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [cnt] "+r"(cnt), - [bias] "+r"(bias_val), - [rs_mask] "+r"(rst_mask) - : [mask] "r"(vmask) - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif - dout_ptr += 2 * w_out; - } - } - } -} - -// w_in <= 8 -void conv_depthwise_3x3s1p1_bias_s_int8(int* dout, - const signed char* din, - const signed char* weights, - const int* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - // printf("3x3s1 mult height \n"); - const char zero[8] = {0, 0, 0, 0, 0, 0, 0, 0}; - //! 
for 4x6 convolution window - const unsigned char right_pad_idx[8] = {0, 1, 2, 3, 4, 5, 6, 7}; - const unsigned int right_pad_rst[8] = {0, 1, 2, 3, 4, 5, 6, 7}; - - // printf("conv3x3_dw start \n"); - signed char* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(signed char)); - int* write_ptr = - reinterpret_cast(ctx->workspace_data()) + w_in; - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - int w_stride = 9; - - int tile_h = (h_out + 1) >> 1; - - unsigned int size_pad_right = (unsigned int)(w_in); - - uint8x8_t vmask_rp = - vcgt_u8(vdup_n_u8(size_pad_right), vld1_u8(right_pad_idx)); - // uint8x8_t vmask_rp2 = vcgt_u8(vdup_n_u8(size_pad_right), - // vld1_u8(right_pad_idx + 8)); - unsigned char vmask[8]; - vst1_u8(vmask, vmask_rp); - - unsigned int rst_remain = (unsigned int)w_out; - uint32x4_t vmask_result1 = - vcgtq_u32(vdupq_n_u32(rst_remain), vld1q_u32(right_pad_rst)); - uint32x4_t vmask_result2 = - vcgtq_u32(vdupq_n_u32(rst_remain), vld1q_u32(right_pad_rst + 4)); - - unsigned int rmask[8]; - vst1q_u32(rmask, vmask_result1); - vst1q_u32(rmask + 4, vmask_result2); - - int8x8_t vzero = vdup_n_s8(0); - int32x4_t vzero_32 = vdupq_n_s32(0); - - for (int n = 0; n < num; ++n) { - const signed char* din_batch = din + n * ch_in * size_in_channel; - int* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int c = 0; c < ch_in; c++) { - int* dout_ptr = dout_batch + c * size_out_channel; - - const signed char* din_ch_ptr = din_batch + c * size_in_channel; - - int bias_val = flag_bias ? bias[c] : 0; - - const signed char* wei_ptr = weights + c * w_stride; -#ifdef __aarch64__ - int vbias[4] = {bias_val, bias_val, bias_val, bias_val}; - int8x8_t wr00 = vdup_n_s8(wei_ptr[0]); - int8x8_t wr10 = vdup_n_s8(wei_ptr[3]); - int8x8_t wr20 = vdup_n_s8(wei_ptr[6]); - - int8x8_t wr01 = vdup_n_s8(wei_ptr[1]); - int8x8_t wr11 = vdup_n_s8(wei_ptr[4]); - int8x8_t wr21 = vdup_n_s8(wei_ptr[7]); - - int8x8_t wr02 = vdup_n_s8(wei_ptr[2]); - int8x8_t wr12 = vdup_n_s8(wei_ptr[5]); - int8x8_t wr22 = vdup_n_s8(wei_ptr[8]); -#endif - int* doutr0 = nullptr; - int* doutr1 = nullptr; - - const signed char* dr0 = din_ch_ptr; - const signed char* dr1 = dr0 + w_in; - const signed char* dr2 = dr1 + w_in; - const signed char* dr3 = dr2 + w_in; - - const signed char* din_ptr0 = nullptr; - const signed char* din_ptr1 = nullptr; - const signed char* din_ptr2 = nullptr; - const signed char* din_ptr3 = nullptr; - - for (int i = 0; i < h_in; i += 2) { - //! process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - din_ptr3 = dr3; - - doutr0 = dout_ptr; - doutr1 = doutr0 + w_out; - unsigned int* rst_mask = rmask; - - int out_buf1[8]; - int out_buf2[8]; - int trash_buf[8] = {0, 0, 0, 0, 0, 0, 0, 0}; - - if (i == 0) { - din_ptr0 = zero_ptr; - din_ptr1 = dr0; - din_ptr2 = dr1; - din_ptr3 = dr2; - dr0 = dr1; - dr1 = dr2; - dr2 = dr3; - dr3 = dr2 + w_in; - } else { - dr0 = dr2; - dr1 = dr3; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - } - //! process bottom pad - if (i + 3 > h_in) { - switch (i + 3 - h_in) { - case 3: - din_ptr1 = zero_ptr; - case 2: - din_ptr2 = zero_ptr; - case 1: - din_ptr3 = zero_ptr; - default: - break; - } - } - //! 
process bottom remain - if (i + 2 > h_out) { - doutr1 = trash_buf; - } -#ifdef __aarch64__ - asm volatile( - "PRFM PLDL1KEEP, [%[din_ptr0]] \n" - "PRFM PLDL1KEEP, [%[din_ptr1]] \n" - "PRFM PLDL1KEEP, [%[din_ptr2]] \n" - "PRFM PLDL1KEEP, [%[din_ptr3]] \n" - "movi v21.4s, #0x0\n" /* out0 = 0 */ - // left - "ld1 {v4.8b}, [%[vmask]] \n" - "ld1 {v0.8b}, [%[din_ptr0]], #8 \n" /* load - a00-a015 - to - q0*/ - "ld1 {v1.8b}, [%[din_ptr1]], #8 \n" /* load - a00-a015 - to - q0*/ - "ld1 {v2.8b}, [%[din_ptr2]], #8 \n" /* load - a00-a015 - to - q0*/ - "ld1 {v3.8b}, [%[din_ptr3]], #8 \n" /* load - a00-a015 - to - q0*/ - - "bif v0.8b, v21.8b, v4.8b \n" - "bif v1.8b, v21.8b, v4.8b \n" - "bif v2.8b, v21.8b, v4.8b \n" - "bif v3.8b, v21.8b, v4.8b \n" - - "ext v6.8b, v21.8b, v0.8B, #7 \n" /* vext_s8(vzero, vinr0, 7); - 00123456 */ - "ext v7.8b, v0.8b, v21.8B, #1 \n" /* vext_s8(vinr0, vinr0_1, - 1); 12345678 */ - - "ld1 {v10.4s}, [%[vbias]] \n" - "ld1 {v11.4s}, [%[vbias]] \n" - - // r0 - "smull v18.8h, %[v1].8b, v0.8b \n" /* outr00 = 01234567 * w01 - */ - - "ext v8.8b, v21.8b, v1.8B, #7 \n" /* vext_s8(vzero, vinr0, 7); - 00123456 */ - "ext v9.8b, v1.8b, v21.8B, #1 \n" /* vext_s8(vinr0, vinr0_1, - 1); 12345678 */ - - "smlal v18.8h, %[v0].8b, v6.8b \n" /* outr00 = 01234567 * w00 - */ - - "ld1 {v12.4s}, [%[vbias]] \n" - "ld1 {v13.4s}, [%[vbias]] \n" - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v2].8b, v7.8b \n" /* outr00 = 01234567 * w00 - */ - - "ext v6.8b, v21.8b, v2.8B, #7 \n" /* vext_s8(vzero, vinr0, 7); - 00123456 */ - "ext v7.8b, v2.8b, v21.8B, #1 \n" /* vext_s8(vinr0, vinr0_1, - 1); 12345678 */ - - // r1 - "smull v19.8h, %[v1].8b, v1.8b \n" /* outr00 = 01234567 * w00 - */ - "smlal v18.8h, %[v4].8b, v1.8b \n" /* outr00 = 01234567 * w00 - */ - - // "ld1 {v14.4s}, [%[rmask]], #16 \n" - // "ld1 {v15.4s}, [%[rmask]] \n" - - "smlal v19.8h, %[v0].8b, v8.8b \n" /* outr00 = 01234567 * w00 - */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v3].8b, v8.8b \n" /* outr00 = 01234567 * w00 - */ - - // "ld1 {v16.4s}, [%[ptr_out0]], #16 \n" - // "ld1 {v17.4s}, [%[ptr_out1]], #16 \n" - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smull v19.8h, %[v2].8b, v9.8b \n" /* outr00 = 01234567 * w00 - */ - "smlal v18.8h, %[v5].8b, v9.8b \n" /* outr00 = 01234567 * w00 - */ - - "ext v8.8b, v21.8b, v3.8B, #7 \n" /* vext_s8(vzero, vinr0, 7); - 00123456 */ - "ext v9.8b, v3.8b, v21.8B, #1 \n" // vext_s8(vinr0, vinr0_1, - // 1); 12345678 - - // "ld1 {v0.4s}, [%[ptr_out0]] \n" - // "ld1 {v1.4s}, [%[ptr_out1]] \n" - - // r2 - "smlal v19.8h, %[v4].8b, v2.8b \n" /* outr00 = 01234567 * w00 - */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v7].8b, v2.8b \n" /* outr00 = 01234567 * w00 - */ - - // "sub %[ptr_out0], %[ptr_out0], #16 \n" - // "sub %[ptr_out1], %[ptr_out1], #16 \n" - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smull v19.8h, %[v3].8b, v6.8b \n" /* outr00 = 01234567 * w00 - */ - "smlal v18.8h, %[v6].8b, v6.8b \n" /* outr00 = 01234567 * w00 - */ - - "smlal v19.8h, %[v5].8b, v7.8b \n" /* outr00 = 01234567 * w00 - */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, 
v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v8].8b, v7.8b \n" /* outr00 = 01234567 * w00 - */ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - // r3 - "smull v19.8h, %[v7].8b, v3.8b \n" /* outr00 = 01234567 * w00 - */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smlal v19.8h, %[v6].8b, v8.8b \n" /* outr00 = 01234567 * w00 - */ - - // "bif v10.16b, v16.16b, v14.16b \n" - // "bif v11.16b, v0.16b, v15.16b \n" - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smull v19.8h, %[v8].8b, v9.8b \n" /* outr00 = 01234567 * w00 - */ - - "stp q10, q11, [%[ptr_out0]] \n" /* store q10, q11 -> ptr_out */ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - // "bif v12.16b, v17.16b, v14.16b \n" - // "bif v13.16b, v1.16b, v15.16b \n" - - "stp q12, q13, [%[ptr_out1]] \n" /* store q10, q11 -> ptr_out */ - - : [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [rmask] "+r"(rst_mask) - : [v0] "w"(wr00), - [v1] "w"(wr01), - [v2] "w"(wr02), - [v3] "w"(wr10), - [vbias] "r"(vbias), - [v4] "w"(wr11), - [v5] "w"(wr12), - [v6] "w"(wr20), - [v7] "w"(wr21), - [v8] "w"(wr22), - [vmask] "r"(vmask), - [ptr_out0] "r"(out_buf1), - [ptr_out1] "r"(out_buf2) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22"); -#else - // store weights - asm volatile("vld1.8 {d0-d1}, [%[wei_ptr]] \n" - : - : [wei_ptr] "r"(wei_ptr) - : "memory"); - asm volatile( - // left - "pld [%[din_ptr0]] @ preload data\n" - "pld [%[din_ptr1]] @ preload data\n" - "pld [%[din_ptr2]] @ preload data\n" - "pld [%[din_ptr3]] @ preload data\n" - "vld1.8 {d28}, [%[mask]] @ load din00= 0 1 2 3 4 5 6 7 8 9\n" - "vld1.8 {d12}, [%[din_ptr0]] @ load din00= 0 1 2 3 4 5 6 7 8 9\n" - "vld1.8 {d13}, [%[din_ptr1]] @ load din00= 0 1 2 3 4 5 6 7 8 9\n" - "vdup.s8 d2, d0[0] @ d2 = w00, w00, w00, w00\n" - "vdup.s8 d3, d0[1] @ d3 = w01, w01, w01, w01\n" - "vdup.s8 d4, d0[2] @ d4 = w02, w02, w02, w02\n" - - "vmov.u32 d11, #0 @ zero\n" - // out0 - "vdup.32 q8, %[bias] @ and \n" // q8 = - // vbias - "vdup.32 q9, %[bias] @ and \n" // q9 = - // vbias - - "vbif.8 d12, d11, d28 @ bit select, deal with right pad\n" - "vbif.8 d13, d11, d28 @ bit select, deal with right pad\n" - "vld1.8 {d14}, [%[din_ptr2]] @ load din00= 0 1 2 3 4 5 6 7 8 9\n" - "vld1.8 {d15}, [%[din_ptr3]] @ load din00= 0 1 2 3 4 5 6 7 8 9\n" - // out1 - "vdup.32 q10, %[bias] @ and \n" // q8 = - // vbias - "vdup.32 q11, %[bias] @ and \n" // q9 = - // vbias - - // r0 - "vmull.s8 q12, d12, d3 @ out0 = din0 * w01 \n" // q12 = d12 * w01 - "vext.8 d30, d11, d12, #7 @ ext \n" // d10 = 00123456 - "vext.8 d31, d12, d11, #1 @ ext \n" // d11 = 12345678 - - "vdup.s8 d5, d0[3] @ d5 = w10, w10, w00, w00\n" - "vdup.s8 d6, d0[4] @ d6 = w11, w11, w01, w01\n" - - "vmlal.s8 q12, d30, d2 @ out0 += din0 * w00 \n" // q12 += d10 * w00 - - "vdup.s8 d7, d0[5] @ d7 = w12, w12\n" - "vbif.8 d14, d11, d28 @ bit select, deal with right pad\n" - "vbif.8 d15, d11, d28 @ bit select, deal with right pad\n" - - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 
@addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmull.s8 q12, d31, d4 @ out0 += din0 * w02 \n" // q12 += d11 * w02 - - // r1 - "vext.8 d30, d11, d13, #7 @ ext \n" // d10 = 00123456 - "vext.8 d31, d13, d11, #1 @ ext \n" // d11 = 12345678 - "vmull.s8 q13, d13, d3 @ out1 = din1 * w01 \n" // q13 = d12 * w01 - - "vmlal.s8 q12, d13, d6 @ out0 = din1 * w11 \n" // q12 = d12 * w11 - - "vdup.s8 d8, d0[6] @ d8 = w20, w00, w00, w00\n" - "vdup.s8 d9, d0[7] @ d9 = w21, w01, w01, w01\n" - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmlal.s8 q13, d30, d2 @ out1 += din1 * w00 \n" // q12 += d10 * w00 - "vmull.s8 q12, d30, d5 @ out0 += din1 * w10 \n" // q12 += d10 * w00 - - "vdup.s8 d10, d1[0] @ d10 = w22, w02, w02, w02\n" - // "vld1.32 {d28-d29}, [%[dout_ptr1]]! @ load din00= 0 1 2 3 4 5 - // 6 7 8 9\n" "vld1.32 {d12-d13}, [%[dout_ptr1]] @ load din00= 0 - // 1 2 3 4 5 6 7 8 9\n" - - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmull.s8 q13, d31, d4 @ out1 += din1 * w02 \n" // q12 += d10 * w00 - "vmlal.s8 q12, d31, d7 @ out0 += din1 * w12 \n" // q12 += d10 * w00 - - // r2 - "vext.8 d30, d11, d14, #7 @ ext \n" // d10 = 00123456 - "vext.8 d31, d14, d11, #1 @ ext \n" // d11 = 12345678 - - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmlal.s8 q13, d14, d6 @ out1 = din2 * w11 \n" // q13 = d12 * w01 - "vmull.s8 q12, d14, d9 @ out1 = din2 * w21 \n" // q13 = d12 * w01 - - // "sub %[dout_ptr1], #16 @ sub \n" - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmull.s8 q13, d30, d5 @ out1 += din2 * w10 \n" // q12 += d10 * w00 - "vmlal.s8 q12, d30, d8 @ out0 += din2 * w20 \n" // q12 += d10 * w00 - - // "vld1.32 {d2-d3}, [%[rs_mask]]! @ load din00= 0 1 2 3 4 5 6 7 - // 8 9\n" "vld1.32 {d4-d5}, [%[rs_mask]] @ load din00= 0 1 2 3 4 - // 5 6 7 8 9\n" - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmlal.s8 q13, d31, d7 @ out1 += din2 * w12 \n" // q12 += d10 * w00 - "vmull.s8 q12, d31, d10 @ out0 += din2 * w22 \n" // q12 += d10 * w00 - - // r3 - "vext.8 d30, d11, d15, #7 @ ext \n" // d10 = 00123456 - "vext.8 d31, d15, d11, #1 @ ext \n" // d11 = 12345678 - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmull.s8 q13, d15, d9 @ out1 = din3 * w21 \n" // q13 = d12 * w01 - - // "vld1.32 {d6-d7}, [%[dout_ptr2]]! 
@ load din00= 0 1 2 3 4 5 6 - // 7 8 9\n" "vld1.32 {d14-d15}, [%[dout_ptr2]] @ load din00= 0 1 - // 2 3 4 5 6 7 8 9\n" - - "vmlal.s8 q13, d30, d8 @ out1 += din3 * w20 \n" // q13 += d10 * w00 - - // "vbif q8, q14, q1 @ bit select, deal with right - // pad\n" "vbif q9, q6, q2 @ bit select, deal - // with right pad\n" - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmull.s8 q13, d31, d10 @ out1 += din3 * w22 \n" // q12 += d10 * w00 - - // "sub %[dout_ptr2], #16 @ sub \n" - - "vst1.32 {d16-d19}, [%[dout_ptr1]] @ store\n" - // "vst1.32 {d18-d19}, [%[dout_ptr1]]! @ store\n" - - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - // "vbif q10, q3, q1 @ bit select, deal with right - // pad\n" "vbif q11, q7, q2 @ bit select, deal - // with right pad\n" - - "vst1.32 {d20-d23}, [%[dout_ptr2]] @ store\n" - // "vst1.32 {d22-d23}, [%[dout_ptr2]]! @ store\n" - : [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [bias] "+r"(bias_val), - [rs_mask] "+r"(rst_mask) - : [mask] "r"(vmask), - [dout_ptr1] "r"(out_buf1), - [dout_ptr2] "r"(out_buf2) - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif - for (int w = 0; w < w_out; ++w) { - *doutr0++ = out_buf1[w]; - *doutr1++ = out_buf2[w]; - } - dout_ptr += 2 * w_out; - } - } - } -} - -// 4line w_in > 16 -void conv_depthwise_3x3s2p1_bias_int8(int* dout, - const signed char* din, - const signed char* weights, - const int* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - // printf("3x3s2 mult height \n"); - //! pad is done implicit - const char zero[8] = {0, 0, 0, 0, 0, 0, 0, 0}; - //! 
for 4x6 convolution window - const unsigned char right_pad_idx[16] = { - 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15}; - const unsigned int right_pad_rst[8] = {0, 1, 2, 3, 4, 5, 6, 7}; - - // printf("conv3x3_dw start \n"); - signed char* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(signed char)); - int* write_ptr = - reinterpret_cast(ctx->workspace_data()) + w_out; - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - int w_stride = 9; - - int tile_w = (w_in + 15) >> 4; - int cnt_col = tile_w - 2; - - unsigned int size_pad_right = (unsigned int)(w_in - 15 - (cnt_col << 4)); - if (size_pad_right == 17) { - size_pad_right = 0; - cnt_col++; - } - - uint8x8_t vmask_rp1 = - vcgt_u8(vdup_n_u8(size_pad_right), vld1_u8(right_pad_idx)); - uint8x8_t vmask_rp2 = - vcgt_u8(vdup_n_u8(size_pad_right), vld1_u8(right_pad_idx + 8)); - unsigned int rst_remain = (unsigned int)(w_out - ((cnt_col + 1) << 3)); - uint32x4_t vmask_result1 = - vcgtq_u32(vdupq_n_u32(rst_remain), vld1q_u32(right_pad_rst)); - uint32x4_t vmask_result2 = - vcgtq_u32(vdupq_n_u32(rst_remain), vld1q_u32(right_pad_rst + 4)); - - uint8x16_t vmask_rp = - vcgtq_u8(vdupq_n_u8(size_pad_right), vld1q_u8(right_pad_idx)); - unsigned char vmask[16]; - vst1q_u8(vmask, vmask_rp); - - unsigned int rmask[8]; - vst1q_u32(rmask, vmask_result1); - vst1q_u32(rmask + 4, vmask_result2); - - int8x8_t vzero = vdup_n_s8(0); - // printf("cnt_col: %d, rst_remain: %d, size_pad_right: %d\n", cnt_col, - // rst_remain, size_pad_right); - for (int n = 0; n < num; ++n) { - const signed char* din_batch = din + n * ch_in * size_in_channel; - int* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int c = 0; c < ch_in; c++) { - int* dout_ptr = dout_batch + c * size_out_channel; - - const signed char* din_ch_ptr = din_batch + c * size_in_channel; - - int bias_val = flag_bias ? bias[c] : 0; - - const signed char* wei_ptr = weights + c * w_stride; -#ifdef __aarch64__ - int vbias[4] = {bias_val, bias_val, bias_val, bias_val}; - int8x8_t wr00 = vdup_n_s8(wei_ptr[0]); - int8x8_t wr10 = vdup_n_s8(wei_ptr[3]); - int8x8_t wr20 = vdup_n_s8(wei_ptr[6]); - - int8x8_t wr01 = vdup_n_s8(wei_ptr[1]); - int8x8_t wr11 = vdup_n_s8(wei_ptr[4]); - int8x8_t wr21 = vdup_n_s8(wei_ptr[7]); - - int8x8_t wr02 = vdup_n_s8(wei_ptr[2]); - int8x8_t wr12 = vdup_n_s8(wei_ptr[5]); - int8x8_t wr22 = vdup_n_s8(wei_ptr[8]); -#endif - - int* doutr0 = nullptr; - - const signed char* dr0 = din_ch_ptr; - const signed char* dr1 = dr0 + w_in; - const signed char* dr2 = dr1 + w_in; - - const signed char* din_ptr0 = nullptr; - const signed char* din_ptr1 = nullptr; - const signed char* din_ptr2 = nullptr; - - for (int i = 0; i < h_in; i += 2) { - //! process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - - doutr0 = dout_ptr; - if (i == 0) { - din_ptr0 = zero_ptr; - din_ptr1 = dr0; - din_ptr2 = dr1; - dr0 = dr1; - dr1 = dr2; - dr2 = dr1 + w_in; - } else { - dr0 = dr2; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - } - //! 
process bottom pad - if (i + 2 > h_in) { - switch (i + 2 - h_in) { - case 2: - din_ptr1 = zero_ptr; - case 1: - din_ptr2 = zero_ptr; - default: - break; - } - } -#ifdef __aarch64__ - int cnt = cnt_col; - unsigned char* val_mask = vmask; - asm volatile( - "PRFM PLDL1KEEP, [%[din_ptr0]] \n" - "PRFM PLDL1KEEP, [%[din_ptr1]] \n" - "PRFM PLDL1KEEP, [%[din_ptr2]] \n" - "movi v10.4s, #0x0\n" - // left - "ld2 {v0.8b - v1.8b}, [%[din_ptr0]] \n" /*load a00-a015 - to q0*/ - "ld2 {v2.8b - v3.8b}, [%[din_ptr1]] \n" /* load a00-a015 - to q0*/ - "ld2 {v4.8b - v5.8b}, [%[din_ptr2]] \n" /*load a00-a015 - to q0*/ - - "ld1 {v12.4s}, [%[bias_val]] \n" /* dup v10, bias*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /* dup v10, bias */ - - "ext v6.8b, v10.8b, v1.8B, #7 \n" /* vext_s8(vzero, vinr0, 7); - 013579 */ - "ext v7.8b, v10.8b, v3.8B, #7 \n" /* vext_s8(vzero, vinr0, 7); - 013579 */ - "ext v8.8b, v10.8b, v5.8B, #7 \n" /* vext_s8(vzero, vinr0, 7); - 013579 */ - - // r0 - "smull v14.8h, %[v1].8b, v0.8b \n" /* outr00 = 02468 * w01 */ - "smull v15.8h, %[v2].8b, v1.8b\n" /* outr00 += 13579 * w02 */ - "smull v16.8h, %[v0].8b, v6.8b\n" /* outr00 += 013579 * w00 */ - - "add %[din_ptr0], %[din_ptr0], #15 \n" - "add %[din_ptr1], %[din_ptr1], #15 \n" - "add %[din_ptr2], %[din_ptr2], #15 \n" - - // r1 - "smlal v14.8h, %[v4].8b, v2.8b \n" /* outr00 = 02468 * w01 */ - "smlal v15.8h, %[v5].8b, v3.8b\n" /* outr00 += 13579 * w02 */ - "smlal v16.8h, %[v3].8b, v7.8b\n" /* outr00 += 013579 * w00 */ - - "saddw v12.4s, v12.4s, v14.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v14.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v15.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v15.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v16.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v16.8h \n" /* v11 += outr00.high*/ - - // r2 - "smull v14.8h, %[v7].8b, v4.8b \n" /* outr00 = 02468 * w01 */ - "smull v15.8h, %[v8].8b, v5.8b\n" /* outr00 += 13579 * w02 */ - "smull v16.8h, %[v6].8b, v8.8b\n" /* outr00 += 013579 * w00 */ - - "ld2 {v0.8b - v1.8b}, [%[din_ptr0]], #16 \n" /*load - a00-a015 - to q0*/ - "ld2 {v2.8b - v3.8b}, [%[din_ptr1]], #16 \n" /* load - a00-a015 - to q0*/ - "ld2 {v4.8b - v5.8b}, [%[din_ptr2]], #16 \n" /*load - a00-a015 - to q0*/ - - "saddw v12.4s, v12.4s, v14.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v14.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v15.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v15.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v16.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v16.8h \n" /* v11 += outr00.high*/ - - "stp q12, q13, [%[ptr_out0]], #32 \n" /* store q10, q11 -> - ptr_out */ - - "ld1 {v12.4s}, [%[bias_val]] \n" /* dup v10, bias */ - "ld1 {v13.4s}, [%[bias_val]] \n" /* dup v10, bias */ - - "cmp %[cnt], #1 \n" - "blt 3f \n" - // mid - "1: \n" - "ld1 {v6.8b}, [%[din_ptr0]] \n" /*load a00-a015 to q0*/ - "ld1 {v7.8b}, [%[din_ptr1]] \n" /*load a00-a015 to q0*/ - "ld1 {v8.8b}, [%[din_ptr2]] \n" /*load a00-a015 to q0*/ - - "ext v9.8b, v0.8b, v6.8B, #1 \n" /* vext_s8(vzero, vinr0, 7); - 246810 */ - "ext v11.8b, v2.8b, v7.8B, #1 \n" /* vext_s8(vzero, vinr0, 7); - 246810 */ - "ext v14.8b, v4.8b, v8.8B, #1 \n" /* vext_s8(vzero, vinr0, 7); - 246810 */ - - // r0 - "smull v6.8h, %[v0].8b, v0.8b \n" /* outr00 = 02468 * w00 */ - "smull v7.8h, %[v1].8b, v1.8b\n" /* outr00 += 13579 * w01 */ - "smull v8.8h, %[v2].8b, v9.8b\n" /* outr00 += 246810 * w02 */ - - // r1 - "smlal v6.8h, %[v3].8b, v2.8b \n" /* outr00 = 
02468 * w00 */ - "smlal v7.8h, %[v4].8b, v3.8b\n" /* outr00 += 13579 * w01 */ - "smlal v8.8h, %[v5].8b, v11.8b\n" /* outr00 += 246810 * w02 */ - - "saddw v12.4s, v12.4s, v6.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v6.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v7.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v7.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v8.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v8.8h \n" /* v11 += outr00.high*/ - - // r2 - "smull v6.8h, %[v6].8b, v4.8b \n" /* outr00 = 02468 * w00 */ - "smull v7.8h, %[v7].8b, v5.8b\n" /* outr00 += 13579 * w01 */ - "smull v8.8h, %[v8].8b, v14.8b\n" /* outr00 += 246810 * w02 */ - - "ld2 {v0.8b - v1.8b}, [%[din_ptr0]], #16 \n" /*load - a00-a015 - to q0*/ - "ld2 {v2.8b - v3.8b}, [%[din_ptr1]], #16 \n" /* load - a00-a015 - to q0*/ - "ld2 {v4.8b - v5.8b}, [%[din_ptr2]], #16 \n" /*load - a00-a015 - to q0*/ - - "saddw v12.4s, v12.4s, v6.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v6.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v7.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v7.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v8.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v8.8h \n" /* v11 += outr00.high*/ - - "subs %[cnt], %[cnt], #1 \n" - - "stp q12, q13, [%[ptr_out0]], #32 \n" /* store q10, q11 -> - ptr_out */ - - "ld1 {v12.4s}, [%[bias_val]] \n" /* dup v10, bias */ - "ld1 {v13.4s}, [%[bias_val]] \n" /* dup v10, bias */ - "bne 1b \n" - // right - "3: \n" - "ld1 {v14.8b}, [%[vmask]], #8 \n" - "ld1 {v15.8b}, [%[vmask]] \n" - - "bif v0.8b, v10.8b, v14.8b \n" - "bif v1.8b, v10.8b, v15.8b \n" - "bif v2.8b, v10.8b, v14.8b \n" - "bif v3.8b, v10.8b, v15.8b \n" - "bif v4.8b, v10.8b, v14.8b \n" - "bif v5.8b, v10.8b, v15.8b \n" - - "ext v6.8b, v0.8b, v10.8B, #1 \n" /* vext_s8(vzero, vinr0, 7); - 2468.. */ - "ext v7.8b, v2.8b, v10.8B, #1 \n" /* vext_s8(vzero, vinr0, 7); - 2468..*/ - "ext v8.8b, v4.8b, v10.8B, #1 \n" /* vext_s8(vzero, vinr0, 7); - 2468.. 
*/ - - // r0 - "smull v14.8h, %[v0].8b, v0.8b \n" /* outr00 = 02468 * w00 */ - "smull v15.8h, %[v1].8b, v1.8b\n" /* outr00 += 13579 * w01 */ - "smull v16.8h, %[v2].8b, v6.8b\n" /* outr00 += 246810 * w02 */ - - // r1 - "smlal v14.8h, %[v3].8b, v2.8b \n" /* outr00 = 02468 * w00 */ - "smlal v15.8h, %[v4].8b, v3.8b\n" /* outr00 += 13579 * w01 */ - "smlal v16.8h, %[v5].8b, v7.8b\n" /* outr00 += 246810 * w02 */ - - "saddw v12.4s, v12.4s, v14.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v14.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v15.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v15.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v16.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v16.8h \n" /* v11 += outr00.high*/ - - // r2 - "smull v14.8h, %[v6].8b, v4.8b \n" /* outr00 = 02468 * w00 */ - "smull v15.8h, %[v7].8b, v5.8b\n" /* outr00 += 13579 * w01 */ - "smull v16.8h, %[v8].8b, v8.8b\n" /* outr00 += 246810 * w02 */ - - "ldp q0, q1, [%[ptr_out0]] \n" /* dup v10, bias */ - "ldp q9, q11, [%[rst_mask]] \n" /* dup v10, bias */ - - "saddw v12.4s, v12.4s, v14.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v14.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v15.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v15.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v16.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v16.8h \n" /* v11 += outr00.high*/ - - "bif v12.16b, v0.16b, v9.16b \n" - "bif v13.16b, v1.16b, v11.16b \n" - - "stp q12, q13, [%[ptr_out0]], #32 \n" /* store q10, q11 -> - ptr_out */ - - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [ptr_out0] "+r"(doutr0), - [vmask] "+r"(val_mask) - : [v0] "w"(wr00), - [v1] "w"(wr01), - [v2] "w"(wr02), - [v3] "w"(wr10), - [bias_val] "r"(vbias), - [v4] "w"(wr11), - [v5] "w"(wr12), - [v6] "w"(wr20), - [v7] "w"(wr21), - [v8] "w"(wr22), - [rst_mask] "r"(rmask) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16"); -#else - unsigned int* rst_mask = rmask; - int cnt = cnt_col; - // prefetch input - // store weights - asm volatile("vld1.8 {d0-d1}, [%[wei_ptr]] \n" - : - : [wei_ptr] "r"(wei_ptr) - : "memory"); - asm volatile( - // left - "pld [%[din_ptr0]] @ preload data\n" - "pld [%[din_ptr1]] @ preload data\n" - "pld [%[din_ptr2]] @ preload data\n" - "vdup.s8 d2, d0[0] @ d2 = w00, w00, w00, w00\n" - "vdup.s8 d3, d0[1] @ d3 = w01, w01, w01, w01\n" - "vdup.s8 d4, d0[2] @ d4 = w02, w02, w02, w02\n" - "vld2.8 {d12-d13}, [%[din_ptr0]] @ load din00= 0 2 4 6 8\n" // d10 = 0 2 4 6 - "vld2.8 {d14-d15}, [%[din_ptr1]] @ load din00= 0 2 4 6 8\n" // d12 = 0 2 4 6 - "vld2.8 {d16-d17}, [%[din_ptr2]] @ load din00= 0 2 4 6 8\n" // d14 = 0 2 4 6 - "vmov.u32 d11, #0 @ zero\n" - - "vdup.s8 d5, d0[3] @ d2 = w00, w00, w00, w00\n" - "vdup.s8 d6, d0[4] @ d3 = w01, w01, w01, w01\n" - "vdup.s8 d7, d0[5] @ d4 = w02, w02, w02, w02\n" - - "vext.8 d18, d11, d13, #7 @ ext \n" // d16 = -1 1 3 5 - "vext.8 d19, d11, d15, #7 @ ext \n" // d17 = -1 1 3 5 - "vext.8 d20, d11, d17, #7 @ ext \n" // d18 = -1 1 3 5 - - // r0 - "vmull.s8 q13, d12, d3 @ out0 = din0 * w01 \n" // q12 = d12 * w01 - "vmull.s8 q14, d13, d4 @ out1 = din0 * w02 \n" // q12 = d12 * w02 - "vmull.s8 q15, d18, d2 @ out2 = din0 * w00 \n" // q12 = d12 * w02 - - "vdup.s8 d8, d0[6] @ d2 = w00, w00, w00, w00\n" - "vdup.s8 d9, d0[7] @ d3 = w01, w01, w01, w01\n" - "vdup.s8 d10, 
d1[0] @ d4 = w02, w02, w02, w02\n" - - // out0 - "vdup.32 q11, %[bias] @ and \n" // q8 = - // vbias - "vdup.32 q12, %[bias] @ and \n" // q9 = - // vbias - - // r1 - "vmlal.s8 q13, d14, d6 @ out0 += din1 * w11 \n" // q12 = d12 * w11 - "vmlal.s8 q14, d15, d7 @ out1 += din1 * w12 \n" // q12 = d12 * w11 - "vmlal.s8 q15, d19, d5 @ out2 += din1 * w10 \n" // q12 = d12 * w11 - - "add %[din_ptr0], #15 @add \n" - - "vaddw.s16 q11, q11, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - "add %[din_ptr1], #15 @add \n" - - "vaddw.s16 q11, q11, d28 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d29 @addw \n" // out1_1 += - // vget_high_s16(out10) - "add %[din_ptr2], #15 @add \n" - - "vaddw.s16 q11, q11, d30 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d31 @addw \n" // out1_1 += - // vget_high_s16(out10) - - // r2 - "vmull.s8 q13, d16, d9 @ out0 += din1 * w21 \n" // q12 = d12 * w11 - "vmull.s8 q14, d17, d10 @ out1 += din1 * w22 \n" // q12 = d12 * w11 - "vmull.s8 q15, d20, d8 @ out2 += din1 * w20 \n" // q12 = d12 * w11 - - "pld [%[din_ptr0]] @ preload data\n" - "pld [%[din_ptr1]] @ preload data\n" - "pld [%[din_ptr2]] @ preload data\n" - - "vaddw.s16 q11, q11, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vaddw.s16 q11, q11, d28 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d29 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vaddw.s16 q11, q11, d30 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d31 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vst1.32 {d22-d23}, [%[dout_ptr1]]! @ store\n" - "cmp %[cnt], #1 \n" - "vst1.32 {d24-d25}, [%[dout_ptr1]]! @ store\n" - "blt 1f \n" - - // mid - "2: \n" - "vld2.8 {d12-d13}, [%[din_ptr0]]! @ load din00= 0 2 4 6 8\n" // d10 = 0 2 4 6 - "vld2.8 {d14-d15}, [%[din_ptr1]]! @ load din00= 0 2 4 6 8\n" // d12 = 0 2 4 6 - "vld2.8 {d16-d17}, [%[din_ptr2]]! 
@ load din00= 0 2 4 6 8\n" // d14 = 0 2 4 6 - - "vld1.8 {d21}, [%[din_ptr0]] @ load din00= 16 17\n" // d10 = 0 2 - // 4 6 - "vld1.8 {d22}, [%[din_ptr1]] @ load din00= 16 17\n" // d12 = 0 2 - // 4 6 - "vld1.8 {d23}, [%[din_ptr2]] @ load din00= 16 17\n" // d14 = 0 2 - // 4 6 - - "vext.8 d18, d12, d21, #1 @ ext din00 = 2 4 6 8\n" // d16 = 2 - // 4 6 8 - "vext.8 d19, d14, d22, #1 @ ext \n" // d17 = 2 4 6 8 - "vext.8 d20, d16, d23, #1 @ ext \n" // d18 = 2 4 6 8 - - // r0 - "vmull.s8 q13, d12, d2 @ out0 = din0 * w00 \n" // q12 = 0 2 4 6 - "vmull.s8 q14, d13, d3 @ out1 = din0 * w01 \n" // q12 = 1 3 5 7 - "vmull.s8 q15, d18, d4 @ out2 = din0 * w02 \n" // q12 = 2 4 6 8 - - // out0 - "vdup.32 q11, %[bias] @ and \n" // q8 = - // vbias - "vdup.32 q12, %[bias] @ and \n" // q9 = - // vbias - - // r1 - "vmlal.s8 q13, d14, d5 @ out0 += din1 * w10 \n" // q12 = 0 2 4 6 - "vmlal.s8 q14, d15, d6 @ out1 += din1 * w11 \n" // q12 = 1 3 5 7 - "vmlal.s8 q15, d19, d7 @ out2 += din1 * w12 \n" // q12 = 2 4 6 8 - - "pld [%[din_ptr0]] @ preload data\n" - "pld [%[din_ptr1]] @ preload data\n" - "pld [%[din_ptr2]] @ preload data\n" - - "vaddw.s16 q11, q11, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vaddw.s16 q11, q11, d28 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d29 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vaddw.s16 q11, q11, d30 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d31 @addw \n" // out1_1 += - // vget_high_s16(out10) - - // r2 - "vmull.s8 q13, d16, d8 @ out0 += din1 * w20 \n" // q12 = 0 2 4 6 - "vmull.s8 q14, d17, d9 @ out1 += din1 * w21 \n" // q12 = 1 3 5 7 - "vmull.s8 q15, d20, d10 @ out2 += din1 * w22 \n" // q12 = 2 4 6 8 - - "vaddw.s16 q11, q11, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vaddw.s16 q11, q11, d28 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d29 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vaddw.s16 q11, q11, d30 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d31 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vst1.32 {d22-d23}, [%[dout_ptr1]]! @ store\n" - - "subs %[cnt], #1 \n" - "vst1.32 {d24-d25}, [%[dout_ptr1]]! @ store\n" - "bne 2b \n" - // right - "1: \n" - "cmp %[size_pad_right], #1 \n" - "blt 3f \n" - "vld2.8 {d12-d13}, [%[din_ptr0]]! @ load din00= 0 2 4 6 8\n" // d10 = 0 2 4 6 - "vld2.8 {d14-d15}, [%[din_ptr1]]! @ load din00= 0 2 4 6 8\n" // d12 = 0 2 4 6 - "vld2.8 {d16-d17}, [%[din_ptr2]]! 
@ load din00= 0 2 4 6 8\n" // d14 = 0 2 4 6 - "vld1.8 {d28-d29}, [%[mask]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - - // out0 - "vdup.32 q11, %[bias] @ and \n" // q8 = vbias - "vdup.32 q12, %[bias] @ and \n" // q9 = vbias - - "vbif.8 d12, d11, d28 @ bit select, deal with right pad\n" - "vbif.8 d13, d11, d29 @ bit select, deal with right pad\n" - - "vbif.8 d14, d11, d28 @ bit select, deal with right pad\n" - "vbif.8 d15, d11, d29 @ bit select, deal with right pad\n" - - "vbif.8 d16, d11, d28 @ bit select, deal with right pad\n" - "vbif.8 d17, d11, d29 @ bit select, deal with right pad\n" - - "vext.8 d18, d12, d11, #1 @ ext din00 = 2 4 6 8\n" // d16 = -1 - // 1 3 5 - "vext.8 d19, d14, d11, #1 @ ext \n" // d17 = -1 1 3 5 - "vext.8 d20, d16, d11, #1 @ ext \n" // d18 = -1 1 3 5 - - // r0 - "vmull.s8 q13, d12, d2 @ out0 = din0 * w00 \n" // q12 = 0 2 4 6 - "vmull.s8 q14, d13, d3 @ out1 = din0 * w01 \n" // q12 = 1 3 5 7 - "vmull.s8 q15, d18, d4 @ out2 = din0 * w02 \n" // q12 = 2 4 6 8 - - // r1 - "vmlal.s8 q13, d14, d5 @ out0 += din1 * w11 \n" // q12 = 0 2 4 6 - "vmlal.s8 q14, d15, d6 @ out1 += din1 * w12 \n" // q12 = 1 3 5 7 - "vmlal.s8 q15, d19, d7 @ out2 += din1 * w10 \n" // q12 = 2 4 6 8 - - "vld1.32 {d12-d13}, [%[dout_ptr1]]! @ load din00= 0 1 2 3 4 5 6 " - "7 8 9\n" - "vld1.32 {d14-d15}, [%[dout_ptr1]] @ load din00= 0 1 2 3 4 5 6 " - "7 8 9\n" - - "vaddw.s16 q11, q11, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "sub %[dout_ptr1], #16 @ sub \n" - - "vaddw.s16 q11, q11, d28 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d29 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vaddw.s16 q11, q11, d30 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d31 @addw \n" // out1_1 += - // vget_high_s16(out10) - - // r2 - "vmull.s8 q13, d16, d8 @ out0 += din1 * w11 \n" // q12 = 0 2 4 6 - "vmull.s8 q14, d17, d9 @ out1 += din1 * w12 \n" // q12 = 1 3 5 7 - "vmull.s8 q15, d20, d10 @ out2 += din1 * w10 \n" // q12 = 2 4 6 8 - - "vld1.32 {d2-d3}, [%[rs_mask]]! @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - "vld1.32 {d4-d5}, [%[rs_mask]] @ load din00= 0 1 2 3 4 5 6 7 8 " - "9\n" - - "vaddw.s16 q11, q11, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vaddw.s16 q11, q11, d28 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d29 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vaddw.s16 q11, q11, d30 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d31 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vbif q11, q6, q1 @ bit select, deal with right pad\n" - "vbif q12, q7, q2 @ bit select, deal with right pad\n" - - "vst1.32 {d22-d23}, [%[dout_ptr1]]! @ store\n" - "vst1.32 {d24-d25}, [%[dout_ptr1]]! 
@ store\n" - "3: \n" - - : [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [dout_ptr1] "+r"(doutr0), - [cnt] "+r"(cnt), - [bias] "+r"(bias_val), - [rs_mask] "+r"(rst_mask) - : [mask] "r"(vmask), [size_pad_right] "r"(size_pad_right) - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif - dout_ptr += w_out; - } - } - } -} -// w_in <= 16 -void conv_depthwise_3x3s2p1_bias_s_int8(int* dout, - const signed char* din, - const signed char* weights, - const int* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - // printf("3x3s2 mult height \n"); - //! pad is done implicit - // const char zero[8] = {0, 0, 0, 0, 0, 0, 0, 0}; - //! for 4x6 convolution window - const unsigned char right_pad_idx[16] = { - 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15}; - const unsigned int right_pad_rst[8] = {0, 1, 2, 3, 4, 5, 6, 7}; - - // printf("conv3x3_dw start \n"); - signed char* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(signed char)); - int* write_ptr = - reinterpret_cast(ctx->workspace_data()) + w_out; - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - int w_stride = 9; - - unsigned int size_pad_right = (unsigned int)(w_in); - - uint8x8_t vmask_rp1 = - vcgt_u8(vdup_n_u8(size_pad_right), vld1_u8(right_pad_idx)); - uint8x8_t vmask_rp2 = - vcgt_u8(vdup_n_u8(size_pad_right), vld1_u8(right_pad_idx + 8)); - unsigned int rst_remain = (unsigned int)w_out; - uint32x4_t vmask_result1 = - vcgtq_u32(vdupq_n_u32(rst_remain), vld1q_u32(right_pad_rst)); - uint32x4_t vmask_result2 = - vcgtq_u32(vdupq_n_u32(rst_remain), vld1q_u32(right_pad_rst + 4)); - - uint8x16_t vmask_rp = - vcgtq_u8(vdupq_n_u8(size_pad_right), vld1q_u8(right_pad_idx)); - unsigned char vmask[16]; - vst1q_u8(vmask, vmask_rp); - - unsigned int rmask[8]; - vst1q_u32(rmask, vmask_result1); - vst1q_u32(rmask + 4, vmask_result2); - - int8x8_t vzero = vdup_n_s8(0); - for (int n = 0; n < num; ++n) { - const signed char* din_batch = din + n * ch_in * size_in_channel; - int* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int c = 0; c < ch_in; c++) { - int* dout_ptr = dout_batch + c * size_out_channel; - - const signed char* din_ch_ptr = din_batch + c * size_in_channel; - - int bias_val = flag_bias ? bias[c] : 0; - - const signed char* wei_ptr = weights + c * w_stride; -#ifdef __aarch64__ - int vbias[4] = {bias_val, bias_val, bias_val, bias_val}; - - int8x8_t wr00 = vdup_n_s8(wei_ptr[0]); - int8x8_t wr10 = vdup_n_s8(wei_ptr[3]); - int8x8_t wr20 = vdup_n_s8(wei_ptr[6]); - - int8x8_t wr01 = vdup_n_s8(wei_ptr[1]); - int8x8_t wr11 = vdup_n_s8(wei_ptr[4]); - int8x8_t wr21 = vdup_n_s8(wei_ptr[7]); - - int8x8_t wr02 = vdup_n_s8(wei_ptr[2]); - int8x8_t wr12 = vdup_n_s8(wei_ptr[5]); - int8x8_t wr22 = vdup_n_s8(wei_ptr[8]); -#endif - int* doutr0 = nullptr; - - const signed char* dr0 = din_ch_ptr; - const signed char* dr1 = dr0 + w_in; - const signed char* dr2 = dr1 + w_in; - - const signed char* din_ptr0 = nullptr; - const signed char* din_ptr1 = nullptr; - const signed char* din_ptr2 = nullptr; - - for (int i = 0; i < h_in; i += 2) { - //! 
process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - - doutr0 = dout_ptr; - int out_buf1[8]; - - if (i == 0) { - din_ptr0 = zero_ptr; - din_ptr1 = dr0; - din_ptr2 = dr1; - dr0 = dr1; - dr1 = dr2; - dr2 = dr1 + w_in; - } else { - dr0 = dr2; - dr1 = dr2 + w_in; - dr2 = dr1 + w_in; - } - //! process bottom pad - if (i + 2 > h_in) { - switch (i + 2 - h_in) { - case 2: - din_ptr1 = zero_ptr; - case 1: - din_ptr2 = zero_ptr; - default: - break; - } - } -#ifdef __aarch64__ - unsigned int* rst_mask = rmask; - unsigned char* val_mask = vmask; - asm volatile( - "PRFM PLDL1KEEP, [%[din_ptr0]] \n" - "PRFM PLDL1KEEP, [%[din_ptr1]] \n" - "PRFM PLDL1KEEP, [%[din_ptr2]] \n" - "movi v16.4s, #0x0\n" - // left - "ld1 {v10.8b}, [%[vmask]], #8 \n" - "ld1 {v11.8b}, [%[vmask]] \n" - "ld2 {v0.8b - v1.8b}, [%[din_ptr0]] \n" /*load a00-a015 - to q0*/ - "ld2 {v2.8b - v3.8b}, [%[din_ptr1]] \n" /* load a00-a015 - to q0*/ - "ld2 {v4.8b - v5.8b}, [%[din_ptr2]] \n" /*load a00-a015 - to q0*/ - - "bif v0.8b, v16.8b, v10.8b \n" - "bif v1.8b, v16.8b, v11.8b \n" - "bif v2.8b, v16.8b, v10.8b \n" - "bif v3.8b, v16.8b, v11.8b \n" - "bif v4.8b, v16.8b, v10.8b \n" - "bif v5.8b, v16.8b, v11.8b \n" - - "ld1 {v12.4s}, [%[bias_val]] \n" /* dup v10, bias*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /* dup v10, bias */ - - "ext v6.8b, v16.8b, v1.8B, #7 \n" /* vext_s8(vzero, vinr0, 7); - 013579 */ - "ext v7.8b, v16.8b, v3.8B, #7 \n" /* vext_s8(vzero, vinr0, 7); - 013579 */ - "ext v8.8b, v16.8b, v5.8B, #7 \n" /* vext_s8(vzero, vinr0, 7); - 013579 */ - - // r0 - "smull v17.8h, %[v1].8b, v0.8b \n" /* outr00 = 02468 * w01 */ - "smull v18.8h, %[v2].8b, v1.8b\n" /* outr00 += 13579 * w02 */ - "smull v19.8h, %[v0].8b, v6.8b\n" /* outr00 += 013579 * w00 */ - - // "ldp q0, q1, [%[ptr_out0]] \n" /* dup v10, - // bias */ "ldp q10, q11, [%[rst_mask]] \n" /* - // dup v10, bias */ - - // r1 - "smlal v17.8h, %[v4].8b, v2.8b \n" /* outr00 = 02468 * w01 */ - "smlal v18.8h, %[v5].8b, v3.8b\n" /* outr00 += 13579 * w02 */ - "smlal v19.8h, %[v3].8b, v7.8b\n" /* outr00 += 013579 * w00 */ - - "saddw v12.4s, v12.4s, v17.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v17.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v18.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - // r2 - "smull v17.8h, %[v7].8b, v4.8b \n" /* outr00 = 02468 * w01 */ - "smull v18.8h, %[v8].8b, v5.8b\n" /* outr00 += 13579 * w02 */ - "smull v19.8h, %[v6].8b, v8.8b\n" /* outr00 += 013579 * w00 */ - - "saddw v12.4s, v12.4s, v17.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v17.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v18.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - // "bif v12.16b, v0.16b, v10.16b \n" - // "bif v13.16b, v1.16b, v11.16b \n" - - "stp q12, q13, [%[ptr_out0]] \n" /* store q10, q11 -> ptr_out - */ - : [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [vmask] "+r"(val_mask) - : [v0] "w"(wr00), - [v1] "w"(wr01), - [v2] "w"(wr02), - [v3] "w"(wr10), - [bias_val] "r"(vbias), - [v4] "w"(wr11), - [v5] "w"(wr12), - [v6] "w"(wr20), - [v7] "w"(wr21), - [v8] "w"(wr22), - [rst_mask] "r"(rmask), - [ptr_out0] "r"(out_buf1) - : "cc", - "memory", - "v0", - "v1", - 
"v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20"); -#else - unsigned int* rst_mask = rmask; - // prefetch input - // store weights - asm volatile("vld1.8 {d0-d1}, [%[wei_ptr]] \n" - : - : [wei_ptr] "r"(wei_ptr) - : "memory"); - asm volatile( - // left - "pld [%[din_ptr0]] @ preload data\n" - "pld [%[din_ptr1]] @ preload data\n" - "pld [%[din_ptr2]] @ preload data\n" - "vdup.s8 d2, d0[0] @ d2 = w00, w00, w00, w00\n" - "vdup.s8 d3, d0[1] @ d3 = w01, w01, w01, w01\n" - "vdup.s8 d4, d0[2] @ d4 = w02, w02, w02, w02\n" - "vld2.8 {d12-d13}, [%[din_ptr0]] @ load din00= 0 2 4 6 8\n" // d10 = 0 2 4 6 - "vld2.8 {d14-d15}, [%[din_ptr1]] @ load din00= 0 2 4 6 8\n" // d12 = 0 2 4 6 - "vld2.8 {d16-d17}, [%[din_ptr2]] @ load din00= 0 2 4 6 8\n" // d14 = 0 2 4 6 - "vld1.8 {d28-d29}, [%[mask]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - "vmov.u32 d11, #0 @ zero\n" - - "vdup.s8 d5, d0[3] @ d2 = w00, w00, w00, w00\n" - "vdup.s8 d6, d0[4] @ d3 = w01, w01, w01, w01\n" - "vdup.s8 d7, d0[5] @ d4 = w02, w02, w02, w02\n" - - "vbif.8 d12, d11, d28 @ bit select, deal with right pad\n" - "vbif.8 d13, d11, d29 @ bit select, deal with right pad\n" - - "vbif.8 d14, d11, d28 @ bit select, deal with right pad\n" - "vbif.8 d15, d11, d29 @ bit select, deal with right pad\n" - - "vbif.8 d16, d11, d28 @ bit select, deal with right pad\n" - "vbif.8 d17, d11, d29 @ bit select, deal with right pad\n" - - "vext.8 d18, d11, d13, #7 @ ext \n" // d16 = -1 1 3 5 - "vext.8 d19, d11, d15, #7 @ ext \n" // d17 = -1 1 3 5 - "vext.8 d20, d11, d17, #7 @ ext \n" // d18 = -1 1 3 5 - - // "pld [%[dout_ptr1]] @ preload data\n" - - // r0 - "vmull.s8 q13, d12, d3 @ out0 = din0 * w01 \n" // q12 = d12 * w01 - "vmull.s8 q14, d13, d4 @ out1 = din0 * w02 \n" // q12 = d12 * w02 - "vmull.s8 q15, d18, d2 @ out2 = din0 * w00 \n" // q12 = d12 * w02 - - "vdup.s8 d8, d0[6] @ d2 = w00, w00, w00, w00\n" - "vdup.s8 d9, d0[7] @ d3 = w01, w01, w01, w01\n" - "vdup.s8 d10, d1[0] @ d4 = w02, w02, w02, w02\n" - - // out0 - "vdup.32 q11, %[bias] @ and \n" // q8 = - // vbias - "vdup.32 q12, %[bias] @ and \n" // q9 = - // vbias - - // r1 - "vmlal.s8 q13, d14, d6 @ out0 += din1 * w11 \n" // q12 = d12 * w11 - "vmlal.s8 q14, d15, d7 @ out1 += din1 * w12 \n" // q12 = d12 * w11 - "vmlal.s8 q15, d19, d5 @ out2 += din1 * w10 \n" // q12 = d12 * w11 - - // "vld1.32 {d12-d13}, [%[dout_ptr1]]! @ load din00= 0 1 2 3 4 5 - // 6 7 8 9\n" "vld1.32 {d14-d15}, [%[dout_ptr1]] @ load din00= 0 - // 1 2 3 4 5 6 7 8 9\n" - - "vaddw.s16 q11, q11, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vaddw.s16 q11, q11, d28 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d29 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vaddw.s16 q11, q11, d30 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d31 @addw \n" // out1_1 += - // vget_high_s16(out10) - - // r2 - "vmull.s8 q13, d16, d9 @ out0 += din1 * w21 \n" // q12 = d12 * w11 - "vmull.s8 q14, d17, d10 @ out1 += din1 * w22 \n" // q12 = d12 * w11 - "vmull.s8 q15, d20, d8 @ out2 += din1 * w20 \n" // q12 = d12 * w11 - - // "vld1.32 {d2-d3}, [%[rs_mask]]! 
@ load din00= 0 1 2 3 4 5 6 7 - // 8 9\n" "vld1.32 {d4-d5}, [%[rs_mask]] @ load din00= 0 1 2 3 4 - // 5 6 7 8 9\n" - - // "sub %[dout_ptr1], #16 @ sub \n" - - "vaddw.s16 q11, q11, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vaddw.s16 q11, q11, d28 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d29 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vaddw.s16 q11, q11, d30 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d31 @addw \n" // out1_1 += - // vget_high_s16(out10) - - // "vbif q11, q6, q1 @ bit select, deal with right pad\n" - // "vbif q12, q7, q2 @ bit select, deal with right pad\n" - - "vst1.32 {d22-d25}, [%[dout_ptr1]] @ store\n" - // "vst1.32 {d24-d25}, [%[dout_ptr1]]! @ store\n" - : [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [bias] "+r"(bias_val), - [rs_mask] "+r"(rst_mask) - : [mask] "r"(vmask), - [size_pad_right] "r"(size_pad_right), - [dout_ptr1] "r"(out_buf1) - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif - for (int w = 0; w < w_out; ++w) { - *doutr0++ = out_buf1[w]; - } - dout_ptr += w_out; - } - } - } -} - -// relu -void conv_depthwise_3x3s1p1_bias_relu_int8(int* dout, - const signed char* din, - const signed char* weights, - const int* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - // printf("3x3s1 mult height \n"); - //! pad is done implicit - const char zero[8] = {0, 0, 0, 0, 0, 0, 0, 0}; - //! for 4x6 convolution window - const unsigned char right_pad_idx[16] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; - const unsigned int right_pad_rst[8] = {0, 1, 2, 3, 4, 5, 6, 7}; - - // printf("conv3x3_dw start \n"); - signed char* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(signed char)); - int* write_ptr = - reinterpret_cast(ctx->workspace_data()) + w_in; - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - int w_stride = 9; - - int tile_w = (w_in + 7) >> 3; - int tile_h = (h_out + 1) >> 1; - int cnt_col = tile_w - 2; - - unsigned int size_pad_right = (unsigned int)(w_in - 7 - (cnt_col << 3)); - - int size_pad_bottom = h_out % 2; - - uint8x8_t vmask_rp1 = - vcgt_u8(vdup_n_u8(size_pad_right), vld1_u8(right_pad_idx)); - uint8x8_t vmask_rp2 = - vcgt_u8(vdup_n_u8(size_pad_right), vld1_u8(right_pad_idx + 8)); - unsigned int rst_remain = (unsigned int)(w_out - ((cnt_col + 1) << 3)); - uint32x4_t vmask_result1 = - vcgtq_u32(vdupq_n_u32(rst_remain), vld1q_u32(right_pad_rst)); - uint32x4_t vmask_result2 = - vcgtq_u32(vdupq_n_u32(rst_remain), vld1q_u32(right_pad_rst + 4)); - - int8x8_t vzero = vdup_n_s8(0); - int32x4_t vzero_32 = vdupq_n_s32(0); - - uint8x16_t vmask_rp = - vcgtq_u8(vdupq_n_u8(size_pad_right), vld1q_u8(right_pad_idx)); - // uint8x8_t vmask_rp2 = vcgt_u8(vdup_n_u8(size_pad_right), - // vld1_u8(right_pad_idx + 8)); - unsigned char vmask[16]; - vst1q_u8(vmask, vmask_rp); - - unsigned int rmask[8]; - vst1q_u32(rmask, vmask_result1); - vst1q_u32(rmask + 4, vmask_result2); - - for (int n = 0; n < num; ++n) { - const signed char* din_batch = din + n * ch_in * size_in_channel; - int* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int c = 0; c < ch_in; c++) { - int* dout_ptr = 
dout_batch + c * size_out_channel; - - const signed char* din_ch_ptr = din_batch + c * size_in_channel; - - int bias_val = flag_bias ? bias[c] : 0; - - const signed char* wei_ptr = weights + c * w_stride; -#ifdef __aarch64__ - int vbias[4] = {bias_val, bias_val, bias_val, bias_val}; - int8x8_t wr00 = vdup_n_s8(wei_ptr[0]); - int8x8_t wr10 = vdup_n_s8(wei_ptr[3]); - int8x8_t wr20 = vdup_n_s8(wei_ptr[6]); - - int8x8_t wr01 = vdup_n_s8(wei_ptr[1]); - int8x8_t wr11 = vdup_n_s8(wei_ptr[4]); - int8x8_t wr21 = vdup_n_s8(wei_ptr[7]); - - int8x8_t wr02 = vdup_n_s8(wei_ptr[2]); - int8x8_t wr12 = vdup_n_s8(wei_ptr[5]); - int8x8_t wr22 = vdup_n_s8(wei_ptr[8]); -#endif - - int* doutr0 = nullptr; - int* doutr1 = nullptr; - - const signed char* dr0 = din_ch_ptr; - const signed char* dr1 = dr0 + w_in; - const signed char* dr2 = dr1 + w_in; - const signed char* dr3 = dr2 + w_in; - - const signed char* din_ptr0 = nullptr; - const signed char* din_ptr1 = nullptr; - const signed char* din_ptr2 = nullptr; - const signed char* din_ptr3 = nullptr; - - for (int i = 0; i < h_in; i += 2) { - //! process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - din_ptr3 = dr3; - - doutr0 = dout_ptr; - doutr1 = doutr0 + w_out; - unsigned int* rst_mask = rmask; - unsigned char* val_mask = vmask; - - if (i == 0) { - din_ptr0 = zero_ptr; - din_ptr1 = dr0; - din_ptr2 = dr1; - din_ptr3 = dr2; - dr0 = dr1; - dr1 = dr2; - dr2 = dr3; - dr3 = dr2 + w_in; - } else { - dr0 = dr2; - dr1 = dr3; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - } - //! process bottom pad - if (i + 3 > h_in) { - switch (i + 3 - h_in) { - case 3: - din_ptr1 = zero_ptr; - case 2: - din_ptr2 = zero_ptr; - case 1: - din_ptr3 = zero_ptr; - default: - break; - } - } - //! process bottom remain - if (i + 2 > h_out) { - doutr1 = write_ptr; - } - int cnt = cnt_col; -#ifdef __aarch64__ - asm volatile( - "PRFM PLDL1KEEP, [%[din_ptr0]] \n" - "PRFM PLDL1KEEP, [%[din_ptr1]] \n" - "PRFM PLDL1KEEP, [%[din_ptr2]] \n" - "PRFM PLDL1KEEP, [%[din_ptr3]] \n" - "movi v21.4s, #0x0\n" /* out0 = 0 */ - // left - "ld1 {v0.8b}, [%[din_ptr0]], #8 \n" /* load - a00-a015 - to - q0*/ - "ld1 {v2.8b}, [%[din_ptr1]], #8 \n" /* load - a00-a015 - to - q0*/ - "ld1 {v1.8b}, [%[din_ptr0]] \n" /* load - a00-a015 to - q0*/ - "ld1 {v3.8b}, [%[din_ptr1]] \n" /* load - a00-a015 to - q0*/ - - "ld1 {v10.4s}, [%[bias_val]] \n" /* dup v10, bias */ - "ld1 {v11.4s}, [%[bias_val]] \n" /* dup v10, bias */ - "ld1 {v12.4s}, [%[bias_val]] \n" /* dup v10, bias */ - "ld1 {v13.4s}, [%[bias_val]] \n" /* dup v10, bias */ - - // r0 - "smull v18.8h, %[v1].8b, v0.8b \n" /* outr00 = 01234567 * w01 - */ - - "ext v4.8b, v21.8b, v0.8B, #7 \n" /* vext_s8(vzero, vinr0, 7); - 00123456 */ - "ext v5.8b, v0.8b, v1.8B, #1 \n" /* vext_s8(vinr0, vinr0_1, - 1); 12345678 */ - - "ld1 {v6.8b}, [%[din_ptr2]], #8 \n" /* load - a00-a015 - to - q0*/ - "ld1 {v8.8b}, [%[din_ptr3]], #8 \n" /* load - a00-a015 - to - q0*/ - - "smlal v18.8h, %[v0].8b, v4.8b\n" /* outr00 += 00123456 * w00 */ - - "ld1 {v7.8b}, [%[din_ptr2]] \n" /* load - a00-a015 - to q0*/ - "ld1 {v9.8b}, [%[din_ptr3]] \n" /* load - a00-a015 - to q0*/ - - "sub %[din_ptr0], %[din_ptr0], #1 \n" - "sub %[din_ptr1], %[din_ptr1], #1 \n" - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v2].8b, v5.8b\n" /* outr00 += 12345678 * w02 */ - - "ext v4.8b, v21.8b, v2.8b, #7 \n" /* vext_s8(vzero, vinr0, 7); - 00123456 */ - "ext v5.8b, v2.8b, v3.8b, #1 \n" /* vext_s8(vinr0, vinr0_1, - 1); 
12345678 */ - - // r1 - "sub %[din_ptr2], %[din_ptr2], #1 \n" - "sub %[din_ptr3], %[din_ptr3], #1 \n" - - "smull v19.8h, %[v1].8b, v2.8b \n" /* outr10 += 01234567 * w11 - */ - "smlal v18.8h, %[v4].8b, v2.8b \n" /* outr00 += 01234567 * w11 - */ - - "ext v14.8b, v21.8b, v6.8b, #7 \n" /* vext_s8(vzero, vinr0, - 7); 00123456 */ - "ext v15.8b, v6.8b, v7.8b, #1 \n" /* vext_s8(vinr0, vinr0_1, - 1); 12345678 */ - - "smlal v19.8h, %[v0].8b, v4.8b \n" /* outr00 += 01234567 * w11 - */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - "smull v18.8h, %[v3].8b, v4.8b \n" /* outr00 += 001234567 * w10 - */ - - "ld1 {v0.8b}, [%[din_ptr0]], #8 \n" /* load - a00-a015 - to - q0*/ - "ld1 {v2.8b}, [%[din_ptr1]], #8 \n" /* load - a00-a015 - to - q0*/ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smull v19.8h, %[v2].8b, v5.8b \n" /* outr00 += 01234567 * w11 - */ - "smlal v18.8h, %[v5].8b, v5.8b \n" /* outr00 += 12345678 * w12 - */ - - // r2 - "ld1 {v1.8b}, [%[din_ptr0]] \n" /* load - a00-a015 to - q0*/ - "ld1 {v3.8b}, [%[din_ptr1]] \n" /* load - a00-a015 to - q0*/ - - "smlal v19.8h, %[v4].8b, v6.8b \n" /* outr10 += 01234567 * w11 - */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - "smull v18.8h, %[v7].8b, v6.8b \n" /* outr00 += 01234567 * w11 - */ - - "ext v4.8b, v21.8b, v8.8b, #7 \n" /* vext_s8(vzero, vinr0, 7); - 00123456 */ - "ext v5.8b, v8.8b, v9.8b, #1 \n" /* vext_s8(vinr0, vinr0_1, - 1); 12345678 */ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smull v19.8h, %[v3].8b, v14.8b \n" /* outr10 += 01234567 * w11 - */ - "smlal v18.8h, %[v6].8b, v14.8b \n" /* outr00 += 01234567 * w11 - */ - - "ld1 {v6.8b}, [%[din_ptr2]], #8 \n" /* load - a00-a015 - to - q0*/ - - "smlal v19.8h, %[v5].8b, v15.8b \n" /* outr10 += 01234567 * w11 - */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v8].8b, v15.8b \n" /* outr00 += 01234567 * w11 - */ - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - // r3 - "smull v19.8h, %[v7].8b, v8.8b \n" /* outr00 += 01234567 * w11 - */ - - "ld1 {v8.8b}, [%[din_ptr3]], #8 \n" /* load - a00-a015 - to - q0*/ - - "ld1 {v7.8b}, [%[din_ptr2]] \n" /* load - a00-a015 to - q0*/ - "ld1 {v9.8b}, [%[din_ptr3]] \n" /* load - a00-a015 to - q0*/ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smlal v19.8h, %[v6].8b, v4.8b \n" /* outr00 += 01234567 * - w11 */ - - "smax v10.4s, v10.4s, v21.4s \n" /* relu*/ - "smax v11.4s, v11.4s, v21.4s \n" /* relu*/ - - "stp q10, q11, [%[ptr_out0]], #32 \n" /* store q10, q11 -> - ptr_out */ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smull v19.8h, %[v8].8b, v5.8b \n" /* outr00 += 01234567 * - w11 */ - - "ld1 {v10.4s}, [%[bias_val]] \n" /* dup v10, bias */ - "ld1 {v11.4s}, [%[bias_val]] \n" /* dup v10, bias */ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smax v12.4s, v12.4s, v21.4s \n" /* relu*/ - "smax v13.4s, v13.4s, v21.4s \n" /* relu*/ - - "stp q12, q13, 
[%[ptr_out1]], #32 \n" /* store q10, q11 -> - ptr_out */ - - "ld1 {v12.4s}, [%[bias_val]] \n" /* dup v10, bias */ - "ld1 {v13.4s}, [%[bias_val]] \n" /* dup v10, bias */ - - "cmp %[cnt], #1 \n" - "blt 3f \n" - // mid - "1: \n" - "ext v4.8b, v0.8B, v1.8b, #1 \n" /*12345678 */ - "ext v5.8b, v0.8b, v1.8B, #2 \n" /*23456789 */ - - // r0 - "smull v18.8h, %[v0].8b, v0.8b \n" /* outr00 = 01234567 * w00 - */ - - "ext v14.8b, v2.8B, v3.8b, #1 \n" /*12345678 */ - "ext v15.8b, v2.8b, v3.8B, #2 \n" /*23456789 */ - - "smlal v18.8h, %[v1].8b, v4.8b\n" /* outr00 += 12345678 * w01 */ - - "ext v16.8b, v6.8B, v7.8b, #1 \n" /*12345678 */ - "ext v17.8b, v6.8b, v7.8B, #2 \n" /*23456789 */ - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v2].8b, v5.8b\n" /* outr00 += 23456789 * w02 */ - - // r1 - "ext v4.8b, v8.8B, v9.8b, #1 \n" /*12345678 */ - "ext v5.8b, v8.8b, v9.8B, #2 \n" /*23456789 */ - - "smull v19.8h, %[v0].8b, v2.8b \n" /* outr00 = 01234567 * w00 - */ - "smlal v18.8h, %[v3].8b, v2.8b \n" /* outr00 = 01234567 * w00 - */ - - "ld1 {v0.8b}, [%[din_ptr0]], #8 \n" /* load - a00-a015 - to - q0*/ - "ld1 {v2.8b}, [%[din_ptr1]], #8 \n" /* load - a00-a015 - to - q0*/ - - "smlal v19.8h, %[v1].8b, v14.8b\n" /* outr00 += 12345678 * w01 */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v4].8b, v14.8b\n" /* outr00 += 12345678 * w01 */ - - "ld1 {v1.8b}, [%[din_ptr0]] \n" /* load - a00-a015 - to q0*/ - "ld1 {v3.8b}, [%[din_ptr1]] \n" /* load - a00-a015 - to q0*/ - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smull v19.8h, %[v2].8b, v15.8b\n" /* outr00 += 23456789 * w02 */ - "smlal v18.8h, %[v5].8b, v15.8b\n" /* outr00 += 12345678 * w01 */ - - // r2 - "smlal v19.8h, %[v3].8b, v6.8b \n" /* outr00 = 01234567 * w00 - */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v6].8b, v6.8b \n" /* outr00 = 01234567 * w00 - */ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smull v19.8h, %[v4].8b, v16.8b\n" /* outr00 += 12345678 * w01 */ - "smlal v18.8h, %[v7].8b, v16.8b\n" /* outr00 += 12345678 * w01 */ - - "smlal v19.8h, %[v5].8b, v17.8b\n" /* outr00 += 23456789 * w02 */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v8].8b, v17.8b\n" /* outr00 += 12345678 * w01 */ - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - // r3 - "smull v19.8h, %[v6].8b, v8.8b \n" /* outr00 = 01234567 * w00 - */ - - "ld1 {v6.8b}, [%[din_ptr2]], #8 \n" /* load - a00-a015 - to - q0*/ - "ld1 {v8.8b}, [%[din_ptr3]], #8 \n" /* load - a00-a015 - to - q0*/ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smlal v19.8h, %[v7].8b, v4.8b\n" /* outr00 += 12345678 * w01 */ - - "ld1 {v7.8b}, [%[din_ptr2]] \n" /* load - a00-a015 - to q0*/ - "ld1 {v9.8b}, [%[din_ptr3]] \n" /* load - a00-a015 - to q0*/ - - "smax v10.4s, v10.4s, v21.4s \n" /* relu*/ - "smax v11.4s, v11.4s, v21.4s \n" /* relu*/ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += 
outr00.high*/ - - "smull v19.8h, %[v8].8b, v5.8b\n" /* outr00 += 23456789 * w02 */ - - "stp q10, q11, [%[ptr_out0]], #32 \n" /* store q10, q11 -> - ptr_out */ - - "ld1 {v10.4s}, [%[bias_val]] \n" /* dup v10, bias */ - "ld1 {v11.4s}, [%[bias_val]] \n" /* dup v10, bias */ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "subs %[cnt], %[cnt], #1 \n" - - "smax v12.4s, v12.4s, v21.4s \n" /* relu*/ - "smax v13.4s, v13.4s, v21.4s \n" /* relu*/ - - "stp q12, q13, [%[ptr_out1]], #32 \n" /* store q10, q11 -> - ptr_out */ - - "ld1 {v12.4s}, [%[bias_val]] \n" /* dup v10, bias */ - "ld1 {v13.4s}, [%[bias_val]] \n" /* dup v10, bias */ - - "bne 1b \n" - // right - "3: \n" - "ld1 {v14.8b}, [%[vmask]], #8 \n" - "ld1 {v15.8b}, [%[vmask]] \n" - - "bif v0.8b, v21.8b, v14.8b \n" - "bif v1.8b, v21.8b, v15.8b \n" - "bif v2.8b, v21.8b, v14.8b \n" - "bif v3.8b, v21.8b, v15.8b \n" - - "ext v4.8b, v0.8b, v1.8b, #1 \n" - "ext v5.8b, v0.8b, v1.8b, #2 \n" - - // r0 - "smull v18.8h, %[v0].8b, v0.8b \n" /* outr00 = 01234567 * w00 - */ - - "ext v16.8b, v2.8b, v3.8b, #1 \n" - "ext v17.8b, v2.8b, v3.8b, #2 \n" - - "bif v6.8b, v21.8b, v14.8b \n" - "bif v7.8b, v21.8b, v15.8b \n" - - "smlal v18.8h, %[v1].8b, v4.8b \n" /* outr00 = 01234567 * w00 - */ - - "bif v8.8b, v21.8b, v14.8b \n" - "bif v9.8b, v21.8b, v15.8b \n" - - "ext v20.8b, v6.8b, v7.8b, #1 \n" - "ext v22.8b, v6.8b, v7.8b, #2 \n" - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v2].8b, v5.8b \n" /* outr00 = 01234567 * w00 - */ - - // r1 - "ext v4.8b, v8.8b, v9.8b, #1 \n" - "ext v5.8b, v8.8b, v9.8b, #2 \n" - - "smull v19.8h, %[v0].8b, v2.8b \n" /* outr00 = 01234567 * w00 - */ - "smlal v18.8h, %[v3].8b, v2.8b \n" /* outr00 = 01234567 * w00 - */ - - "ld1 {v14.4s}, [%[rmask]], #16 \n" - "ld1 {v15.4s}, [%[rmask]] \n" - - "smlal v19.8h, %[v1].8b, v16.8b \n" /* outr00 = 01234567 * w00 - */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - "smull v18.8h, %[v4].8b, v16.8b \n" /* outr00 = 01234567 * w00 - */ - - "ld1 {v0.4s}, [%[ptr_out0]], #16 \n" - "ld1 {v2.4s}, [%[ptr_out1]], #16 \n" - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smull v19.8h, %[v2].8b, v17.8b \n" /* outr00 = 01234567 * w00 - */ - "smlal v18.8h, %[v5].8b, v17.8b \n" /* outr00 = 01234567 * w00 - */ - - "ld1 {v1.4s}, [%[ptr_out0]] \n" - "ld1 {v3.4s}, [%[ptr_out1]] \n" - - // r2 - "smlal v19.8h, %[v3].8b, v6.8b \n" /* outr00 = 01234567 * w00 - */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - "smull v18.8h, %[v6].8b, v6.8b \n" /* outr00 = 01234567 * w00 - */ - - "sub %[ptr_out0], %[ptr_out0], #16 \n" - "sub %[ptr_out1], %[ptr_out1], #16 \n" - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smull v19.8h, %[v4].8b, v20.8b \n" /* outr00 = 01234567 * w00 - */ - "smlal v18.8h, %[v7].8b, v20.8b \n" /* outr00 = 01234567 * w00 - */ - - "smlal v19.8h, %[v5].8b, v22.8b \n" /* outr00 = 01234567 * w00 - */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v8].8b, v22.8b \n" /* outr00 = 01234567 * w00 - */ - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - 
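-          // Accumulation pattern used throughout this kernel: smull/smlal
-          // form int16 products of the int8 inputs and weights, then
-          // saddw/saddw2 fold the low/high int16 halves into the int32
-          // accumulators v10-v13, which were seeded from [bias_val]. A rough
-          // intrinsics sketch of one such step (illustrative only; din, wei,
-          // acc_lo and acc_hi are stand-in names, not variables in this file):
-          //   int16x8_t p = vmull_s8(din, wei);
-          //   acc_lo = vaddw_s16(acc_lo, vget_low_s16(p));
-          //   acc_hi = vaddw_s16(acc_hi, vget_high_s16(p));
-          // In this right-pad block, the bif ops on the 8b inputs apply vmask
-          // to zero input lanes past w_in, while the bif ops on the 16b
-          // results apply rmask to keep the previously stored output values
-          // for lanes past w_out.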
"saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - // r3 - "smull v19.8h, %[v6].8b, v8.8b \n" /* outr00 = 01234567 * w00 - */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smlal v19.8h, %[v7].8b, v4.8b \n" /* outr00 = 01234567 * w00 - */ - - "smax v10.4s, v10.4s, v21.4s \n" /* relu*/ - "smax v11.4s, v11.4s, v21.4s \n" /* relu*/ - - "bif v10.16b, v0.16b, v14.16b \n" - "bif v11.16b, v1.16b, v15.16b \n" - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smull v19.8h, %[v8].8b, v5.8b \n" /* outr00 = 01234567 * w00 - */ - - "stp q10, q11, [%[ptr_out0]], #32 \n" /* store q10, q11 -> - ptr_out */ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smax v12.4s, v12.4s, v21.4s \n" /* relu*/ - "smax v13.4s, v13.4s, v21.4s \n" /* relu*/ - - "bif v12.16b, v2.16b, v14.16b \n" - "bif v13.16b, v3.16b, v15.16b \n" - - "stp q12, q13, [%[ptr_out1]], #32 \n" /* store q10, q11 -> - ptr_out */ - - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [ptr_out0] "+r"(doutr0), - [ptr_out1] "+r"(doutr1), - [vmask] "+r"(val_mask), - [rmask] "+r"(rst_mask) - : [v0] "w"(wr00), - [v1] "w"(wr01), - [v2] "w"(wr02), - [v3] "w"(wr10), - [bias_val] "r"(vbias), - [v4] "w"(wr11), - [v5] "w"(wr12), - [v6] "w"(wr20), - [v7] "w"(wr21), - [v8] "w"(wr22) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22"); -#else - // store weights - asm volatile("vld1.8 {d0-d1}, [%[wei_ptr]] \n" - : - : [wei_ptr] "r"(wei_ptr) - : "memory"); - asm volatile( - // left - "pld [%[din_ptr0]] @ preload data\n" - "pld [%[din_ptr1]] @ preload data\n" - "pld [%[din_ptr2]] @ preload data\n" - "pld [%[din_ptr3]] @ preload data\n" - "vdup.s8 d2, d0[0] @ d2 = w00, w00, w00, w00\n" - "vdup.s8 d3, d0[1] @ d3 = w01, w01, w01, w01\n" - "vdup.s8 d4, d0[2] @ d4 = w02, w02, w02, w02\n" - "vld1.8 {d12-d13}, [%[din_ptr0]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - "vmov.u32 d11, #0 @ zero\n" - // out0 - "vdup.32 q8, %[bias] @ and \n" // q8 = - // vbias - "vdup.32 q9, %[bias] @ and \n" // q9 = - // vbias - // out1 - "vdup.32 q10, %[bias] @ and \n" // q8 = - // vbias - "vdup.32 q11, %[bias] @ and \n" // q9 = - // vbias - - // r0 - "vmull.s8 q12, d12, d3 @ out0 = din0 * w01 \n" // q12 = d12 * w01 - "vext.8 d30, d11, d12, #7 @ ext \n" // d10 = 00123456 - "vext.8 d31, d12, d13, #1 @ ext \n" // d11 = 12345678 - - "vld1.8 {d12-d13}, [%[din_ptr1]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - "vld1.8 {d14-d15}, [%[din_ptr2]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - "vdup.s8 d5, d0[3] @ d5 = w10, w10, w00, w00\n" - "vdup.s8 d6, d0[4] @ d6 = w11, w11, w01, w01\n" - - "vmlal.s8 q12, d30, d2 @ out0 += din0 * w00 \n" // q12 += d10 * w00 - - "vdup.s8 d7, d0[5] @ d7 = w12, w12\n" - "add %[din_ptr0], #7 @add \n" - "add %[din_ptr1], #7 @add \n" - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmull.s8 q12, d31, d4 @ out0 += din0 * w02 \n" // q12 += d11 * w02 - - // r1 - "vext.8 d30, d11, d12, #7 @ ext \n" // d10 = 00123456 - "vext.8 d31, d12, d13, #1 @ ext \n" // d11 = 12345678 - "vmull.s8 q13, 
d12, d3 @ out1 = din1 * w01 \n" // q13 = d12 * w01 - - "vmlal.s8 q12, d12, d6 @ out0 = din1 * w11 \n" // q12 = d12 * w11 - - "vld1.8 {d12-d13}, [%[din_ptr3]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - "vdup.s8 d8, d0[6] @ d8 = w20, w00, w00, w00\n" - "vdup.s8 d9, d0[7] @ d9 = w21, w01, w01, w01\n" - "vdup.s8 d10, d1[0] @ d10 = w22, w02, w02, w02\n" - - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmlal.s8 q13, d30, d2 @ out1 += din1 * w00 \n" // q12 += d10 * w00 - "vmull.s8 q12, d30, d5 @ out0 += din1 * w10 \n" // q12 += d10 * w00 - - "add %[din_ptr2], #7 @add \n" - "add %[din_ptr3], #7 @add \n" - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmull.s8 q13, d31, d4 @ out1 += din1 * w02 \n" // q12 += d10 * w00 - "vmlal.s8 q12, d31, d7 @ out0 += din1 * w12 \n" // q12 += d10 * w00 - - // r2 - "vext.8 d30, d11, d14, #7 @ ext \n" // d10 = 00123456 - "vext.8 d31, d14, d15, #1 @ ext \n" // d11 = 12345678 - - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmlal.s8 q13, d14, d6 @ out1 = din2 * w11 \n" // q13 = d12 * w01 - "vmull.s8 q12, d14, d9 @ out1 = din2 * w21 \n" // q13 = d12 * w01 - - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmull.s8 q13, d30, d5 @ out1 += din2 * w10 \n" // q12 += d10 * w00 - "vmlal.s8 q12, d30, d8 @ out0 += din2 * w20 \n" // q12 += d10 * w00 - - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmlal.s8 q13, d31, d7 @ out1 += din2 * w12 \n" // q12 += d10 * w00 - "vmull.s8 q12, d31, d10 @ out0 += din2 * w22 \n" // q12 += d10 * w00 - - // r3 - "vext.8 d30, d11, d12, #7 @ ext \n" // d10 = 00123456 - "vext.8 d31, d12, d13, #1 @ ext \n" // d11 = 12345678 - "vmov.u32 q0, #0 @ mov \n" - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmull.s8 q13, d12, d9 @ out1 = din3 * w21 \n" // q13 = d12 * w01 - "pld [%[din_ptr0]] @ preload data\n" - "pld [%[din_ptr1]] @ preload data\n" - "vmax.s32 q8, q8, q0 @ max \n" - "vmax.s32 q9, q9, q0 @ max \n" - - "vmlal.s8 q13, d30, d8 @ out1 += din3 * w20 \n" // q13 += d10 * w00 - "pld [%[din_ptr2]] @ preload data\n" - "pld [%[din_ptr3]] @ preload data\n" - - "vst1.32 {d16-d17}, [%[dout_ptr1]]! @ store\n" - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmull.s8 q13, d31, d10 @ out1 += din3 * w22 \n" // q12 += d10 * w00 - - "vst1.32 {d18-d19}, [%[dout_ptr1]]! @ store\n" - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmax.s32 q10, q10, q0 @ max \n" - "vmax.s32 q11, q11, q0 @ max \n" - - "vst1.32 {d20-d21}, [%[dout_ptr2]]! @ store\n" - "cmp %[cnt], #1 \n" - "vst1.32 {d22-d23}, [%[dout_ptr2]]! 
@ store\n" - "blt 1f \n" - - // mid - "2: \n" - "vld1.8 {d12-d13}, [%[din_ptr0]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - // out0 - "vdup.32 q8, %[bias] @ and \n" // q8 = - // vbias - "vdup.32 q9, %[bias] @ and \n" // q9 = - // vbias - // out1 - "vdup.32 q10, %[bias] @ and \n" // q8 = - // vbias - "vdup.32 q11, %[bias] @ and \n" // q9 = - // vbias - - // r0 - "vmull.s8 q12, d12, d2 @ out0 = din0 * w01 \n" // q12 = d12 * w01 - "vext.8 d30, d12, d13, #1 @ ext \n" // d10 = 12345678 - "vext.8 d31, d12, d13, #2 @ ext \n" // d11 = 23456789 - - "vld1.8 {d12-d13}, [%[din_ptr1]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - "vld1.8 {d14-d15}, [%[din_ptr2]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - - "vmlal.s8 q12, d30, d3 @ out0 += din0 * w00 \n" // q12 += d10 * w00 - - "add %[din_ptr0], #8 @add \n" - "add %[din_ptr1], #8 @add \n" - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmull.s8 q12, d31, d4 @ out0 += din0 * w02 \n" // q12 += d11 * w02 - - // r1 - "vext.8 d30, d12, d13, #1 @ ext \n" // d10 = 00123456 - "vext.8 d31, d12, d13, #2 @ ext \n" // d11 = 12345678 - "vmull.s8 q13, d12, d2 @ out1 = din1 * w01 \n" // q13 = d12 * w01 - - "vmlal.s8 q12, d12, d5 @ out0 = din1 * w11 \n" // q12 = d12 * w11 - - "vld1.8 {d12-d13}, [%[din_ptr3]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - - "vmlal.s8 q13, d30, d3 @ out1 += din1 * w00 \n" // q12 += d10 * w00 - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmull.s8 q12, d30, d6 @ out0 += din1 * w10 \n" // q12 += d10 * w00 - - "add %[din_ptr2], #8 @add \n" - "add %[din_ptr3], #8 @add \n" - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmull.s8 q13, d31, d4 @ out1 += din1 * w02 \n" // q12 += d10 * w00 - "vmlal.s8 q12, d31, d7 @ out0 += din1 * w12 \n" // q12 += d10 * w00 - - // r2 - "vext.8 d30, d14, d15, #1 @ ext \n" // d10 = 00123456 - "vext.8 d31, d14, d15, #2 @ ext \n" // d11 = 12345678 - - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmlal.s8 q13, d14, d5 @ out1 = din2 * w11 \n" // q13 = d12 * w01 - "vmull.s8 q12, d14, d8 @ out1 = din2 * w21 \n" // q13 = d12 * w01 - - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmull.s8 q13, d30, d6 @ out1 += din2 * w10 \n" // q12 += d10 * w00 - "vmlal.s8 q12, d30, d9 @ out0 += din2 * w20 \n" // q12 += d10 * w00 - - "vmlal.s8 q13, d31, d7 @ out1 += din2 * w12 \n" // q12 += d10 * w00 - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmull.s8 q12, d31, d10 @ out0 += din2 * w22 \n" // q12 += d10 * w00 - - // r3 - "vext.8 d30, d12, d13, #1 @ ext \n" // d10 = 00123456 - "vext.8 d31, d12, d13, #2 @ ext \n" // d11 = 12345678 - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmull.s8 q13, d12, d8 @ out1 = din3 * w21 \n" // q13 = d12 * w01 - "pld [%[din_ptr0]] @ preload data\n" 
- "pld [%[din_ptr1]] @ preload data\n" - "vmax.s32 q8, q8, q0 @ max \n" - "vmax.s32 q9, q9, q0 @ max \n" - - "vmlal.s8 q13, d30, d9 @ out1 += din3 * w20 \n" // q13 += d10 * w00 - "pld [%[din_ptr2]] @ preload data\n" - "pld [%[din_ptr3]] @ preload data\n" - - "vst1.32 {d16-d17}, [%[dout_ptr1]]! @ store\n" - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmull.s8 q13, d31, d10 @ out1 += din3 * w22 \n" // q12 += d10 * w00 - - "vst1.32 {d18-d19}, [%[dout_ptr1]]! @ store\n" - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmax.s32 q10, q10, q0 @ max \n" - "vmax.s32 q11, q11, q0 @ max \n" - - "vst1.32 {d20-d21}, [%[dout_ptr2]]! @ store\n" - "subs %[cnt], #1 \n" - "vst1.32 {d22-d23}, [%[dout_ptr2]]! @ store\n" - "bne 2b \n" - // right - "1: \n" - "vld1.8 {d12-d13}, [%[din_ptr0]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - "vld1.8 {d28-d29}, [%[mask]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - // out0 - "vdup.32 q8, %[bias] @ and \n" // q8 = vbias - "vdup.32 q9, %[bias] @ and \n" // q9 = vbias - // out1 - "vdup.32 q10, %[bias] @ and \n" // q8 = vbias - "vdup.32 q11, %[bias] @ and \n" // q9 = vbias - - "vbif.8 d12, d11, d28 @ bit select, deal with right pad\n" - "vbif.8 d13, d11, d29 @ bit select, deal with right pad\n" - "vld1.8 {d14-d15}, [%[din_ptr1]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - - // r0 - "vmull.s8 q12, d12, d2 @ out0 = din0 * w00 \n" // q12 = d12 * w01 - "vext.8 d30, d12, d13, #1 @ ext \n" // d10 = 12345678 - "vext.8 d31, d12, d13, #2 @ ext \n" // d11 = 23456789 - - "vld1.8 {d12-d13}, [%[din_ptr2]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - "vbif.8 d14, d11, d28 @ bit select, deal with right pad\n" - "vbif.8 d15, d11, d29 @ bit select, deal with right pad\n" - - "vmlal.s8 q12, d30, d3 @ out0 += din0 * w01 \n" // q12 += d10 * w00 - - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmull.s8 q12, d31, d4 @ out0 += din0 * w02 \n" // q12 += d11 * w02 - - // r1 - "vext.8 d30, d14, d15, #1 @ ext \n" // d10 = 00123456 - "vext.8 d31, d14, d15, #2 @ ext \n" // d11 = 12345678 - - "vmull.s8 q13, d14, d2 @ out1 = din1 * w00 \n" // q13 = d12 * w01 - - "vmlal.s8 q12, d14, d5 @ out0 = din1 * w10 \n" // q12 = d12 * w11 - - "vld1.8 {d14-d15}, [%[din_ptr3]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - "vbif.8 d12, d11, d28 @ bit select, deal with " - "right pad\n" - "vbif.8 d13, d11, d29 @ bit select, deal with " - "right pad\n" - - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmlal.s8 q13, d30, d3 @ out1 += din1 * w01 \n" // q12 += d10 * w00 - "vmull.s8 q12, d30, d6 @ out0 += din1 * w11 \n" // q12 += d10 * w00 - - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmull.s8 q13, d31, d4 @ out1 += din1 * w02 \n" // q12 += d10 * w00 - "vmlal.s8 q12, d31, d7 @ out0 += din1 * w12 \n" // q12 += d10 * w00 - - // r2 - "vext.8 d30, d12, d13, #1 @ ext \n" // d10 = 00123456 - "vext.8 d31, d12, d13, #2 @ ext \n" // d11 = 12345678 - - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmlal.s8 q13, d12, d5 @ 
out1 = din2 * w10 \n" // q13 = d12 * w01 - "vmull.s8 q12, d12, d8 @ out1 = din2 * w20 \n" // q13 = d12 * w01 - - "vbif.8 d14, d11, d28 @ bit select, deal with " - "right pad\n" - "vbif.8 d15, d11, d29 @ bit select, deal with " - "right pad\n" - - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmull.s8 q13, d30, d6 @ out1 += din2 * w10 \n" // q12 += d10 * w00 - "vmlal.s8 q12, d30, d9 @ out0 += din2 * w20 \n" // q12 += d10 * w00 - - "vld1.32 {d28-d29}, [%[dout_ptr1]]! @ load din00= 0 1 2 3 4 5 6 " - "7 8 9\n" - "vld1.32 {d12-d13}, [%[dout_ptr1]] @ load din00= 0 1 2 3 4 5 6 " - "7 8 9\n" - "vld1.32 {d2-d3}, [%[rs_mask]]! @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - "vld1.32 {d4-d5}, [%[rs_mask]] @ load din00= 0 1 2 3 4 5 6 7 8 " - "9\n" - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmlal.s8 q13, d31, d7 @ out1 += din2 * w12 \n" // q12 += d10 * w00 - "vmull.s8 q12, d31, d10 @ out0 += din2 * w22 \n" // q12 += d10 * w00 - - // r3 - "vext.8 d30, d14, d15, #1 @ ext \n" // d10 = 00123456 - "vext.8 d31, d14, d15, #2 @ ext \n" // d11 = 12345678 - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmull.s8 q13, d14, d8 @ out1 = din3 * w20 \n" // q13 = d12 * w01 - "vld1.32 {d14-d15}, [%[dout_ptr2]]! @ load din00= 0 1 2 3 4 5 6 " - "7 8 9\n" - "vld1.32 {d24-d25}, [%[dout_ptr2]] @ load din00= 0 1 2 3 4 5 6 " - "7 8 9\n" - "vmax.s32 q8, q8, q0 @ max \n" - "vmax.s32 q9, q9, q0 @ max \n" - - "vmlal.s8 q13, d30, d9 @ out1 += din3 * w21 \n" // q13 += d10 * w00 - "vbif q8, q14, q1 @ bit select, deal with right " - "pad\n" - "vbif q9, q6, q2 @ bit select, deal with right " - "pad\n" - "sub %[dout_ptr1], #16 @ sub \n" - "sub %[dout_ptr2], #16 @ sub \n" - - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmull.s8 q13, d31, d10 @ out1 += din3 * w22 \n" // q12 += d10 * w00 - - "vst1.32 {d16-d17}, [%[dout_ptr1]]! @ store\n" - "vst1.32 {d18-d19}, [%[dout_ptr1]]! @ store\n" - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmax.s32 q10, q10, q0 @ max \n" - "vmax.s32 q11, q11, q0 @ max \n" - - "vbif q10, q7, q1 @ bit select, deal with right pad\n" - "vbif q11, q12, q2 @ bit select, deal with right pad\n" - - "vst1.32 {d20-d21}, [%[dout_ptr2]]! @ store\n" - "vst1.32 {d22-d23}, [%[dout_ptr2]]! 
@ store\n" - - : [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [cnt] "+r"(cnt), - [bias] "+r"(bias_val), - [rs_mask] "+r"(rst_mask) - : [mask] "r"(vmask) - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif - dout_ptr += 2 * w_out; - } - } - } -} -// w_in <= 8 -void conv_depthwise_3x3s1p1_bias_s_relu_int8(int* dout, - const signed char* din, - const signed char* weights, - const int* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - //! pad is done implicit - const char zero[8] = {0, 0, 0, 0, 0, 0, 0, 0}; - //! for 4x6 convolution window - const unsigned char right_pad_idx[8] = {0, 1, 2, 3, 4, 5, 6, 7}; - const unsigned int right_pad_rst[8] = {0, 1, 2, 3, 4, 5, 6, 7}; - - // printf("conv3x3_dw start \n"); - signed char* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(signed char)); - int* write_ptr = - reinterpret_cast(ctx->workspace_data()) + w_in; - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - int w_stride = 9; - - int tile_h = (h_out + 3) >> 2; - - unsigned int size_pad_right = (unsigned int)(w_in); - - int size_pad_bottom = h_out % 4; - - uint8x8_t vmask_rp = - vcgt_u8(vdup_n_u8(size_pad_right), vld1_u8(right_pad_idx)); - unsigned int rst_remain = (unsigned int)w_out; - uint32x4_t vmask_result1 = - vcgtq_u32(vdupq_n_u32(rst_remain), vld1q_u32(right_pad_rst)); - uint32x4_t vmask_result2 = - vcgtq_u32(vdupq_n_u32(rst_remain), vld1q_u32(right_pad_rst + 4)); - - unsigned char vmask[8]; - vst1_u8(vmask, vmask_rp); - - unsigned int rmask[8]; - vst1q_u32(rmask, vmask_result1); - vst1q_u32(rmask + 4, vmask_result2); - - int8x8_t vzero = vdup_n_s8(0); - int32x4_t vzero_32 = vdupq_n_s32(0); - - for (int n = 0; n < num; ++n) { - const signed char* din_batch = din + n * ch_in * size_in_channel; - int* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int c = 0; c < ch_in; c++) { - int* dout_ptr = dout_batch + c * size_out_channel; - - const signed char* din_ch_ptr = din_batch + c * size_in_channel; - - int bias_val = flag_bias ? bias[c] : 0; - - const signed char* wei_ptr = weights + c * w_stride; -#ifdef __aarch64__ - int vbias[4] = {bias_val, bias_val, bias_val, bias_val}; - int8x8_t wr00 = vdup_n_s8(wei_ptr[0]); - int8x8_t wr10 = vdup_n_s8(wei_ptr[3]); - int8x8_t wr20 = vdup_n_s8(wei_ptr[6]); - - int8x8_t wr01 = vdup_n_s8(wei_ptr[1]); - int8x8_t wr11 = vdup_n_s8(wei_ptr[4]); - int8x8_t wr21 = vdup_n_s8(wei_ptr[7]); - - int8x8_t wr02 = vdup_n_s8(wei_ptr[2]); - int8x8_t wr12 = vdup_n_s8(wei_ptr[5]); - int8x8_t wr22 = vdup_n_s8(wei_ptr[8]); -#endif - - int* doutr0 = nullptr; - int* doutr1 = nullptr; - - const signed char* dr0 = din_ch_ptr; - const signed char* dr1 = dr0 + w_in; - const signed char* dr2 = dr1 + w_in; - const signed char* dr3 = dr2 + w_in; - - const signed char* din_ptr0 = nullptr; - const signed char* din_ptr1 = nullptr; - const signed char* din_ptr2 = nullptr; - const signed char* din_ptr3 = nullptr; - - for (int i = 0; i < h_in; i += 2) { - //! 
process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - din_ptr3 = dr3; - - doutr0 = dout_ptr; - doutr1 = doutr0 + w_out; - int out_buf1[8]; - int out_buf2[8]; - int trash_buf[8]; - - unsigned int* rst_mask = rmask; - unsigned char* val_mask = vmask; - - if (i == 0) { - din_ptr0 = zero_ptr; - din_ptr1 = dr0; - din_ptr2 = dr1; - din_ptr3 = dr2; - dr0 = dr1; - dr1 = dr2; - dr2 = dr3; - dr3 = dr2 + w_in; - } else { - dr0 = dr2; - dr1 = dr3; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - } - //! process bottom pad - if (i + 3 > h_in) { - switch (i + 3 - h_in) { - case 3: - din_ptr1 = zero_ptr; - case 2: - din_ptr2 = zero_ptr; - case 1: - din_ptr3 = zero_ptr; - default: - break; - } - } - //! process bottom remain - if (i + 2 > h_out) { - doutr1 = trash_buf; - } -#ifdef __aarch64__ - asm volatile( - "PRFM PLDL1KEEP, [%[din_ptr0]] \n" - "PRFM PLDL1KEEP, [%[din_ptr1]] \n" - "PRFM PLDL1KEEP, [%[din_ptr2]] \n" - "PRFM PLDL1KEEP, [%[din_ptr3]] \n" - "movi v21.4s, #0x0\n" /* out0 = 0 */ - // left - "ld1 {v4.8b}, [%[vmask]] \n" - "ld1 {v0.8b}, [%[din_ptr0]], #8 \n" /* load - a00-a015 - to - q0*/ - "ld1 {v1.8b}, [%[din_ptr1]], #8 \n" /* load - a00-a015 - to - q0*/ - "ld1 {v2.8b}, [%[din_ptr2]], #8 \n" /* load - a00-a015 - to - q0*/ - "ld1 {v3.8b}, [%[din_ptr3]], #8 \n" /* load - a00-a015 - to - q0*/ - - "bif v0.8b, v21.8b, v4.8b \n" - "bif v1.8b, v21.8b, v4.8b \n" - "bif v2.8b, v21.8b, v4.8b \n" - "bif v3.8b, v21.8b, v4.8b \n" - - "ext v6.8b, v21.8b, v0.8B, #7 \n" /* vext_s8(vzero, vinr0, 7); - 00123456 */ - "ext v7.8b, v0.8b, v21.8B, #1 \n" /* vext_s8(vinr0, vinr0_1, - 1); 12345678 */ - - "ld1 {v10.4s}, [%[vbias]] \n" - "ld1 {v11.4s}, [%[vbias]] \n" - - // r0 - "smull v18.8h, %[v1].8b, v0.8b \n" /* outr00 = 01234567 * w01 - */ - - "ext v8.8b, v21.8b, v1.8B, #7 \n" /* vext_s8(vzero, vinr0, 7); - 00123456 */ - "ext v9.8b, v1.8b, v21.8B, #1 \n" /* vext_s8(vinr0, vinr0_1, - 1); 12345678 */ - - "smlal v18.8h, %[v0].8b, v6.8b \n" /* outr00 = 01234567 * w00 - */ - - "ld1 {v12.4s}, [%[vbias]] \n" - "ld1 {v13.4s}, [%[vbias]] \n" - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v2].8b, v7.8b \n" /* outr00 = 01234567 * w00 - */ - - "ext v6.8b, v21.8b, v2.8B, #7 \n" /* vext_s8(vzero, vinr0, 7); - 00123456 */ - "ext v7.8b, v2.8b, v21.8B, #1 \n" /* vext_s8(vinr0, vinr0_1, - 1); 12345678 */ - - // r1 - "smull v19.8h, %[v1].8b, v1.8b \n" /* outr00 = 01234567 * w00 - */ - "smlal v18.8h, %[v4].8b, v1.8b \n" /* outr00 = 01234567 * w00 - */ - - // "ld1 {v14.4s}, [%[rmask]], #16 \n" - // "ld1 {v15.4s}, [%[rmask]] \n" - - "smlal v19.8h, %[v0].8b, v8.8b \n" /* outr00 = 01234567 * w00 - */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v3].8b, v8.8b \n" /* outr00 = 01234567 * w00 - */ - - // "ld1 {v16.4s}, [%[ptr_out0]], #16 \n" - // "ld1 {v17.4s}, [%[ptr_out1]], #16 \n" - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smull v19.8h, %[v2].8b, v9.8b \n" /* outr00 = 01234567 * w00 - */ - "smlal v18.8h, %[v5].8b, v9.8b \n" /* outr00 = 01234567 * w00 - */ - - "ext v8.8b, v21.8b, v3.8B, #7 \n" /* vext_s8(vzero, vinr0, 7); - 00123456 */ - "ext v9.8b, v3.8b, v21.8B, #1 \n" // vext_s8(vinr0, vinr0_1, - // 1); 12345678 - - // "ld1 {v0.4s}, [%[ptr_out0]] \n" - // "ld1 {v1.4s}, [%[ptr_out1]] \n" - - // r2 - "smlal v19.8h, %[v4].8b, v2.8b \n" /* outr00 = 
01234567 * w00 - */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v7].8b, v2.8b \n" /* outr00 = 01234567 * w00 - */ - - // "sub %[ptr_out0], %[ptr_out0], #16 \n" - // "sub %[ptr_out1], %[ptr_out1], #16 \n" - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smull v19.8h, %[v3].8b, v6.8b \n" /* outr00 = 01234567 * w00 - */ - "smlal v18.8h, %[v6].8b, v6.8b \n" /* outr00 = 01234567 * w00 - */ - - "smlal v19.8h, %[v5].8b, v7.8b \n" /* outr00 = 01234567 * w00 - */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smull v18.8h, %[v8].8b, v7.8b \n" /* outr00 = 01234567 * w00 - */ - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - // r3 - "smull v19.8h, %[v7].8b, v3.8b \n" /* outr00 = 01234567 * w00 - */ - - "saddw v10.4s, v10.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v11.4s, v11.4s, v18.8h \n" /* v11 += outr00.high*/ - - "smlal v19.8h, %[v6].8b, v8.8b \n" /* outr00 = 01234567 * w00 - */ - - "smax v10.4s, v10.4s, v21.4s \n" /* relu */ - "smax v11.4s, v11.4s, v21.4s \n" /* relu */ - - // "bif v10.16b, v16.16b, v14.16b \n" - // "bif v11.16b, v0.16b, v15.16b \n" - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smull v19.8h, %[v8].8b, v9.8b \n" /* outr00 = 01234567 * w00 - */ - - "stp q10, q11, [%[ptr_out0]] \n" /* store q10, q11 -> ptr_out */ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smax v12.4s, v12.4s, v21.4s \n" /* relu */ - "smax v13.4s, v13.4s, v21.4s \n" /* relu */ - - // "bif v12.16b, v17.16b, v14.16b \n" - // "bif v13.16b, v1.16b, v15.16b \n" - - "stp q12, q13, [%[ptr_out1]] \n" /* store q10, q11 -> ptr_out */ - - : [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [rmask] "+r"(rst_mask) - : [v0] "w"(wr00), - [v1] "w"(wr01), - [v2] "w"(wr02), - [v3] "w"(wr10), - [vbias] "r"(vbias), - [v4] "w"(wr11), - [v5] "w"(wr12), - [v6] "w"(wr20), - [v7] "w"(wr21), - [v8] "w"(wr22), - [vmask] "r"(vmask), - [ptr_out0] "r"(out_buf1), - [ptr_out1] "r"(out_buf2) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22"); -#else - // store weights - asm volatile("vld1.8 {d0-d1}, [%[wei_ptr]] \n" - : - : [wei_ptr] "r"(wei_ptr) - : "memory"); - asm volatile( - // left - "pld [%[din_ptr0]] @ preload data\n" - "pld [%[din_ptr1]] @ preload data\n" - "pld [%[din_ptr2]] @ preload data\n" - "pld [%[din_ptr3]] @ preload data\n" - "vld1.8 {d28}, [%[mask]] @ load din00= 0 1 2 3 4 5 6 7 8 9\n" - "vld1.8 {d12}, [%[din_ptr0]] @ load din00= 0 1 2 3 4 5 6 7 8 9\n" - "vld1.8 {d13}, [%[din_ptr1]] @ load din00= 0 1 2 3 4 5 6 7 8 9\n" - "vdup.s8 d2, d0[0] @ d2 = w00, w00, w00, w00\n" - "vdup.s8 d3, d0[1] @ d3 = w01, w01, w01, w01\n" - "vdup.s8 d4, d0[2] @ d4 = w02, w02, w02, w02\n" - - "vmov.u32 d11, #0 @ zero\n" - // out0 - "vdup.32 q8, %[bias] @ and \n" // q8 = - // vbias - "vdup.32 q9, %[bias] @ and \n" // q9 = - // vbias - - "vbif.8 d12, d11, d28 @ bit select, deal with right pad\n" - "vbif.8 d13, d11, d28 @ bit select, deal 
with right pad\n" - "vld1.8 {d14}, [%[din_ptr2]] @ load din00= 0 1 2 3 4 5 6 7 8 9\n" - "vld1.8 {d15}, [%[din_ptr3]] @ load din00= 0 1 2 3 4 5 6 7 8 9\n" - // out1 - "vdup.32 q10, %[bias] @ and \n" // q8 = - // vbias - "vdup.32 q11, %[bias] @ and \n" // q9 = - // vbias - - // r0 - "vmull.s8 q12, d12, d3 @ out0 = din0 * w01 \n" // q12 = d12 * w01 - "vext.8 d30, d11, d12, #7 @ ext \n" // d10 = 00123456 - "vext.8 d31, d12, d11, #1 @ ext \n" // d11 = 12345678 - - "vdup.s8 d5, d0[3] @ d5 = w10, w10, w00, w00\n" - "vdup.s8 d6, d0[4] @ d6 = w11, w11, w01, w01\n" - - "vmlal.s8 q12, d30, d2 @ out0 += din0 * w00 \n" // q12 += d10 * w00 - - "vdup.s8 d7, d0[5] @ d7 = w12, w12\n" - "vbif.8 d14, d11, d28 @ bit select, deal with right pad\n" - "vbif.8 d15, d11, d28 @ bit select, deal with right pad\n" - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmull.s8 q12, d31, d4 @ out0 += din0 * w02 \n" // q12 += d11 * w02 - - // r1 - "vext.8 d30, d11, d13, #7 @ ext \n" // d10 = 00123456 - "vext.8 d31, d13, d11, #1 @ ext \n" // d11 = 12345678 - "vmull.s8 q13, d13, d3 @ out1 = din1 * w01 \n" // q13 = d12 * w01 - - "vmlal.s8 q12, d13, d6 @ out0 = din1 * w11 \n" // q12 = d12 * w11 - - "vdup.s8 d8, d0[6] @ d8 = w20, w00, w00, w00\n" - "vdup.s8 d9, d0[7] @ d9 = w21, w01, w01, w01\n" - - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmlal.s8 q13, d30, d2 @ out1 += din1 * w00 \n" // q12 += d10 * w00 - "vmull.s8 q12, d30, d5 @ out0 += din1 * w10 \n" // q12 += d10 * w00 - - "vdup.s8 d10, d1[0] @ d10 = w22, w02, w02, w02\n" - // "vld1.32 {d28-d29}, [%[dout_ptr1]]! @ load din00= 0 1 2 3 4 5 - // 6 7 8 9\n" "vld1.32 {d12-d13}, [%[dout_ptr1]] @ load din00= 0 - // 1 2 3 4 5 6 7 8 9\n" - - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmull.s8 q13, d31, d4 @ out1 += din1 * w02 \n" // q12 += d10 * w00 - "vmlal.s8 q12, d31, d7 @ out0 += din1 * w12 \n" // q12 += d10 * w00 - - // r2 - "vext.8 d30, d11, d14, #7 @ ext \n" // d10 = 00123456 - "vext.8 d31, d14, d11, #1 @ ext \n" // d11 = 12345678 - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmlal.s8 q13, d14, d6 @ out1 = din2 * w11 \n" // q13 = d12 * w01 - "vmull.s8 q12, d14, d9 @ out1 = din2 * w21 \n" // q13 = d12 * w01 - - // "sub %[dout_ptr1], #16 @ sub \n" - - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmull.s8 q13, d30, d5 @ out1 += din2 * w10 \n" // q12 += d10 * w00 - "vmlal.s8 q12, d30, d8 @ out0 += din2 * w20 \n" // q12 += d10 * w00 - - // "vld1.32 {d2-d3}, [%[rs_mask]]! 
@ load din00= 0 1 2 3 4 5 6 7 - // 8 9\n" "vld1.32 {d4-d5}, [%[rs_mask]] @ load din00= 0 1 2 3 4 - // 5 6 7 8 9\n" - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmlal.s8 q13, d31, d7 @ out1 += din2 * w12 \n" // q12 += d10 * w00 - "vmull.s8 q12, d31, d10 @ out0 += din2 * w22 \n" // q12 += d10 * w00 - - // r3 - "vext.8 d30, d11, d15, #7 @ ext \n" // d10 = 00123456 - "vext.8 d31, d15, d11, #1 @ ext \n" // d11 = 12345678 - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - "vaddw.s16 q8, q8, d24 @addw \n" // out0 += - // vget_low_s16(out00) - "vaddw.s16 q9, q9, d25 @addw \n" // out0_1 += - // vget_high_s16(out00) - - "vmull.s8 q13, d15, d9 @ out1 = din3 * w21 \n" // q13 = d12 * w01 - - "vmov.u32 q0, #0 @ zero\n" - - // "vld1.32 {d6-d7}, [%[dout_ptr2]]! @ load din00= 0 1 2 3 4 5 6 - // 7 8 9\n" "vld1.32 {d14-d15}, [%[dout_ptr2]] @ load din00= 0 1 - // 2 3 4 5 6 7 8 9\n" - - "vmlal.s8 q13, d30, d8 @ out1 += din3 * w20 \n" // q13 += d10 * w00 - - "vmax.s32 q8, q8, q0 @ max \n" - "vmax.s32 q9, q9, q0 @ max \n" - - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmull.s8 q13, d31, d10 @ out1 += din3 * w22 \n" // q12 += d10 * w00 - - // "sub %[dout_ptr2], #16 @ sub \n" - // "vbif q8, q14, q1 @ bit select, deal with right - // pad\n" "vbif q9, q6, q2 @ bit select, deal - // with right pad\n" - - "vaddw.s16 q10, q10, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q11, q11, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vst1.32 {d16-d19}, [%[dout_ptr1]] @ store\n" - // "vst1.32 {d18-d19}, [%[dout_ptr1]]! @ store\n" - - "vmax.s32 q10, q10, q0 @ max \n" - "vmax.s32 q11, q11, q0 @ max \n" - - // "vbif q10, q3, q1 @ bit select, deal with right - // pad\n" "vbif q11, q7, q2 @ bit select, deal - // with right pad\n" - - "vst1.32 {d20-d23}, [%[dout_ptr2]] @ store\n" - // "vst1.32 {d22-d23}, [%[dout_ptr2]]! @ store\n" - : [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [bias] "+r"(bias_val), - [rs_mask] "+r"(rst_mask) - : [mask] "r"(vmask), - [dout_ptr1] "r"(out_buf1), - [dout_ptr2] "r"(out_buf2) - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif - for (int w = 0; w < w_out; ++w) { - *doutr0++ = out_buf1[w]; - *doutr1++ = out_buf2[w]; - } - dout_ptr += 2 * w_out; - } - } - } -} - -// 1 line w_in > 16 -void conv_depthwise_3x3s2p1_bias_relu_int8(int* dout, - const signed char* din, - const signed char* weights, - const int* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - // printf("3x3s2 mult height \n"); - //! pad is done implicit - //! 
for 4x6 convolution window - const unsigned char right_pad_idx[16] = { - 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15}; - const unsigned int right_pad_rst[8] = {0, 1, 2, 3, 4, 5, 6, 7}; - - // printf("conv3x3_dw start \n"); - signed char* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(signed char)); - int* write_ptr = - reinterpret_cast(ctx->workspace_data()) + w_out; - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - int w_stride = 9; - - int tile_w = (w_in + 15) >> 4; - int cnt_col = tile_w - 2; - - unsigned int size_pad_right = (unsigned int)(w_in - 15 - (cnt_col << 4)); - if (size_pad_right == 17) { - size_pad_right = 0; - cnt_col++; - } - - uint8x8_t vmask_rp1 = - vcgt_u8(vdup_n_u8(size_pad_right), vld1_u8(right_pad_idx)); - uint8x8_t vmask_rp2 = - vcgt_u8(vdup_n_u8(size_pad_right), vld1_u8(right_pad_idx + 8)); - unsigned int rst_remain = (unsigned int)(w_out - ((cnt_col + 1) << 3)); - uint32x4_t vmask_result1 = - vcgtq_u32(vdupq_n_u32(rst_remain), vld1q_u32(right_pad_rst)); - uint32x4_t vmask_result2 = - vcgtq_u32(vdupq_n_u32(rst_remain), vld1q_u32(right_pad_rst + 4)); - - int8x8_t vzero = vdup_n_s8(0); - int32x4_t vzero_32 = vdupq_n_s32(0); - - uint8x16_t vmask_rp = - vcgtq_u8(vdupq_n_u8(size_pad_right), vld1q_u8(right_pad_idx)); - unsigned char vmask[16]; - vst1q_u8(vmask, vmask_rp); - - unsigned int rmask[8]; - vst1q_u32(rmask, vmask_result1); - vst1q_u32(rmask + 4, vmask_result2); - - for (int n = 0; n < num; ++n) { - const signed char* din_batch = din + n * ch_in * size_in_channel; - int* dout_batch = dout + n * ch_in * size_out_channel; - -#pragma omp parallel for - for (int c = 0; c < ch_in; c++) { - int* dout_ptr = dout_batch + c * size_out_channel; - - const signed char* din_ch_ptr = din_batch + c * size_in_channel; - - int bias_val = flag_bias ? bias[c] : 0; - - const signed char* wei_ptr = weights + c * w_stride; -#ifdef __aarch64__ - int vbias[4] = {bias_val, bias_val, bias_val, bias_val}; - int8x8_t wr00 = vdup_n_s8(wei_ptr[0]); - int8x8_t wr10 = vdup_n_s8(wei_ptr[3]); - int8x8_t wr20 = vdup_n_s8(wei_ptr[6]); - - int8x8_t wr01 = vdup_n_s8(wei_ptr[1]); - int8x8_t wr11 = vdup_n_s8(wei_ptr[4]); - int8x8_t wr21 = vdup_n_s8(wei_ptr[7]); - - int8x8_t wr02 = vdup_n_s8(wei_ptr[2]); - int8x8_t wr12 = vdup_n_s8(wei_ptr[5]); - int8x8_t wr22 = vdup_n_s8(wei_ptr[8]); -#endif - - int* doutr0 = nullptr; - - const signed char* dr0 = din_ch_ptr; - const signed char* dr1 = dr0 + w_in; - const signed char* dr2 = dr1 + w_in; - - const signed char* din_ptr0 = nullptr; - const signed char* din_ptr1 = nullptr; - const signed char* din_ptr2 = nullptr; - - for (int i = 0; i < h_in; i += 2) { - //! process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - - doutr0 = dout_ptr; - if (i == 0) { - din_ptr0 = zero_ptr; - din_ptr1 = dr0; - din_ptr2 = dr1; - dr0 = dr1; - dr1 = dr2; - dr2 = dr1 + w_in; - } else { - dr0 = dr2; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - } - //! 
process bottom pad - if (i + 2 > h_in) { - switch (i + 2 - h_in) { - case 2: - din_ptr1 = zero_ptr; - case 1: - din_ptr2 = zero_ptr; - default: - break; - } - } - int cnt = cnt_col; -#ifdef __aarch64__ - unsigned char* val_mask = vmask; - asm volatile( - "PRFM PLDL1KEEP, [%[din_ptr0]] \n" - "PRFM PLDL1KEEP, [%[din_ptr1]] \n" - "PRFM PLDL1KEEP, [%[din_ptr2]] \n" - "movi v10.4s, #0x0\n" - // left - "ld2 {v0.8b - v1.8b}, [%[din_ptr0]] \n" /*load a00-a015 - to q0*/ - "ld2 {v2.8b - v3.8b}, [%[din_ptr1]] \n" /* load a00-a015 - to q0*/ - "ld2 {v4.8b - v5.8b}, [%[din_ptr2]] \n" /*load a00-a015 - to q0*/ - - "ld1 {v12.4s}, [%[bias_val]] \n" /* dup v10, bias*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /* dup v10, bias */ - - "ext v6.8b, v10.8b, v1.8B, #7 \n" /* vext_s8(vzero, vinr0, 7); - 013579 */ - "ext v7.8b, v10.8b, v3.8B, #7 \n" /* vext_s8(vzero, vinr0, 7); - 013579 */ - "ext v8.8b, v10.8b, v5.8B, #7 \n" /* vext_s8(vzero, vinr0, 7); - 013579 */ - - // r0 - "smull v14.8h, %[v1].8b, v0.8b \n" /* outr00 = 02468 * w01 */ - "smull v15.8h, %[v2].8b, v1.8b\n" /* outr00 += 13579 * w02 */ - "smull v16.8h, %[v0].8b, v6.8b\n" /* outr00 += 013579 * w00 */ - - "add %[din_ptr0], %[din_ptr0], #15 \n" - "add %[din_ptr1], %[din_ptr1], #15 \n" - "add %[din_ptr2], %[din_ptr2], #15 \n" - - // r1 - "smlal v14.8h, %[v4].8b, v2.8b \n" /* outr00 = 02468 * w01 */ - "smlal v15.8h, %[v5].8b, v3.8b\n" /* outr00 += 13579 * w02 */ - "smlal v16.8h, %[v3].8b, v7.8b\n" /* outr00 += 013579 * w00 */ - - "saddw v12.4s, v12.4s, v14.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v14.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v15.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v15.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v16.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v16.8h \n" /* v11 += outr00.high*/ - - // r2 - "smull v14.8h, %[v7].8b, v4.8b \n" /* outr00 = 02468 * w01 */ - "smull v15.8h, %[v8].8b, v5.8b\n" /* outr00 += 13579 * w02 */ - "smull v16.8h, %[v6].8b, v8.8b\n" /* outr00 += 013579 * w00 */ - - "ld2 {v0.8b - v1.8b}, [%[din_ptr0]], #16 \n" /*load - a00-a015 - to q0*/ - "ld2 {v2.8b - v3.8b}, [%[din_ptr1]], #16 \n" /* load - a00-a015 - to q0*/ - "ld2 {v4.8b - v5.8b}, [%[din_ptr2]], #16 \n" /*load - a00-a015 - to q0*/ - - "saddw v12.4s, v12.4s, v14.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v14.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v15.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v15.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v16.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v16.8h \n" /* v11 += outr00.high*/ - - "smax v12.4s, v12.4s, v10.4s \n" /*relu*/ - "smax v13.4s, v13.4s, v10.4s \n" /*relu*/ - - "stp q12, q13, [%[ptr_out0]], #32 \n" /* store q10, q11 -> - ptr_out */ - - "ld1 {v12.4s}, [%[bias_val]] \n" /* dup v10, bias */ - "ld1 {v13.4s}, [%[bias_val]] \n" /* dup v10, bias */ - - "cmp %[cnt], #1 \n" - "blt 3f \n" - // mid - "1: \n" - "ld1 {v6.8b}, [%[din_ptr0]] \n" /*load a00-a015 to q0*/ - "ld1 {v7.8b}, [%[din_ptr1]] \n" /*load a00-a015 to q0*/ - "ld1 {v8.8b}, [%[din_ptr2]] \n" /*load a00-a015 to q0*/ - - "ext v9.8b, v0.8b, v6.8B, #1 \n" /* vext_s8(vzero, vinr0, 7); - 246810 */ - "ext v11.8b, v2.8b, v7.8B, #1 \n" /* vext_s8(vzero, vinr0, 7); - 246810 */ - "ext v14.8b, v4.8b, v8.8B, #1 \n" /* vext_s8(vzero, vinr0, 7); - 246810 */ - - // r0 - "smull v6.8h, %[v0].8b, v0.8b \n" /* outr00 = 02468 * w00 */ - "smull v7.8h, %[v1].8b, v1.8b\n" /* outr00 += 13579 * w01 */ - "smull v8.8h, %[v2].8b, v9.8b\n" 
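-          // Stride-2 layout: the ld2 loads above de-interleave 16 input
-          // bytes into even columns (v0/v2/v4: 0,2,4,...) and odd columns
-          // (v1/v3/v5: 1,3,5,...). Each stride-2 output needs taps at
-          // columns 2k, 2k+1 and 2k+2, i.e. the even vector, the odd vector,
-          // and the even vector shifted one lane (the ext results in
-          // v9/v11/v14), so one 8-lane multiply covers eight outputs per row.
-          // A rough intrinsics sketch (illustrative only; w0/w1/w2 stand for
-          // the broadcast taps of one kernel row):
-          //   int8x8x2_t d  = vld2_s8(din);   // d.val[0]=even, d.val[1]=odd
-          //   int8x8_t  ev2 = vext_s8(d.val[0], vdup_n_s8(din[16]), 1);
-          //   int16x8_t acc = vmull_s8(d.val[0], w0);
-          //   acc = vmlal_s8(acc, d.val[1], w1);
-          //   acc = vmlal_s8(acc, ev2, w2);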
/* outr00 += 246810 * w02 */ - - // r1 - "smlal v6.8h, %[v3].8b, v2.8b \n" /* outr00 = 02468 * w00 */ - "smlal v7.8h, %[v4].8b, v3.8b\n" /* outr00 += 13579 * w01 */ - "smlal v8.8h, %[v5].8b, v11.8b\n" /* outr00 += 246810 * w02 */ - - "saddw v12.4s, v12.4s, v6.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v6.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v7.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v7.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v8.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v8.8h \n" /* v11 += outr00.high*/ - - // r2 - "smull v6.8h, %[v6].8b, v4.8b \n" /* outr00 = 02468 * w00 */ - "smull v7.8h, %[v7].8b, v5.8b\n" /* outr00 += 13579 * w01 */ - "smull v8.8h, %[v8].8b, v14.8b\n" /* outr00 += 246810 * w02 */ - - "ld2 {v0.8b - v1.8b}, [%[din_ptr0]], #16 \n" /*load - a00-a015 - to q0*/ - "ld2 {v2.8b - v3.8b}, [%[din_ptr1]], #16 \n" /* load - a00-a015 - to q0*/ - "ld2 {v4.8b - v5.8b}, [%[din_ptr2]], #16 \n" /*load - a00-a015 - to q0*/ - - "saddw v12.4s, v12.4s, v6.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v6.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v7.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v7.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v8.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v8.8h \n" /* v11 += outr00.high*/ - - "smax v12.4s, v12.4s, v10.4s \n" /*relu*/ - "smax v13.4s, v13.4s, v10.4s \n" /*relu*/ - - "subs %[cnt], %[cnt], #1 \n" - - "stp q12, q13, [%[ptr_out0]], #32 \n" /* store q10, q11 -> - ptr_out */ - - "ld1 {v12.4s}, [%[bias_val]] \n" /* dup v10, bias */ - "ld1 {v13.4s}, [%[bias_val]] \n" /* dup v10, bias */ - "bne 1b \n" - // right - "3: \n" - "ld1 {v14.8b}, [%[vmask]], #8 \n" - "ld1 {v15.8b}, [%[vmask]] \n" - - "bif v0.8b, v10.8b, v14.8b \n" - "bif v1.8b, v10.8b, v15.8b \n" - "bif v2.8b, v10.8b, v14.8b \n" - "bif v3.8b, v10.8b, v15.8b \n" - "bif v4.8b, v10.8b, v14.8b \n" - "bif v5.8b, v10.8b, v15.8b \n" - - "ext v6.8b, v0.8b, v10.8B, #1 \n" /* vext_s8(vzero, vinr0, 7); - 2468.. */ - "ext v7.8b, v2.8b, v10.8B, #1 \n" /* vext_s8(vzero, vinr0, 7); - 2468..*/ - "ext v8.8b, v4.8b, v10.8B, #1 \n" /* vext_s8(vzero, vinr0, 7); - 2468.. 
*/ - - // r0 - "smull v14.8h, %[v0].8b, v0.8b \n" /* outr00 = 02468 * w00 */ - "smull v15.8h, %[v1].8b, v1.8b\n" /* outr00 += 13579 * w01 */ - "smull v16.8h, %[v2].8b, v6.8b\n" /* outr00 += 246810 * w02 */ - - // r1 - "smlal v14.8h, %[v3].8b, v2.8b \n" /* outr00 = 02468 * w00 */ - "smlal v15.8h, %[v4].8b, v3.8b\n" /* outr00 += 13579 * w01 */ - "smlal v16.8h, %[v5].8b, v7.8b\n" /* outr00 += 246810 * w02 */ - - "saddw v12.4s, v12.4s, v14.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v14.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v15.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v15.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v16.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v16.8h \n" /* v11 += outr00.high*/ - - // r2 - "smull v14.8h, %[v6].8b, v4.8b \n" /* outr00 = 02468 * w00 */ - "smull v15.8h, %[v7].8b, v5.8b\n" /* outr00 += 13579 * w01 */ - "smull v16.8h, %[v8].8b, v8.8b\n" /* outr00 += 246810 * w02 */ - - "ldp q0, q1, [%[ptr_out0]] \n" /* dup v10, bias */ - "ldp q9, q11, [%[rst_mask]] \n" /* dup v10, bias */ - - "saddw v12.4s, v12.4s, v14.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v14.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v15.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v15.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v16.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v16.8h \n" /* v11 += outr00.high*/ - - "smax v12.4s, v12.4s, v10.4s \n" /*relu*/ - "smax v13.4s, v13.4s, v10.4s \n" /*relu*/ - - "bif v12.16b, v0.16b, v9.16b \n" - "bif v13.16b, v1.16b, v11.16b \n" - - "stp q12, q13, [%[ptr_out0]], #32 \n" /* store q10, q11 -> - ptr_out */ - - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [ptr_out0] "+r"(doutr0), - [vmask] "+r"(val_mask) - : [v0] "w"(wr00), - [v1] "w"(wr01), - [v2] "w"(wr02), - [v3] "w"(wr10), - [bias_val] "r"(vbias), - [v4] "w"(wr11), - [v5] "w"(wr12), - [v6] "w"(wr20), - [v7] "w"(wr21), - [v8] "w"(wr22), - [rst_mask] "r"(rmask) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16"); -#else - unsigned int* rst_mask = rmask; - // prefetch input - // store weights - asm volatile("vld1.8 {d0-d1}, [%[wei_ptr]] \n" - : - : [wei_ptr] "r"(wei_ptr) - : "memory"); - asm volatile( - // left - "pld [%[din_ptr0]] @ preload data\n" - "pld [%[din_ptr1]] @ preload data\n" - "pld [%[din_ptr2]] @ preload data\n" - "vdup.s8 d2, d0[0] @ d2 = w00, w00, w00, w00\n" - "vdup.s8 d3, d0[1] @ d3 = w01, w01, w01, w01\n" - "vdup.s8 d4, d0[2] @ d4 = w02, w02, w02, w02\n" - "vld2.8 {d12-d13}, [%[din_ptr0]] @ load din00= 0 2 4 6 8\n" // d10 = 0 2 4 6 - "vld2.8 {d14-d15}, [%[din_ptr1]] @ load din00= 0 2 4 6 8\n" // d12 = 0 2 4 6 - "vld2.8 {d16-d17}, [%[din_ptr2]] @ load din00= 0 2 4 6 8\n" // d14 = 0 2 4 6 - "vmov.u32 d11, #0 @ zero\n" - - "vdup.s8 d5, d0[3] @ d2 = w00, w00, w00, w00\n" - "vdup.s8 d6, d0[4] @ d3 = w01, w01, w01, w01\n" - "vdup.s8 d7, d0[5] @ d4 = w02, w02, w02, w02\n" - - "vext.8 d18, d11, d13, #7 @ ext \n" // d16 = -1 1 3 5 - "vext.8 d19, d11, d15, #7 @ ext \n" // d17 = -1 1 3 5 - "vext.8 d20, d11, d17, #7 @ ext \n" // d18 = -1 1 3 5 - - // r0 - "vmull.s8 q13, d12, d3 @ out0 = din0 * w01 \n" // q12 = d12 * w01 - "vmull.s8 q14, d13, d4 @ out1 = din0 * w02 \n" // q12 = d12 * w02 - "vmull.s8 q15, d18, d2 @ out2 = din0 * w00 \n" // q12 = d12 * w02 - - "vdup.s8 d8, d0[6] @ d2 = w00, w00, w00, 
w00\n" - "vdup.s8 d9, d0[7] @ d3 = w01, w01, w01, w01\n" - "vdup.s8 d10, d1[0] @ d4 = w02, w02, w02, w02\n" - - // out0 - "vdup.32 q11, %[bias] @ and \n" // q8 = - // vbias - "vdup.32 q12, %[bias] @ and \n" // q9 = - // vbias - - // r1 - "vmlal.s8 q13, d14, d6 @ out0 += din1 * w11 \n" // q12 = d12 * w11 - "vmlal.s8 q14, d15, d7 @ out1 += din1 * w12 \n" // q12 = d12 * w11 - "vmlal.s8 q15, d19, d5 @ out2 += din1 * w10 \n" // q12 = d12 * w11 - - "add %[din_ptr0], #15 @add \n" - - "vaddw.s16 q11, q11, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "add %[din_ptr1], #15 @add \n" - - "vaddw.s16 q11, q11, d28 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d29 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "add %[din_ptr2], #15 @add \n" - - "vaddw.s16 q11, q11, d30 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d31 @addw \n" // out1_1 += - // vget_high_s16(out10) - - // r2 - "vmull.s8 q13, d16, d9 @ out0 += din1 * w21 \n" // q12 = d12 * w11 - "vmull.s8 q14, d17, d10 @ out1 += din1 * w22 \n" // q12 = d12 * w11 - "vmull.s8 q15, d20, d8 @ out2 += din1 * w20 \n" // q12 = d12 * w11 - - "pld [%[din_ptr0]] @ preload data\n" - "pld [%[din_ptr1]] @ preload data\n" - "pld [%[din_ptr2]] @ preload data\n" - - "vaddw.s16 q11, q11, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmov.u32 q8, #0 @ max \n" // max - - "vaddw.s16 q11, q11, d28 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d29 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vaddw.s16 q11, q11, d30 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d31 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmax.s32 q11, q11, q8 @ max\n" - "vmax.s32 q12, q12, q8 @ max\n" - - "vst1.32 {d22-d23}, [%[dout_ptr1]]! @ store\n" - "cmp %[cnt], #1 \n" - "vst1.32 {d24-d25}, [%[dout_ptr1]]! @ store\n" - "blt 1f \n" - - // mid - "2: \n" - "vld2.8 {d12-d13}, [%[din_ptr0]]! @ load din00= 0 2 4 6 8\n" // d10 = 0 2 4 6 - "vld2.8 {d14-d15}, [%[din_ptr1]]! @ load din00= 0 2 4 6 8\n" // d12 = 0 2 4 6 - "vld2.8 {d16-d17}, [%[din_ptr2]]! 
@ load din00= 0 2 4 6 8\n" // d14 = 0 2 4 6 - - "vld1.8 {d21}, [%[din_ptr0]] @ load din00= 16 17\n" // d10 = 0 2 - // 4 6 - "vld1.8 {d22}, [%[din_ptr1]] @ load din00= 16 17\n" // d12 = 0 2 - // 4 6 - "vld1.8 {d23}, [%[din_ptr2]] @ load din00= 16 17\n" // d14 = 0 2 - // 4 6 - - "vext.8 d18, d12, d21, #1 @ ext din00 = 2 4 6 8\n" // d16 = 2 - // 4 6 8 - "vext.8 d19, d14, d22, #1 @ ext \n" // d17 = 2 4 6 8 - "vext.8 d20, d16, d23, #1 @ ext \n" // d18 = 2 4 6 8 - - // r0 - "vmull.s8 q13, d12, d2 @ out0 = din0 * w00 \n" // q12 = 0 2 4 6 - "vmull.s8 q14, d13, d3 @ out1 = din0 * w01 \n" // q12 = 1 3 5 7 - "vmull.s8 q15, d18, d4 @ out2 = din0 * w02 \n" // q12 = 2 4 6 8 - - // out0 - "vdup.32 q11, %[bias] @ and \n" // q8 = - // vbias - "vdup.32 q12, %[bias] @ and \n" // q9 = - // vbias - - // r1 - "vmlal.s8 q13, d14, d5 @ out0 += din1 * w10 \n" // q12 = 0 2 4 6 - "vmlal.s8 q14, d15, d6 @ out1 += din1 * w11 \n" // q12 = 1 3 5 7 - "vmlal.s8 q15, d19, d7 @ out2 += din1 * w12 \n" // q12 = 2 4 6 8 - - "vaddw.s16 q11, q11, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vaddw.s16 q11, q11, d28 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d29 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vaddw.s16 q11, q11, d30 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d31 @addw \n" // out1_1 += - // vget_high_s16(out10) - - // r2 - "vmull.s8 q13, d16, d8 @ out0 += din1 * w20 \n" // q12 = 0 2 4 6 - "vmull.s8 q14, d17, d9 @ out1 += din1 * w21 \n" // q12 = 1 3 5 7 - "vmull.s8 q15, d20, d10 @ out2 += din1 * w22 \n" // q12 = 2 4 6 8 - - "vaddw.s16 q11, q11, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - "vmov.u32 q8, #0 @ mov \n" - - "vaddw.s16 q11, q11, d28 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d29 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vaddw.s16 q11, q11, d30 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d31 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "pld [%[din_ptr0]] @ preload data\n" - "pld [%[din_ptr1]] @ preload data\n" - "pld [%[din_ptr2]] @ preload data\n" - - "vmax.s32 q11, q11, q8 @ max\n" - "vmax.s32 q12, q12, q8 @ max\n" - - "vst1.32 {d22-d23}, [%[dout_ptr1]]! @ store\n" - - "subs %[cnt], #1 \n" - "vst1.32 {d24-d25}, [%[dout_ptr1]]! @ store\n" - "bne 2b \n" - // right - "1: \n" - "cmp %[size_pad_right], #1 \n" - "blt 3f \n" - "vld2.8 {d12-d13}, [%[din_ptr0]]! @ load din00= 0 2 4 6 8\n" // d10 = 0 2 4 6 - "vld2.8 {d14-d15}, [%[din_ptr1]]! @ load din00= 0 2 4 6 8\n" // d12 = 0 2 4 6 - "vld2.8 {d16-d17}, [%[din_ptr2]]! 
@ load din00= 0 2 4 6 8\n" // d14 = 0 2 4 6 - "vld1.8 {d28-d29}, [%[mask]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - - // out0 - "vdup.32 q11, %[bias] @ and \n" // q8 = vbias - "vdup.32 q12, %[bias] @ and \n" // q9 = vbias - - "vbif.8 d12, d11, d28 @ bit select, deal with right pad\n" - "vbif.8 d13, d11, d29 @ bit select, deal with right pad\n" - - "vbif.8 d14, d11, d28 @ bit select, deal with right pad\n" - "vbif.8 d15, d11, d29 @ bit select, deal with right pad\n" - - "vbif.8 d16, d11, d28 @ bit select, deal with right pad\n" - "vbif.8 d17, d11, d29 @ bit select, deal with right pad\n" - - "vext.8 d18, d12, d11, #1 @ ext din00 = 2 4 6 8\n" // d16 = -1 - // 1 3 5 - "vext.8 d19, d14, d11, #1 @ ext \n" // d17 = -1 1 3 5 - "vext.8 d20, d16, d11, #1 @ ext \n" // d18 = -1 1 3 5 - - // r0 - "vmull.s8 q13, d12, d2 @ out0 = din0 * w00 \n" // q12 = 0 2 4 6 - "vmull.s8 q14, d13, d3 @ out1 = din0 * w01 \n" // q12 = 1 3 5 7 - "vmull.s8 q15, d18, d4 @ out2 = din0 * w02 \n" // q12 = 2 4 6 8 - - // r1 - "vmlal.s8 q13, d14, d5 @ out0 += din1 * w11 \n" // q12 = 0 2 4 6 - "vmlal.s8 q14, d15, d6 @ out1 += din1 * w12 \n" // q12 = 1 3 5 7 - "vmlal.s8 q15, d19, d7 @ out2 += din1 * w10 \n" // q12 = 2 4 6 8 - - "vld1.32 {d12-d13}, [%[dout_ptr1]]! @ load din00= 0 1 2 3 4 5 6 " - "7 8 9\n" - "vld1.32 {d14-d15}, [%[dout_ptr1]] @ load din00= 0 1 2 3 4 5 6 " - "7 8 9\n" - - "vaddw.s16 q11, q11, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vaddw.s16 q11, q11, d28 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d29 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vaddw.s16 q11, q11, d30 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d31 @addw \n" // out1_1 += - // vget_high_s16(out10) - - // r2 - "vmull.s8 q13, d16, d8 @ out0 += din1 * w11 \n" // q12 = 0 2 4 6 - "vmull.s8 q14, d17, d9 @ out1 += din1 * w12 \n" // q12 = 1 3 5 7 - "vmull.s8 q15, d20, d10 @ out2 += din1 * w10 \n" // q12 = 2 4 6 8 - - "vld1.32 {d2-d3}, [%[rs_mask]]! @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - "vld1.32 {d4-d5}, [%[rs_mask]] @ load din00= 0 1 2 3 4 5 6 7 8 " - "9\n" - - "vaddw.s16 q11, q11, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "sub %[dout_ptr1], #16 @ sub \n" - "vmov.u32 q8, #0 @mov \n" - "vaddw.s16 q11, q11, d28 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d29 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vaddw.s16 q11, q11, d30 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d31 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmax.s32 q11, q11, q8 @ max\n" - "vmax.s32 q12, q12, q8 @ max\n" - - "vbif q11, q6, q1 @ bit select, deal with right pad\n" - "vbif q12, q7, q2 @ bit select, deal with right pad\n" - - "vst1.32 {d22-d23}, [%[dout_ptr1]]! @ store\n" - "vst1.32 {d24-d25}, [%[dout_ptr1]]! 
@ store\n" - "3: \n" - - : [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [dout_ptr1] "+r"(doutr0), - [cnt] "+r"(cnt), - [bias] "+r"(bias_val), - [rs_mask] "+r"(rst_mask) - : [mask] "r"(vmask), [size_pad_right] "r"(size_pad_right) - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif - dout_ptr += w_out; - } - } - } -} -// w_in <= 16 -void conv_depthwise_3x3s2p1_bias_s_relu_int8(int* dout, - const signed char* din, - const signed char* weights, - const int* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - // printf("3x3s2 mult height \n"); - //! pad is done implicit - // const char zero[8] = {0, 0, 0, 0, 0, 0, 0, 0}; - //! for 4x6 convolution window - const unsigned char right_pad_idx[16] = { - 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15}; - const unsigned int right_pad_rst[8] = {0, 1, 2, 3, 4, 5, 6, 7}; - - // printf("conv3x3_dw start \n"); - signed char* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(signed char)); - int* write_ptr = - reinterpret_cast(ctx->workspace_data()) + w_out; - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - int w_stride = 9; - - unsigned int size_pad_right = (unsigned int)(w_in); - - uint8x8_t vmask_rp1 = - vcgt_u8(vdup_n_u8(size_pad_right), vld1_u8(right_pad_idx)); - uint8x8_t vmask_rp2 = - vcgt_u8(vdup_n_u8(size_pad_right), vld1_u8(right_pad_idx + 8)); - unsigned int rst_remain = (unsigned int)w_out; - uint32x4_t vmask_result1 = - vcgtq_u32(vdupq_n_u32(rst_remain), vld1q_u32(right_pad_rst)); - uint32x4_t vmask_result2 = - vcgtq_u32(vdupq_n_u32(rst_remain), vld1q_u32(right_pad_rst + 4)); - - uint8x16_t vmask_rp = - vcgtq_u8(vdupq_n_u8(size_pad_right), vld1q_u8(right_pad_idx)); - unsigned char vmask[16]; - vst1q_u8(vmask, vmask_rp); - - unsigned int rmask[8]; - vst1q_u32(rmask, vmask_result1); - vst1q_u32(rmask + 4, vmask_result2); - int8x8_t vzero = vdup_n_s8(0); - int32x4_t vzero_32 = vdupq_n_s32(0); - - for (int n = 0; n < num; ++n) { - const signed char* din_batch = din + n * ch_in * size_in_channel; - int* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int c = 0; c < ch_in; c++) { - int* dout_ptr = dout_batch + c * size_out_channel; - - const signed char* din_ch_ptr = din_batch + c * size_in_channel; - - int bias_val = flag_bias ? bias[c] : 0; - - const signed char* wei_ptr = weights + c * w_stride; - -#ifdef __aarch64__ - int vbias[4] = {bias_val, bias_val, bias_val, bias_val}; - int8x8_t wr00 = vdup_n_s8(wei_ptr[0]); - int8x8_t wr10 = vdup_n_s8(wei_ptr[3]); - int8x8_t wr20 = vdup_n_s8(wei_ptr[6]); - - int8x8_t wr01 = vdup_n_s8(wei_ptr[1]); - int8x8_t wr11 = vdup_n_s8(wei_ptr[4]); - int8x8_t wr21 = vdup_n_s8(wei_ptr[7]); - - int8x8_t wr02 = vdup_n_s8(wei_ptr[2]); - int8x8_t wr12 = vdup_n_s8(wei_ptr[5]); - int8x8_t wr22 = vdup_n_s8(wei_ptr[8]); -#endif - - int* doutr0 = nullptr; - - const signed char* dr0 = din_ch_ptr; - const signed char* dr1 = dr0 + w_in; - const signed char* dr2 = dr1 + w_in; - - const signed char* din_ptr0 = nullptr; - const signed char* din_ptr1 = nullptr; - const signed char* din_ptr2 = nullptr; - - for (int i = 0; i < h_in; i += 2) { - //! 
process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - - doutr0 = dout_ptr; - - int out_buf1[8]; - if (i == 0) { - din_ptr0 = zero_ptr; - din_ptr1 = dr0; - din_ptr2 = dr1; - dr0 = dr1; - dr1 = dr2; - dr2 = dr1 + w_in; - } else { - dr0 = dr2; - dr1 = dr2 + w_in; - dr2 = dr1 + w_in; - } - //! process bottom pad - if (i + 2 > h_in) { - switch (i + 2 - h_in) { - case 2: - din_ptr1 = zero_ptr; - case 1: - din_ptr2 = zero_ptr; - default: - break; - } - } -#ifdef __aarch64__ - unsigned int* rst_mask = rmask; - unsigned char* val_mask = vmask; - asm volatile( - "PRFM PLDL1KEEP, [%[din_ptr0]] \n" - "PRFM PLDL1KEEP, [%[din_ptr1]] \n" - "PRFM PLDL1KEEP, [%[din_ptr2]] \n" - "movi v16.4s, #0x0\n" - // left - "ld1 {v10.8b}, [%[vmask]], #8 \n" - "ld1 {v11.8b}, [%[vmask]] \n" - "ld2 {v0.8b - v1.8b}, [%[din_ptr0]] \n" /*load a00-a015 - to q0*/ - "ld2 {v2.8b - v3.8b}, [%[din_ptr1]] \n" /* load a00-a015 - to q0*/ - "ld2 {v4.8b - v5.8b}, [%[din_ptr2]] \n" /*load a00-a015 - to q0*/ - - "bif v0.8b, v16.8b, v10.8b \n" - "bif v1.8b, v16.8b, v11.8b \n" - "bif v2.8b, v16.8b, v10.8b \n" - "bif v3.8b, v16.8b, v11.8b \n" - "bif v4.8b, v16.8b, v10.8b \n" - "bif v5.8b, v16.8b, v11.8b \n" - - "ld1 {v12.4s}, [%[bias_val]] \n" /* dup v10, bias*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /* dup v10, bias */ - - "ext v6.8b, v16.8b, v1.8B, #7 \n" /* vext_s8(vzero, vinr0, 7); - 013579 */ - "ext v7.8b, v16.8b, v3.8B, #7 \n" /* vext_s8(vzero, vinr0, 7); - 013579 */ - "ext v8.8b, v16.8b, v5.8B, #7 \n" /* vext_s8(vzero, vinr0, 7); - 013579 */ - - // r0 - "smull v17.8h, %[v1].8b, v0.8b \n" /* outr00 = 02468 * w01 */ - "smull v18.8h, %[v2].8b, v1.8b\n" /* outr00 += 13579 * w02 */ - "smull v19.8h, %[v0].8b, v6.8b\n" /* outr00 += 013579 * w00 */ - - // "ldp q0, q1, [%[ptr_out0]] \n" /* dup v10, - // bias */ "ldp q10, q11, [%[rst_mask]] \n" /* - // dup v10, bias */ - - // r1 - "smlal v17.8h, %[v4].8b, v2.8b \n" /* outr00 = 02468 * w01 */ - "smlal v18.8h, %[v5].8b, v3.8b\n" /* outr00 += 13579 * w02 */ - "smlal v19.8h, %[v3].8b, v7.8b\n" /* outr00 += 013579 * w00 */ - - "saddw v12.4s, v12.4s, v17.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v17.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v18.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - // r2 - "smull v17.8h, %[v7].8b, v4.8b \n" /* outr00 = 02468 * w01 */ - "smull v18.8h, %[v8].8b, v5.8b\n" /* outr00 += 13579 * w02 */ - "smull v19.8h, %[v6].8b, v8.8b\n" /* outr00 += 013579 * w00 */ - - "saddw v12.4s, v12.4s, v17.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v17.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v18.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v18.8h \n" /* v11 += outr00.high*/ - - "saddw v12.4s, v12.4s, v19.4h \n" /* v10 += outr00.low*/ - "saddw2 v13.4s, v13.4s, v19.8h \n" /* v11 += outr00.high*/ - - "smax v12.4s, v12.4s, v16.4s \n" /*relu*/ - "smax v13.4s, v13.4s, v16.4s \n" /*relu*/ - - // "bif v12.16b, v0.16b, v10.16b \n" - // "bif v13.16b, v1.16b, v11.16b \n" - - "stp q12, q13, [%[ptr_out0]] \n" /* store q10, q11 -> ptr_out - */ - : [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [vmask] "+r"(val_mask) - : [v0] "w"(wr00), - [v1] "w"(wr01), - [v2] "w"(wr02), - [v3] "w"(wr10), - [bias_val] "r"(vbias), - [v4] "w"(wr11), - [v5] "w"(wr12), - [v6] "w"(wr20), - [v7] "w"(wr21), - [v8] "w"(wr22), - 
[rst_mask] "r"(rmask), - [ptr_out0] "r"(out_buf1) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20"); - -#else - unsigned int* rst_mask = rmask; - // prefetch input - // store weights - asm volatile("vld1.8 {d0-d1}, [%[wei_ptr]] \n" - : - : [wei_ptr] "r"(wei_ptr) - : "memory"); - asm volatile( - // left - "pld [%[din_ptr0]] @ preload data\n" - "pld [%[din_ptr1]] @ preload data\n" - "pld [%[din_ptr2]] @ preload data\n" - "vdup.s8 d2, d0[0] @ d2 = w00, w00, w00, w00\n" - "vdup.s8 d3, d0[1] @ d3 = w01, w01, w01, w01\n" - "vdup.s8 d4, d0[2] @ d4 = w02, w02, w02, w02\n" - "vld2.8 {d12-d13}, [%[din_ptr0]] @ load din00= 0 2 4 6 8\n" // d10 = 0 2 4 6 - "vld2.8 {d14-d15}, [%[din_ptr1]] @ load din00= 0 2 4 6 8\n" // d12 = 0 2 4 6 - "vld2.8 {d16-d17}, [%[din_ptr2]] @ load din00= 0 2 4 6 8\n" // d14 = 0 2 4 6 - "vld1.8 {d28-d29}, [%[mask]] @ load din00= 0 1 2 3 4 5 6 7 " - "8 9\n" - "vmov.u32 d11, #0 @ zero\n" - - "vdup.s8 d5, d0[3] @ d2 = w00, w00, w00, w00\n" - "vdup.s8 d6, d0[4] @ d3 = w01, w01, w01, w01\n" - "vdup.s8 d7, d0[5] @ d4 = w02, w02, w02, w02\n" - - "vbif.8 d12, d11, d28 @ bit select, deal with right pad\n" - "vbif.8 d13, d11, d29 @ bit select, deal with right pad\n" - - "vbif.8 d14, d11, d28 @ bit select, deal with right pad\n" - "vbif.8 d15, d11, d29 @ bit select, deal with right pad\n" - - "vbif.8 d16, d11, d28 @ bit select, deal with right pad\n" - "vbif.8 d17, d11, d29 @ bit select, deal with right pad\n" - - "vext.8 d18, d11, d13, #7 @ ext \n" // d16 = -1 1 3 5 - "vext.8 d19, d11, d15, #7 @ ext \n" // d17 = -1 1 3 5 - "vext.8 d20, d11, d17, #7 @ ext \n" // d18 = -1 1 3 5 - - // "pld [%[dout_ptr1]] @ preload data\n" - - // r0 - "vmull.s8 q13, d12, d3 @ out0 = din0 * w01 \n" // q12 = d12 * w01 - "vmull.s8 q14, d13, d4 @ out1 = din0 * w02 \n" // q12 = d12 * w02 - "vmull.s8 q15, d18, d2 @ out2 = din0 * w00 \n" // q12 = d12 * w02 - - "vdup.s8 d8, d0[6] @ d2 = w00, w00, w00, w00\n" - "vdup.s8 d9, d0[7] @ d3 = w01, w01, w01, w01\n" - "vdup.s8 d10, d1[0] @ d4 = w02, w02, w02, w02\n" - - // out0 - "vdup.32 q11, %[bias] @ and \n" // q8 = - // vbias - "vdup.32 q12, %[bias] @ and \n" // q9 = - // vbias - - // r1 - "vmlal.s8 q13, d14, d6 @ out0 += din1 * w11 \n" // q12 = d12 * w11 - "vmlal.s8 q14, d15, d7 @ out1 += din1 * w12 \n" // q12 = d12 * w11 - "vmlal.s8 q15, d19, d5 @ out2 += din1 * w10 \n" // q12 = d12 * w11 - - // "vld1.32 {d12-d13}, [%[dout_ptr1]]! @ load din00= 0 1 2 3 4 5 - // 6 7 8 9\n" "vld1.32 {d14-d15}, [%[dout_ptr1]] @ load din00= 0 - // 1 2 3 4 5 6 7 8 9\n" - - "vaddw.s16 q11, q11, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vaddw.s16 q11, q11, d28 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d29 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vaddw.s16 q11, q11, d30 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d31 @addw \n" // out1_1 += - // vget_high_s16(out10) - - // r2 - "vmull.s8 q13, d16, d9 @ out0 += din1 * w21 \n" // q12 = d12 * w11 - "vmull.s8 q14, d17, d10 @ out1 += din1 * w22 \n" // q12 = d12 * w11 - "vmull.s8 q15, d20, d8 @ out2 += din1 * w20 \n" // q12 = d12 * w11 - - // "vld1.32 {d2-d3}, [%[rs_mask]]! 
@ load din00= 0 1 2 3 4 5 6 7 - // 8 9\n" "vld1.32 {d4-d5}, [%[rs_mask]] @ load din00= 0 1 2 3 4 - // 5 6 7 8 9\n" - - // "sub %[dout_ptr1], #16 @ sub \n" - - "vaddw.s16 q11, q11, d26 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d27 @addw \n" // out1_1 += - // vget_high_s16(out10) - "vmov.u32 q8, #0 @ mov \n" - - "vaddw.s16 q11, q11, d28 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d29 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vaddw.s16 q11, q11, d30 @addw \n" // out1 += - // vget_low_s16(out10) - "vaddw.s16 q12, q12, d31 @addw \n" // out1_1 += - // vget_high_s16(out10) - - "vmax.s32 q11, q11, q8 @ max\n" - "vmax.s32 q12, q12, q8 @ max\n" - - // "vbif q11, q6, q1 @ bit select, deal with right pad\n" - // "vbif q12, q7, q2 @ bit select, deal with right pad\n" - - "vst1.32 {d22-d25}, [%[dout_ptr1]] @ store\n" - // "vst1.32 {d24-d25}, [%[dout_ptr1]]! @ store\n" - : [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [bias] "+r"(bias_val), - [rs_mask] "+r"(rst_mask) - : [mask] "r"(vmask), - [size_pad_right] "r"(size_pad_right), - [dout_ptr1] "r"(out_buf1) - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif - for (int w = 0; w < w_out; ++w) { - *doutr0++ = out_buf1[w]; - } - dout_ptr += w_out; - } - } - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_depthwise_3x3p0.cc b/lite/backends/arm/math/conv_depthwise_3x3p0.cc deleted file mode 100644 index ec7f3cfb84..0000000000 --- a/lite/backends/arm/math/conv_depthwise_3x3p0.cc +++ /dev/null @@ -1,4178 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/arm/math/conv_depthwise.h" -#include - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void conv_depthwise_3x3s1p0_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! for input width <= 4 -void conv_depthwise_3x3s1p0_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s2p0_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! 
for input width <= 4 -void conv_depthwise_3x3s2p0_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s1p0_bias_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! for input width <= 4 -void conv_depthwise_3x3s1p0_bias_s_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s2p0_bias_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! for input width <= 4 -void conv_depthwise_3x3s2p0_bias_s_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3p0(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - int stride, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - if (stride == 1) { - if (flag_relu) { - if (w_in > 5) { - conv_depthwise_3x3s1p0_bias_relu(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s1p0_bias_s_relu(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } else { - if (w_in > 5) { - conv_depthwise_3x3s1p0_bias(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s1p0_bias_s(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } - } else { //! stride = 2 - if (flag_relu) { - if (w_in > 8) { - conv_depthwise_3x3s2p0_bias_relu(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s2p0_bias_s_relu(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } else { - if (w_in > 8) { - conv_depthwise_3x3s2p0_bias(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s2p0_bias_s(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } - } -} -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, - * width > 4 - */ -// 4line -void conv_depthwise_3x3s1p0_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - //! pad is done implicit - const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; - //! 
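-  // Recap of the conv_depthwise_3x3p0 dispatch above (an illustrative
-  // restatement, not code from the patch): each of the four stride x relu
-  // combinations comes in a wide and a small-width variant, selected by a
-  // width threshold (w_in > 5 for stride 1, w_in > 8 for stride 2):
-  //
-  //   bool wide = (stride == 1) ? (w_in > 5) : (w_in > 8);
-  //   auto kernel =
-  //       (stride == 1)
-  //           ? (flag_relu ? (wide ? conv_depthwise_3x3s1p0_bias_relu
-  //                                : conv_depthwise_3x3s1p0_bias_s_relu)
-  //                        : (wide ? conv_depthwise_3x3s1p0_bias
-  //                                : conv_depthwise_3x3s1p0_bias_s))
-  //           : (flag_relu ? (wide ? conv_depthwise_3x3s2p0_bias_relu
-  //                                : conv_depthwise_3x3s2p0_bias_s_relu)
-  //                        : (wide ? conv_depthwise_3x3s2p0_bias
-  //                                : conv_depthwise_3x3s2p0_bias_s));
-  //   kernel(dout, din, weights, bias, flag_bias,
-  //          num, ch_in, h_in, w_in, h_out, w_out, ctx);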
for 4x6 convolution window - const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; - - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - int w_stride = 9; - - int tile_w = w_out >> 2; - int remain = w_out % 4; - - unsigned int size_pad_right = (unsigned int)(6 + (tile_w << 2) - w_in); - const int remian_idx[4] = {0, 1, 2, 3}; - - uint32x4_t vmask_rp1 = - vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_rp2 = - vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_result = - vcgtq_s32(vdupq_n_s32(remain), vld1q_s32(remian_idx)); - - unsigned int vmask[8]; - vst1q_u32(vmask, vmask_rp1); - vst1q_u32(vmask + 4, vmask_rp2); - - unsigned int rmask[4]; - vst1q_u32(rmask, vmask_result); - - float32x4_t vzero = vdupq_n_f32(0.f); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for -#ifdef __aarch64__ - for (int c = 0; c < ch_in; c++) { - float* dout_ptr = dout_batch + c * size_out_channel; - - const float* din_ch_ptr = din_batch + c * size_in_channel; - - float bias_val = flag_bias ? bias[c] : 0.f; - float vbias[4] = {bias_val, bias_val, bias_val, bias_val}; - - const float* wei_ptr = weights + c * w_stride; - - float32x4_t wr0 = vld1q_f32(wei_ptr); - float32x4_t wr1 = vld1q_f32(wei_ptr + 3); - float32x4_t wr2 = vld1q_f32(wei_ptr + 6); - // wr0 = vsetq_lane_f32(0.f, wr0, 3); - // wr1 = vsetq_lane_f32(0.f, wr1, 3); - // wr2 = vsetq_lane_f32(0.f, wr2, 3); - - float* doutr0 = dout_ptr; - float* doutr1 = doutr0 + w_out; - float* doutr2 = doutr1 + w_out; - float* doutr3 = doutr2 + w_out; - - const float* dr0 = din_ch_ptr; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - const float* dr4 = dr3 + w_in; - const float* dr5 = dr4 + w_in; - - const float* din_ptr0 = dr0; - const float* din_ptr1 = dr1; - const float* din_ptr2 = dr2; - const float* din_ptr3 = dr3; - const float* din_ptr4 = dr4; - const float* din_ptr5 = dr5; - - for (int i = 0; i < h_out; i += 4) { - //! process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - din_ptr3 = dr3; - din_ptr4 = dr4; - din_ptr5 = dr5; - - doutr0 = dout_ptr; - doutr1 = doutr0 + w_out; - doutr2 = doutr1 + w_out; - doutr3 = doutr2 + w_out; - - dr0 = dr4; - dr1 = dr5; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - dr5 = dr4 + w_in; - - //! process bottom pad - if (i + 5 >= h_in) { - switch (i + 5 - h_in) { - case 5: - din_ptr1 = zero_ptr; - case 4: - din_ptr2 = zero_ptr; - case 3: - din_ptr3 = zero_ptr; - case 2: - din_ptr4 = zero_ptr; - case 1: - din_ptr5 = zero_ptr; - case 0: - din_ptr5 = zero_ptr; - default: - break; - } - } - //! 
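-        // Worked example of the tiling arithmetic above (numbers illustrative):
-        // with w_in = 11 and pad 0, w_out = w_in - 2 = 9, so
-        //   tile_w = 9 >> 2 = 2 full 4-output tiles and remain = 9 % 4 = 1.
-        // Four stride-1 outputs of a 3-wide window read 4 + 3 - 1 = 6 input
-        // columns (hence the "4x6 convolution window"). The last window starts
-        // at input column tile_w * 4 = 8 and wants columns 8..13, but only
-        // 8..10 exist, so
-        //   size_pad_right = 6 + (tile_w << 2) - w_in = 6 + 8 - 11 = 3.
-        // The vcgeq compare against the descending table {5,4,3,2,1,0,0,0}
-        // then leaves exactly the first three lanes enabled, i.e. the three
-        // in-range input columns; everything else is zeroed by the bif selects.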
process bottom remain - if (i + 4 > h_out) { - switch (i + 4 - h_out) { - case 3: - doutr1 = write_ptr; - case 2: - doutr2 = write_ptr; - case 1: - doutr3 = write_ptr; - default: - break; - } - } - - int cnt = tile_w; - asm volatile( - "PRFM PLDL1KEEP, [%[din_ptr0]] \n" - "PRFM PLDL1KEEP, [%[din_ptr1]] \n" - "PRFM PLDL1KEEP, [%[din_ptr2]] \n" - "PRFM PLDL1KEEP, [%[din_ptr3]] \n" - "PRFM PLDL1KEEP, [%[din_ptr4]] \n" - "PRFM PLDL1KEEP, [%[din_ptr5]] \n" - "movi v21.4s, #0x0\n" /* out0 = 0 */ - - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ - - // mid - // "cmp %[cnt], #1 \n" - // "blt 5f \n" - "4: \n" - // r0 - "fmla v12.4s , v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v12.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla 
v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "st1 {v12.4s}, [%[doutr0]], #16 \n" - - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ - - // r4 - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "st1 {v13.4s}, [%[doutr1]], #16 \n" - - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ - - // r5 - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" - - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - - "subs %[cnt], %[cnt], #1 \n" - - "st1 {v15.4s}, [%[doutr3]], #16 \n" - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "bne 4b \n" - - // right - "5: \n" - "cmp %[remain], #1 \n" - "blt 0f \n" - "ld1 {v18.4s, v19.4s}, [%[vmask]] \n" - "ld1 {v22.4s}, [%[doutr0]] \n" - "ld1 {v23.4s}, [%[doutr1]] \n" - "ld1 {v24.4s}, [%[doutr2]] \n" - "ld1 {v25.4s}, [%[doutr3]] \n" - - "bif v0.16b, %[vzero].16b, v18.16b \n" - "bif v1.16b, %[vzero].16b, v19.16b \n" - "bif v2.16b, %[vzero].16b, v18.16b \n" - "bif v3.16b, %[vzero].16b, v19.16b \n" - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - - // r0 - "fmla v12.4s, v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v4.16b, %[vzero].16b, v18.16b \n" - "bif v5.16b, %[vzero].16b, v19.16b \n" - "bif v6.16b, %[vzero].16b, v18.16b \n" - "bif v7.16b, %[vzero].16b, v19.16b \n" - - "fmla v12.4s, v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - 
"bif v8.16b, %[vzero].16b, v18.16b \n" - "bif v9.16b, %[vzero].16b, v19.16b \n" - "bif v10.16b, %[vzero].16b, v18.16b \n" - "bif v11.16b, %[vzero].16b, v19.16b \n" - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ - "ld1 {v18.4s}, [%[rmask]] \n" - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v12.16b, v22.16b, v18.16b \n" - - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v12.4s}, [%[doutr0]], #16 \n" - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v13.16b, v23.16b, v18.16b \n" - - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v13.4s}, [%[doutr1]], #16 \n" - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v14.16b, v24.16b, v18.16b \n" - - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 
+= din0_2345 * - w0[2]*/ - - "bif v15.16b, v25.16b, v18.16b \n" - - "st1 {v15.4s}, [%[doutr3]], #16 \n" - // end - "0: \n" - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - dout_ptr = dout_ptr + 4 * w_out; - } - } -#else - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float bias_val = flag_bias ? bias[i] : 0.f; - - float* dout_channel = dout_batch + i * size_out_channel; - - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - - const float* din0_ptr = nullptr; - const float* din1_ptr = nullptr; - const float* din2_ptr = nullptr; - const float* din3_ptr = nullptr; - - float* doutr0 = nullptr; - float* doutr1 = nullptr; - - float* ptr_zero = const_cast(zero); - - for (int i = 0; i < h_out; i += 2) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - din3_ptr = dr3; - - doutr0 = dout_channel; - doutr1 = dout_channel + w_out; - - dr0 = dr2; - dr1 = dr3; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - //! process bottom pad - if (i + 3 >= h_in) { - switch (i + 3 - h_in) { - case 3: - din1_ptr = zero_ptr; - case 2: - din2_ptr = zero_ptr; - case 1: - din3_ptr = zero_ptr; - case 0: - din3_ptr = zero_ptr; - default: - break; - } - } - //! process bottom remain - if (i + 2 > h_out) { - doutr1 = write_ptr; - } - int cnt = tile_w; - unsigned int* rmask_ptr = rmask; - unsigned int* vmask_ptr = vmask; - asm volatile( - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" - "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r1\n" - "vld1.32 {d24-d25}, [%[din2_ptr]]! @ load din r2\n" - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r3\n" - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" - - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - "vdup.32 q5, %[bias_val] @ and \n" // q5 - // = - // vbias - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - // mid - "1: @ right pad entry\n" - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[din0_ptr]]! 
@ load din r0\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d24-d25}, [%[din2_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add " - "pointer\n" - - "subs %[cnt], #1 @ loop count minus 1\n" - - "vdup.32 q5, %[bias_val] @ and \n" // q4 - // = - // vbias - - "bne 1b @ jump to main loop start " - "point\n" - - // right - "3: @ right pad entry\n" - "cmp %[remain], #1 @ check whether has " - "mid cols\n" - "blt 0f @ jump to main loop start " - "point\n" - "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" - - "vld1.32 {d27}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d31}, [%[vmask]]! 
@ load din r0\n" - - "vbif d16, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d17, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d18, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vbif d20, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d21, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d22, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "vbif d24, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d25, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d26, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vbif d28, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d29, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d30, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d19}, [%[rmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[rmask]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[dout_ptr1]] @ load din r0\n" - "vld1.32 {d20-d21}, [%[dout_ptr2]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vbif d8, d16, d19 @ bit select, deal with right pad\n" - "vbif d9, d17, d23 @ bit select, deal with right pad\n" - - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - - "vbif d10, d20, d19 @ bit select, deal with right " - "pad\n" - "vbif d11, d21, d23 @ bit select, deal with right " - "pad\n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add " - "pointer\n" - "0: \n" - - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [din3_ptr] "+r"(din3_ptr), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - dout_channel += 2 * w_out; - } //! 
end of processing mid rows - } -#endif - } -} - -/** - * \brief depthwise convolution kernel 3x3, stride 2 - */ -// w_in > 7 -void conv_depthwise_3x3s2p0_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - - int tile_w = w_out >> 2; - int cnt_remain = w_out % 4; - - unsigned int size_right_remain = (unsigned int)(w_in - (tile_w << 3)); - - uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain), - vld1q_s32(right_pad_idx)); // 0 2 4 6 - uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain), - vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - uint32x4_t wmask = - vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx)); // 0 1 2 3 - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - - unsigned int dmask[12]; - - vst1q_u32(dmask, vmask_rp1); - vst1q_u32(dmask + 4, vmask_rp2); - vst1q_u32(dmask + 8, wmask); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - - float32x4_t vzero = vdupq_n_f32(0.f); - - float32x4_t wbias; - float bias_c = 0.f; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - bias_c = bias[i]; - } else { - wbias = vdupq_n_f32(0.f); - } - - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - const float* dr4 = dr3 + w_in; - - const float* din0_ptr = dr0; - const float* din1_ptr = dr1; - const float* din2_ptr = dr2; - const float* din3_ptr = dr3; - const float* din4_ptr = dr4; - - float* doutr0 = dout_channel; - float* doutr0_ptr = nullptr; - float* doutr1_ptr = nullptr; - -#ifdef __aarch64__ - for (int i = 0; i < h_out; i += 2) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - din3_ptr = dr3; - din4_ptr = dr4; - - doutr0_ptr = doutr0; - doutr1_ptr = doutr0 + w_out; - - dr0 = dr4; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - - //! process bottom pad - if (i + 4 >= h_in) { - switch (i + 4 - h_in) { - case 4: - din1_ptr = zero_ptr; - case 3: - din2_ptr = zero_ptr; - case 2: - din3_ptr = zero_ptr; - case 1: - din4_ptr = zero_ptr; - case 0: - din4_ptr = zero_ptr; - default: - break; - } - } - //! process output pad - if (i + 2 > h_out) { - doutr1_ptr = write_ptr; - } - int cnt = tile_w; - asm volatile( - // top - // Load up 12 elements (3 vectors) from each of 8 sources. 
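-            // Equivalent intrinsics view of the ld2 de-interleaving used
-            // throughout this loop (an illustrative sketch; r and the tap
-            // weights are assumed locals, and the r + 8 load overreads on the
-            // final tile, which the real code avoids via the masks above):
-            //   float32x4x2_t v   = vld2q_f32(r);      // v.val[0]={0,2,4,6}, v.val[1]={1,3,5,7}
-            //   float32x4_t  nxt  = vld1q_f32(r + 8);  // columns 8..11
-            //   float32x4_t  evn2 = vextq_f32(v.val[0], nxt, 1);  // {2,4,6,8}
-            // A stride-2 3-wide window then needs just three vector MACs per
-            // row, on {0,2,4,6}, {1,3,5,7} and {2,4,6,8}, with a single "ext"
-            // and no other in-register shuffling.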
- "0: \n" - "prfm pldl1keep, [%[inptr0]] \n" - "prfm pldl1keep, [%[inptr1]] \n" - "prfm pldl1keep, [%[inptr2]] \n" - "prfm pldl1keep, [%[inptr3]] \n" - "prfm pldl1keep, [%[inptr4]] \n" - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - // mid - "2: \n" - // r0 - "fmul v11.4s, v0.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmul v12.4s, v1.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v2.16b, v18.16b, #4 \n" // v10 = {2,4,6,8} - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v12.4s, v3.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v4.16b, v19.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmla v11.4s, v4.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - - "fmul v14.4s, v5.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v12.4s, v5.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - - "fmla v17.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - "fmla v16.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v6.16b, v20.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v7.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v8.16b, v21.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v9.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "fadd v17.4s, v17.4s, v13.4s \n" - - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - - "fadd v17.4s, v17.4s, v14.4s \n" - - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - "subs %[cnt], %[cnt], #1 \n" - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "bne 2b \n" - - // right - "1: \n" - "cmp %[remain], #1 \n" - "blt 4f \n" - "3: \n" - "bif v0.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v1.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "bif v2.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v3.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "bif v4.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v5.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "ext v10.16b, v0.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - "bif v6.16b, %[vzero].16b, %[mask1].16b \n" // 
pipei - "bif v7.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - // r0 - "fmul v11.4s, v0.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmul v12.4s, v1.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v2.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - "bif v8.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v9.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v12.4s, v3.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v4.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmla v11.4s, v4.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - - "fmul v14.4s, v5.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v12.4s, v5.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - - "fmla v17.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - "fmla v16.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v6.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v7.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v8.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - "ld1 {v0.4s}, [%[outptr0]] \n" - - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - "ld1 {v1.4s}, [%[outptr1]] \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v9.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "bif v16.16b, v0.16b, %[wmask].16b \n" // pipei - - "fadd v17.4s, v17.4s, v13.4s \n" - - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "fadd v17.4s, v17.4s, v14.4s \n" - - "bif v17.16b, v1.16b, %[wmask].16b \n" // pipei - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - "4: \n" - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); - doutr0 = doutr0 + 2 * w_out; - } -#else - for (int i = 0; i < h_out; i++) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - - doutr0_ptr = doutr0; - - dr0 = dr2; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - - //! process bottom pad - if (i + 2 > h_in) { - switch (i + 2 - h_in) { - case 2: - din1_ptr = zero_ptr; - case 1: - din2_ptr = zero_ptr; - default: - break; - } - } - int cnt = tile_w; - unsigned int* mask_ptr = dmask; - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. - "0: \n" - "vmov.u32 q9, #0 \n" - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r1\n" - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" - "vld2.32 {d28-d31}, [%[din2_ptr]]! 
@ load din r1\n" - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - - "vld1.32 {d16}, [%[din0_ptr]] @ load din r0\n" // q2={8,10,12,14} - - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - // mid - "2: \n" - "vext.32 q6, q10, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.32 {d16}, [%[din1_ptr]] @ load din r1\n" // q2={8,10,12,14} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // q0 * w00 - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w02 - - "vext.32 q7, q12, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.32 {d16}, [%[din2_ptr]] @ load din r1\n" // q2={8,10,12,14} - - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w02 - - "vext.32 q6, q14, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, " - "out0\n" // q6 * w02 - - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" // v4={0,2,4,6} v5={1,3,5,7} - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "subs %[cnt], #1 \n" - - "vld1.32 {d16}, [%[din0_ptr]] @ load din r0\n" // q2={8,10,12,14} - - "vst1.32 {d6-d7}, [%[outptr]]! \n" - - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - "bne 2b \n" - - // right - "1: \n" - "cmp %[remain], #1 \n" - "blt 3f \n" - - "vld1.f32 {d12-d15}, [%[mask_ptr]]! @ load mask\n" - - "vbif q10, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q11, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q12, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q13, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q14, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q15, q9, q7 @ bit select, deal " - "with right pad\n" - - "vext.32 q6, q10, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vext.32 q7, q12, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // q0 * w00 - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w02 - - "vext.32 q6, q14, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.f32 {d20-d21}, [%[outptr]] @ load output\n" - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w02 - - "vld1.f32 {d22-d23}, [%[mask_ptr]] @ load mask\n" - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, " - "out0\n" // q6 * w02 - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vbif.f32 q3, q10, q11 @ write mask\n" - - "vst1.32 {d6-d7}, [%[outptr]]! 
\n" - "3: \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [outptr] "+r"(doutr0_ptr), - [cnt] "+r"(cnt), - [mask_ptr] "+r"(mask_ptr) - : [remain] "r"(cnt_remain), - [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - - doutr0 = doutr0 + w_out; - } -#endif - } - } -} - -// 4line -void conv_depthwise_3x3s1p0_bias_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - //! pad is done implicit - const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; - //! for 4x6 convolution window - const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; - - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - int w_stride = 9; - - int tile_w = w_out >> 2; - int remain = w_out % 4; - - unsigned int size_pad_right = (unsigned int)(6 + (tile_w << 2) - w_in); - const int remian_idx[4] = {0, 1, 2, 3}; - - uint32x4_t vmask_rp1 = - vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_rp2 = - vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_result = - vcgtq_s32(vdupq_n_s32(remain), vld1q_s32(remian_idx)); - - unsigned int vmask[8]; - vst1q_u32(vmask, vmask_rp1); - vst1q_u32(vmask + 4, vmask_rp2); - - unsigned int rmask[4]; - vst1q_u32(rmask, vmask_result); - - float32x4_t vzero = vdupq_n_f32(0.f); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for -#ifdef __aarch64__ - for (int c = 0; c < ch_in; c++) { - float* dout_ptr = dout_batch + c * size_out_channel; - - const float* din_ch_ptr = din_batch + c * size_in_channel; - - float bias_val = flag_bias ? bias[c] : 0.f; - float vbias[4] = {bias_val, bias_val, bias_val, bias_val}; - - const float* wei_ptr = weights + c * w_stride; - - float32x4_t wr0 = vld1q_f32(wei_ptr); - float32x4_t wr1 = vld1q_f32(wei_ptr + 3); - float32x4_t wr2 = vld1q_f32(wei_ptr + 6); - // wr0 = vsetq_lane_f32(0.f, wr0, 3); - // wr1 = vsetq_lane_f32(0.f, wr1, 3); - // wr2 = vsetq_lane_f32(0.f, wr2, 3); - - float* doutr0 = dout_ptr; - float* doutr1 = doutr0 + w_out; - float* doutr2 = doutr1 + w_out; - float* doutr3 = doutr2 + w_out; - - const float* dr0 = din_ch_ptr; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - const float* dr4 = dr3 + w_in; - const float* dr5 = dr4 + w_in; - - const float* din_ptr0 = dr0; - const float* din_ptr1 = dr1; - const float* din_ptr2 = dr2; - const float* din_ptr3 = dr3; - const float* din_ptr4 = dr4; - const float* din_ptr5 = dr5; - - for (int i = 0; i < h_out; i += 4) { - //! process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - din_ptr3 = dr3; - din_ptr4 = dr4; - din_ptr5 = dr5; - - doutr0 = dout_ptr; - doutr1 = doutr0 + w_out; - doutr2 = doutr1 + w_out; - doutr3 = doutr2 + w_out; - - dr0 = dr4; - dr1 = dr5; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - dr5 = dr4 + w_in; - - //! 
process bottom pad - if (i + 5 >= h_in) { - switch (i + 5 - h_in) { - case 5: - din_ptr1 = zero_ptr; - case 4: - din_ptr2 = zero_ptr; - case 3: - din_ptr3 = zero_ptr; - case 2: - din_ptr4 = zero_ptr; - case 1: - din_ptr5 = zero_ptr; - case 0: - din_ptr5 = zero_ptr; - default: - break; - } - } - //! process bottom remain - if (i + 4 > h_out) { - switch (i + 4 - h_out) { - case 3: - doutr1 = write_ptr; - case 2: - doutr2 = write_ptr; - case 1: - doutr3 = write_ptr; - default: - break; - } - } - - int cnt = tile_w; - asm volatile( - "PRFM PLDL1KEEP, [%[din_ptr0]] \n" - "PRFM PLDL1KEEP, [%[din_ptr1]] \n" - "PRFM PLDL1KEEP, [%[din_ptr2]] \n" - "PRFM PLDL1KEEP, [%[din_ptr3]] \n" - "PRFM PLDL1KEEP, [%[din_ptr4]] \n" - "PRFM PLDL1KEEP, [%[din_ptr5]] \n" - "movi v21.4s, #0x0\n" /* out0 = 0 */ - - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ - - // mid - "4: \n" - // r0 - "fmla v12.4s , v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v12.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, 
%[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmax v12.4s, v12.4s, %[vzero].4s \n" /* relu */ - - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v12.4s}, [%[doutr0]], #16 \n" - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - // r4 - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmax v13.4s, v13.4s, %[vzero].4s \n" /* relu */ - - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v13.4s}, [%[doutr1]], #16 \n" - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - // r5 - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmax v14.4s, v14.4s, %[vzero].4s \n" /* relu */ - - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "fmax v15.4s, v15.4s, %[vzero].4s \n" /* relu */ - - "subs %[cnt], %[cnt], #1 \n" - - "st1 {v15.4s}, [%[doutr3]], #16 \n" - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "bne 4b \n" - - // right - "5: \n" - "cmp %[remain], #1 \n" - "blt 0f \n" - "ld1 {v18.4s, v19.4s}, [%[vmask]] \n" - "ld1 {v22.4s}, [%[doutr0]] \n" - "ld1 {v23.4s}, [%[doutr1]] \n" - "ld1 {v24.4s}, [%[doutr2]] \n" - "ld1 {v25.4s}, [%[doutr3]] \n" - - "bif v0.16b, %[vzero].16b, v18.16b \n" - "bif v1.16b, %[vzero].16b, v19.16b \n" - "bif v2.16b, %[vzero].16b, v18.16b \n" - "bif v3.16b, %[vzero].16b, v19.16b \n" - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 
\n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - - // r0 - "fmla v12.4s, v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v4.16b, %[vzero].16b, v18.16b \n" - "bif v5.16b, %[vzero].16b, v19.16b \n" - "bif v6.16b, %[vzero].16b, v18.16b \n" - "bif v7.16b, %[vzero].16b, v19.16b \n" - - "fmla v12.4s, v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "bif v8.16b, %[vzero].16b, v18.16b \n" - "bif v9.16b, %[vzero].16b, v19.16b \n" - "bif v10.16b, %[vzero].16b, v18.16b \n" - "bif v11.16b, %[vzero].16b, v19.16b \n" - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ - "ld1 {v18.4s}, [%[rmask]] \n" - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmax v12.4s, v12.4s, %[vzero].4s \n" /* relu */ - - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "bif v12.16b, v22.16b, v18.16b \n" - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ - "st1 {v12.4s}, [%[doutr0]], #16 \n" - - // r3 - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmax v13.4s, v13.4s, %[vzero].4s \n" /* relu */ - - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "bif v13.16b, v23.16b, v18.16b \n" - - "fmla v15.4s , 
v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "st1 {v13.4s}, [%[doutr1]], #16 \n" - - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmax v14.4s, v14.4s, %[vzero].4s \n" /* relu */ - - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "bif v14.16b, v24.16b, v18.16b \n" - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" - - "fmax v15.4s, v15.4s, %[vzero].4s \n" /* relu */ - - "bif v15.16b, v25.16b, v18.16b \n" - - "st1 {v15.4s}, [%[doutr3]], #16 \n" - // end - "0: \n" - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - dout_ptr = dout_ptr + 4 * w_out; - } - } -#else - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float bias_val = flag_bias ? bias[i] : 0.f; - - float* dout_channel = dout_batch + i * size_out_channel; - - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - - const float* din0_ptr = nullptr; - const float* din1_ptr = nullptr; - const float* din2_ptr = nullptr; - const float* din3_ptr = nullptr; - - float* doutr0 = nullptr; - float* doutr1 = nullptr; - - float* ptr_zero = const_cast(zero); - - for (int i = 0; i < h_out; i += 2) { - //! process top pad pad_h = 1 - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - din3_ptr = dr3; - - doutr0 = dout_channel; - doutr1 = dout_channel + w_out; - - dr0 = dr2; - dr1 = dr3; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - //! process bottom pad - if (i + 3 >= h_in) { - switch (i + 3 - h_in) { - case 3: - din1_ptr = zero_ptr; - case 2: - din2_ptr = zero_ptr; - case 1: - din3_ptr = zero_ptr; - case 0: - din3_ptr = zero_ptr; - default: - break; - } - } - //! process bottom remain - if (i + 2 > h_out) { - doutr1 = write_ptr; - } - int cnt = tile_w; - unsigned int* rmask_ptr = rmask; - unsigned int* vmask_ptr = vmask; - asm volatile( - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" - "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r1\n" - "vld1.32 {d24-d25}, [%[din2_ptr]]! @ load din r2\n" - "vld1.32 {d28-d29}, [%[din3_ptr]]! 
@ load din r3\n" - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" - - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - "vdup.32 q5, %[bias_val] @ and \n" // q5 - // = - // vbias - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - // mid - "1: @ right pad entry\n" - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d24-d25}, [%[din2_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" - "vmax.f32 q4, q4, %q[vzero] @ relu \n" - - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - "vmax.f32 q5, q5, %q[vzero] @ relu \n" - - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add " - "pointer\n" - - "subs %[cnt], #1 @ loop count minus 1\n" - - "vdup.32 q5, %[bias_val] @ and \n" // q4 - // = - // vbias - - "bne 1b @ jump to main loop start " - "point\n" - - // right - "3: @ right pad entry\n" - "cmp %[remain], #1 @ check whether has " - "mid cols\n" - "blt 0f @ jump to main loop start " - "point\n" - "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" - - "vld1.32 {d27}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d31}, [%[vmask]]! 
@ load din r0\n" - - "vbif d16, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d17, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d18, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vbif d20, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d21, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d22, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "vbif d24, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d25, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d26, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vbif d28, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d29, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d30, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d19}, [%[rmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[rmask]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[dout_ptr1]] @ load din r0\n" - "vld1.32 {d20-d21}, [%[dout_ptr2]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vmax.f32 q4, q4, %q[vzero] @ relu \n" - - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vbif d8, d16, d19 @ bit select, deal with right pad\n" - "vbif d9, d17, d23 @ bit select, deal with right pad\n" - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmax.f32 q5, q5, %q[vzero] @ relu \n" - - "vbif d10, d20, d19 @ bit select, deal with right " - "pad\n" - "vbif d11, d21, d23 @ bit select, deal with right " - "pad\n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add " - "pointer\n" - "0: \n" - - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [din3_ptr] "+r"(din3_ptr), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - dout_channel += 2 * w_out; - } //! 
end of processing mid rows - } -#endif - } -} -/** - * \brief depthwise convolution kernel 3x3, stride 2, with reulu - */ -// w_in > 7 -void conv_depthwise_3x3s2p0_bias_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - - int tile_w = w_out >> 2; - int cnt_remain = w_out % 4; - - unsigned int size_right_remain = (unsigned int)(w_in - (tile_w << 3)); - - uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain), - vld1q_s32(right_pad_idx)); // 0 2 4 6 - uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain), - vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - uint32x4_t wmask = - vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx)); // 0 1 2 3 - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - - unsigned int dmask[12]; - - vst1q_u32(dmask, vmask_rp1); - vst1q_u32(dmask + 4, vmask_rp2); - vst1q_u32(dmask + 8, wmask); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - - float32x4_t vzero = vdupq_n_f32(0.f); - - float32x4_t wbias; - float bias_c = 0.f; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - bias_c = bias[i]; - } else { - wbias = vdupq_n_f32(0.f); - } - - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - const float* dr4 = dr3 + w_in; - - const float* din0_ptr = dr0; - const float* din1_ptr = dr1; - const float* din2_ptr = dr2; - const float* din3_ptr = dr3; - const float* din4_ptr = dr4; - - float* doutr0 = dout_channel; - float* doutr0_ptr = nullptr; - float* doutr1_ptr = nullptr; - -#ifdef __aarch64__ - for (int i = 0; i < h_out; i += 2) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - din3_ptr = dr3; - din4_ptr = dr4; - - doutr0_ptr = doutr0; - doutr1_ptr = doutr0 + w_out; - - dr0 = dr4; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - - //! process bottom pad - if (i + 4 >= h_in) { - switch (i + 4 - h_in) { - case 4: - din1_ptr = zero_ptr; - case 3: - din2_ptr = zero_ptr; - case 2: - din3_ptr = zero_ptr; - case 1: - din4_ptr = zero_ptr; - case 0: - din4_ptr = zero_ptr; - default: - break; - } - } - //! process output pad - if (i + 2 > h_out) { - doutr1_ptr = write_ptr; - } - int cnt = tile_w; - asm volatile( - // top - // Load up 12 elements (3 vectors) from each of 8 sources. 
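// The stride-2 main loop below: "ld2" de-interleaves each input row into
// even lanes {x0,x2,x4,x6} and odd lanes {x1,x3,x5,x7}, and an "ext ..., #4"
// against the lookahead vector (v15, v18..v21) forms {x2,x4,x6,x8}. Since
// out[i] = w0*x[2i] + w1*x[2i+1] + w2*x[2i+2] per row, each row costs one
// multiply per weight. Five input rows yield two output rows per pass:
// v16 accumulates rows 0-2 (taps w0,w1,w2) and v17 rows 2-4, with the bias
// copied into the accumulators via "and" and ReLU fused as "fmax" vs. vzero.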
- "0: \n" - "prfm pldl1keep, [%[inptr0]] \n" - "prfm pldl1keep, [%[inptr1]] \n" - "prfm pldl1keep, [%[inptr2]] \n" - "prfm pldl1keep, [%[inptr3]] \n" - "prfm pldl1keep, [%[inptr4]] \n" - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - // mid - "2: \n" - // r0 - "fmul v11.4s, v0.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmul v12.4s, v1.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v2.16b, v18.16b, #4 \n" // v10 = {2,4,6,8} - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v12.4s, v3.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v4.16b, v19.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmla v11.4s, v4.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - - "fmul v14.4s, v5.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v12.4s, v5.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - - "fmla v17.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - "fmla v16.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v6.16b, v20.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v7.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v8.16b, v21.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v9.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ - - "fadd v17.4s, v17.4s, v13.4s \n" - - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "fadd v17.4s, v17.4s, v14.4s \n" - - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ - - "subs %[cnt], %[cnt], #1 \n" - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "bne 2b \n" - - // right - "1: \n" - "cmp %[remain], #1 \n" - "blt 4f \n" - "3: \n" - "bif v0.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v1.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "bif v2.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v3.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "bif v4.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v5.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "ext 
v10.16b, v0.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - "bif v6.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v7.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - // r0 - "fmul v11.4s, v0.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmul v12.4s, v1.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v2.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - "bif v8.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v9.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v12.4s, v3.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v4.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmla v11.4s, v4.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - - "fmul v14.4s, v5.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v12.4s, v5.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - - "fmla v17.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - "fmla v16.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v6.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v7.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v8.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - "ld1 {v0.4s}, [%[outptr0]] \n" - - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - "ld1 {v1.4s}, [%[outptr1]] \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v9.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ - - "fadd v17.4s, v17.4s, v13.4s \n" - - "bif v16.16b, v0.16b, %[wmask].16b \n" // pipei - - "fadd v17.4s, v17.4s, v14.4s \n" - - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ - - "bif v17.16b, v1.16b, %[wmask].16b \n" // pipei - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - "4: \n" - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); - doutr0 = doutr0 + 2 * w_out; - } -#else - for (int i = 0; i < h_out; i++) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - - doutr0_ptr = doutr0; - - dr0 = dr2; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - - //! process bottom pad - if (i + 2 > h_in) { - switch (i + 2 - h_in) { - case 2: - din1_ptr = zero_ptr; - case 1: - din2_ptr = zero_ptr; - default: - break; - } - } - int cnt = tile_w; - unsigned int* mask_ptr = dmask; - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. - "0: \n" - "vmov.u32 q9, #0 \n" - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r1\n" - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" - "vld2.32 {d28-d31}, [%[din2_ptr]]! 
@ load din r1\n" - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - - "vld1.32 {d16}, [%[din0_ptr]] @ load din r0\n" // q2={8,10,12,14} - - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - // mid - "2: \n" - "vext.32 q6, q10, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.32 {d16}, [%[din1_ptr]] @ load din r1\n" // q2={8,10,12,14} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // q0 * w00 - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w02 - - "vext.32 q7, q12, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.32 {d16}, [%[din2_ptr]] @ load din r1\n" // q2={8,10,12,14} - - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w02 - - "vext.32 q6, q14, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, " - "out0\n" // q6 * w02 - - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" // v4={0,2,4,6} v5={1,3,5,7} - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "subs %[cnt], #1 \n" - "vmax.f32 q3, q3, q9 @ relu \n" - - "vld1.32 {d16}, [%[din0_ptr]] @ load din r0\n" // q2={8,10,12,14} - - "vst1.32 {d6-d7}, [%[outptr]]! \n" - - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - "bne 2b \n" - - // right - "1: \n" - "cmp %[remain], #1 \n" - "blt 3f \n" - - "vld1.f32 {d12-d15}, [%[mask_ptr]]! @ load mask\n" - - "vbif q10, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q11, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q12, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q13, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q14, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q15, q9, q7 @ bit select, deal " - "with right pad\n" - - "vext.32 q6, q10, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vext.32 q7, q12, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // q0 * w00 - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w02 - - "vext.32 q6, q14, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.f32 {d20-d21}, [%[outptr]] @ load output\n" - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w02 - - "vld1.f32 {d22-d23}, [%[mask_ptr]] @ load mask\n" - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, " - "out0\n" // q6 * w02 - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vmax.f32 q3, q3, q9 @ relu \n" - - "vbif.f32 q3, q10, q11 @ write mask\n" - - "vst1.32 {d6-d7}, [%[outptr]]! 
\n" - "3: \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [outptr] "+r"(doutr0_ptr), - [cnt] "+r"(cnt), - [mask_ptr] "+r"(mask_ptr) - : [remain] "r"(cnt_remain), - [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - - doutr0 = doutr0 + w_out; - } -#endif - } - } -} -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, - * width <= 4 - */ -void conv_depthwise_3x3s1p0_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - //! 3x3s1 convolution, implemented by direct algorithm - //! pad is done implicit - //! for 4x6 convolution window - const int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; - const float zero_ptr[4] = {0.f, 0.f, 0.f, 0.f}; - - float32x4_t vzero = vdupq_n_f32(0.f); - uint32x4_t vmask_rp1 = - vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(6 - w_in)); - uint32x4_t vmask_rp2 = - vcgeq_s32(vld1q_s32(right_pad_idx + 4), vdupq_n_s32(6 - w_in)); - - unsigned int vmask[8]; - vst1q_u32(vmask, vmask_rp1); - vst1q_u32(vmask + 4, vmask_rp2); - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - float* dout_channel = dout_batch + i * size_out_channel; - const float* din_channel = din_batch + i * size_in_channel; - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float32x4_t wbias; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - } else { - wbias = vdupq_n_f32(0.f); - } - - float out_buf1[4]; - float out_buf2[4]; - float trash_buf[4]; - - float* doutr0 = dout_channel; - float* doutr1 = dout_channel + w_out; - - for (int j = 0; j < h_out; j += 2) { - const float* dr0 = din_channel + j * w_in; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - - doutr0 = dout_channel + j * w_out; - doutr1 = doutr0 + w_out; - - if (j + 3 >= h_in) { - switch (j + 3 - h_in) { - case 3: - dr1 = zero_ptr; - case 2: - dr2 = zero_ptr; - case 1: - dr3 = zero_ptr; - doutr1 = trash_buf; - case 0: - dr3 = zero_ptr; - doutr1 = trash_buf; - default: - break; - } - } -#ifdef __aarch64__ - asm volatile( - "prfm pldl1keep, [%[din0]]\n" - "prfm pldl1keep, [%[din1]]\n" - "prfm pldl1keep, [%[din2]]\n" - "prfm pldl1keep, [%[din3]]\n" - - "ld1 {v0.4s, v1.4s}, [%[din0]]\n" - "ld1 {v2.4s, v3.4s}, [%[din1]]\n" - "ld1 {v4.4s, v5.4s}, [%[din2]]\n" - "ld1 {v6.4s, v7.4s}, [%[din3]]\n" - - "bif v0.16b, %[zero].16b, %[mask1].16b\n" // d0_1234 - "bif v1.16b, %[zero].16b, %[mask2].16b\n" // d0_1234 - - "bif v2.16b, %[zero].16b, %[mask1].16b\n" // d1_1234 - "bif v3.16b, %[zero].16b, %[mask2].16b\n" // d1_1234 - - "bif v4.16b, %[zero].16b, %[mask1].16b\n" // d2_1234 - "bif v5.16b, %[zero].16b, %[mask2].16b\n" // d2_1234 - - "bif v6.16b, %[zero].16b, %[mask1].16b\n" // d3_1234 - "bif v7.16b, %[zero].16b, %[mask2].16b\n" // d3_1234 - - "ext v8.16b, v0.16b, v1.16b, #4\n" // d1_2345 - "ext v9.16b, v0.16b, v1.16b, 
#8\n" // d1_3450 - - "and v12.16b, %[vbias].16b, %[vbias].16b \n" // v12 = vbias - "and v13.16b, %[vbias].16b, %[vbias].16b \n" // v13 = vbias - - // r0 - "fmul v10.4s, v0.4s, %[wr0].s[0]\n" // d0_1234 * w0[0] - "fmul v11.4s, v8.4s, %[wr0].s[1]\n" // d1_2345 * w0[1] - "fmla v12.4s, v9.4s, %[wr0].s[2]\n" // d0_3456 * w0[2] - - "ext v8.16b, v2.16b, v3.16b, #4\n" // d1_2345 - "ext v9.16b, v2.16b, v3.16b, #8\n" // d1_3450 - - // r1 - "fmul v14.4s, v2.4s, %[wr0].s[0]\n" // d0_1234 * w0[0] - "fmla v10.4s, v2.4s, %[wr1].s[0]\n" // d0_1234 * w0[0] - - "fmul v15.4s, v8.4s, %[wr0].s[1]\n" // d1_2345 * w0[1] - "fmla v11.4s, v8.4s, %[wr1].s[1]\n" // d1_2345 * w0[1] - - "fmla v13.4s, v9.4s, %[wr0].s[2]\n" // d0_3456 * w0[2] - "fmla v12.4s, v9.4s, %[wr1].s[2]\n" // d0_3456 * w0[2] - - "ext v8.16b, v4.16b, v5.16b, #4\n" // d1_2345 - "ext v9.16b, v4.16b, v5.16b, #8\n" // d1_3450 - - // r2 - "fmla v14.4s, v4.4s, %[wr1].s[0]\n" // d0_1234 * w0[0] - "fmla v10.4s, v4.4s, %[wr2].s[0]\n" // d0_1234 * w0[0] - - "fmla v15.4s, v8.4s, %[wr1].s[1]\n" // d1_2345 * w0[1] - "fmla v11.4s, v8.4s, %[wr2].s[1]\n" // d1_2345 * w0[1] - - "fmla v13.4s, v9.4s, %[wr1].s[2]\n" // d0_3456 * w0[2] - "fmla v12.4s, v9.4s, %[wr2].s[2]\n" // d0_3456 * w0[2] - - "ext v8.16b, v6.16b, v7.16b, #4\n" // d1_2345 - "ext v9.16b, v6.16b, v7.16b, #8\n" // d1_3450 - - // r3 - "fmla v14.4s, v6.4s, %[wr2].s[0]\n" // d0_1234 * w0[0] - - "fmla v15.4s, v8.4s, %[wr2].s[1]\n" // d1_2345 * w0[1] - - "fadd v12.4s, v12.4s, v10.4s\n" - - "fmla v13.4s, v9.4s, %[wr2].s[2]\n" // d0_3456 * w0[2] - - "fadd v12.4s, v12.4s, v11.4s\n" // out1 - "fadd v13.4s, v13.4s, v14.4s\n" // out2 - "fadd v13.4s, v13.4s, v15.4s\n" // out2 - - "prfm pldl1keep, [%[out1]]\n" - "prfm pldl1keep, [%[out2]]\n" - - "st1 {v12.4s}, [%[out1]]\n" - "st1 {v13.4s}, [%[out2]]\n" - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vbias] "w"(wbias), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [zero] "w"(vzero), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15"); -#else - unsigned int* vmask_ptr = vmask; - float bias_val = flag_bias ? bias[i] : 0.f; - asm volatile( - "pld [%[din0]]\n" - "pld [%[din1]]\n" - "pld [%[din2]]\n" - "pld [%[din3]]\n" - - "vld1.32 {d16-d18}, [%[din0]] @ load din r0\n" - "vld1.32 {d20-d22}, [%[din1]] @ load din r1\n" - "vld1.32 {d24-d26}, [%[din2]] @ load din r2\n" - "vld1.32 {d28-d30}, [%[din3]] @ load din r3\n" - - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - "vdup.32 q5, %[bias_val] @ and \n" // q5 - // = - // vbias - - "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" - - "vld1.32 {d27}, [%[vmask]]! 
@ load din r0\n" - - "vbif d16, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d20, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - - "vbif d17, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d21, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - - "vbif d18, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - "vbif d22, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "vbif d24, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d25, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d26, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vbif d28, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d29, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d30, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vmul.f32 q8, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmul.f32 q10, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vmul.f32 q9, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmul.f32 q11, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q8, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q10, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q9, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q11, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vmla.f32 q8, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - "vadd.f32 q4, q4, q10 @ q4 += q10 \n" - - "pld [%[out1]]\n" - "pld [%[out2]]\n" - - "vmla.f32 q9, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - "vadd.f32 q4, q4, q11 @ q4 += q10 \n" - - "vadd.f32 q5, q5, q8 @ q4 += q10 \n" - "vadd.f32 q5, q5, q9 @ q4 += q10 \n" - - "vst1.32 {d8-d9}, [%[out1]] @ store result, add pointer\n" - "vst1.32 {d10-d11}, [%[out2]] @ store result, add pointer\n" - - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [bias_val] "r"(bias_val), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif // __aarch64__ - for (int w = 0; w < w_out; ++w) { - *doutr0++ = out_buf1[w]; - *doutr1++ = out_buf2[w]; - } - } // end of processing heights - } // end of processing channels - } // end of processing batchs -} -/** - * \brief depthwise convolution kernel 3x3, stride 2, width <= 4 - */ - -void conv_depthwise_3x3s2p0_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 
2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - float zeros[8] = {0.0f}; - - uint32x4_t vmask_rp1 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx)); // 0 2 4 6 - uint32x4_t vmask_rp2 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - unsigned int dmask[8]; - vst1q_u32(dmask, vmask_rp1); - vst1q_u32(dmask + 4, vmask_rp2); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - - float bias_c = 0.f; - - if (flag_bias) { - bias_c = bias[i]; - } - float32x4_t vbias = vdupq_n_f32(bias_c); - float out_buf[4]; - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - for (int j = 0; j < h_out; ++j) { - const float* din0_ptr = dr0; - const float* din1_ptr = dr1; - const float* din2_ptr = dr2; - - dr0 = dr2; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - - unsigned int* mask_ptr = dmask; -#ifdef __aarch64__ - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. - "movi v9.4s, #0 \n" - "ld1 {v6.4s, v7.4s}, [%[mask_ptr]], #32 \n" - - "ld2 {v10.4s, v11.4s}, [%[din0_ptr]], #32 \n" // v10={0,2,4,6} - // v11={1,3,5,7} - "ld2 {v12.4s, v13.4s}, [%[din1_ptr]], #32 \n" // v13={0,2,4,6} - // v12={1,3,5,7} - "ld2 {v14.4s, v15.4s}, [%[din2_ptr]], #32 \n" // v14={0,2,4,6} - // v15={1,3,5,7} - "and v4.16b, %[bias].16b, %[bias].16b \n" // v10 = vbias - - "bif v10.16b, v9.16b, v6.16b \n" - "bif v11.16b, v9.16b, v7.16b \n" - "bif v12.16b, v9.16b, v6.16b \n" - "bif v13.16b, v9.16b, v7.16b \n" - "bif v14.16b, v9.16b, v6.16b \n" - "bif v15.16b, v9.16b, v7.16b \n" - - "ext v6.16b, v10.16b, v9.16b, #4 \n" // v6 = - // {2,4,6,8} - "ext v7.16b, v12.16b, v9.16b, #4 \n" // v6 = - // {2,4,6,8} - "ext v8.16b, v14.16b, v9.16b, #4 \n" // v6 = - // {2,4,6,8} - - "fmla v4.4s, v10.4s, %[wr0].s[0] \n" // 0246 * w00 - "fmul v5.4s, v11.4s, %[wr0].s[1] \n" // 1357 * w01 - "fmul v16.4s, v6.4s, %[wr0].s[2] \n" // 2468 * w02 - - "fmla v4.4s, v12.4s, %[wr1].s[0] \n" // v12 * w11 - "fmla v5.4s, v13.4s, %[wr1].s[1] \n" // v13 * w12 - "fmla v16.4s, v7.4s, %[wr1].s[2] \n" // v7 * w10 - - "fmla v4.4s, v14.4s, %[wr2].s[0] \n" // v14 * w20 - "fmla v5.4s, v15.4s, %[wr2].s[1] \n" // v15 * w21 - "fmla v16.4s, v8.4s, %[wr2].s[2] \n" // v8 * w22 - - "fadd v4.4s, v4.4s, v5.4s \n" - "fadd v4.4s, v4.4s, v16.4s \n" - - // "fadd v4.4s, v4.4s, %[bias].4s \n" - "st1 {v4.4s}, [%[out]] \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [mask_ptr] "+r"(mask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "w"(vbias), - [out] "r"(out_buf) - : "cc", - "memory", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16"); - -#else - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. - "vmov.u32 q9, #0 \n" - "vld1.f32 {d12-d15}, [%[mask_ptr]] @ load mask\n" - "vdup.32 q3, %[bias] @ and \n" // q3 = - // vbias - - "vld2.32 {d20-d23}, [%[din0_ptr]]! 
@ load din r0\n" // q10={0,2,4,6} q11={1,3,5,7} - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // q13={0,2,4,6} q12={1,3,5,7} - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" // q14={0,2,4,6} q15={1,3,5,7} - - "vbif q10, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q11, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q12, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q13, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q14, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q15, q9, q7 @ bit select, deal " - "with right pad\n" - - "vext.32 q6, q10, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,0} - "vext.32 q7, q12, q9, #1 @ shift left 1 \n" // q7 = {2,4,6,0} - "vext.32 q8, q14, q9, #1 @ shift left 1 \n" // q8 = {2,4,6,0} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // {0,2,4,6} - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // {1,3,5,7} - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // {2,4,6,0} - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q12 * w11 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q13 * w12 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q7 * w10 - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q14 * w20 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q15 * w21 - "vmla.f32 q3, q8, %f[wr2][0] @ mul weight 2, " - "out0\n" // q8 * w22 - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vst1.32 {d6-d7}, [%[out]] \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c), - [out] "r"(out_buf), - [mask_ptr] "r"(dmask) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif // __aarch64__ - for (int w = 0; w < w_out; ++w) { - *dout_channel++ = out_buf[w]; - } - } - } - } -} -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, - * width <= 4 - */ -void conv_depthwise_3x3s1p0_bias_s_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - //! 3x3s1 convolution, implemented by direct algorithm - //! pad is done implicit - //! 
for 4x6 convolution window - const int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; - const float zero_ptr[4] = {0.f, 0.f, 0.f, 0.f}; - - float32x4_t vzero = vdupq_n_f32(0.f); - uint32x4_t vmask_rp1 = - vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(6 - w_in)); - uint32x4_t vmask_rp2 = - vcgeq_s32(vld1q_s32(right_pad_idx + 4), vdupq_n_s32(6 - w_in)); - - unsigned int vmask[8]; - vst1q_u32(vmask, vmask_rp1); - vst1q_u32(vmask + 4, vmask_rp2); - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - float* dout_channel = dout_batch + i * size_out_channel; - const float* din_channel = din_batch + i * size_in_channel; - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float32x4_t wbias; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - } else { - wbias = vdupq_n_f32(0.f); - } - - float out_buf1[4]; - float out_buf2[4]; - float trash_buf[4]; - - float* doutr0 = dout_channel; - float* doutr1 = dout_channel + w_out; - - for (int j = 0; j < h_out; j += 2) { - const float* dr0 = din_channel + j * w_in; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - - doutr0 = dout_channel + j * w_out; - doutr1 = doutr0 + w_out; - - if (j + 3 >= h_in) { - switch (j + 3 - h_in) { - case 3: - dr1 = zero_ptr; - case 2: - dr2 = zero_ptr; - case 1: - dr3 = zero_ptr; - doutr1 = trash_buf; - case 0: - dr3 = zero_ptr; - doutr1 = trash_buf; - default: - break; - } - } -#ifdef __aarch64__ - asm volatile( - "prfm pldl1keep, [%[din0]]\n" - "prfm pldl1keep, [%[din1]]\n" - "prfm pldl1keep, [%[din2]]\n" - "prfm pldl1keep, [%[din3]]\n" - - "ld1 {v0.4s, v1.4s}, [%[din0]]\n" - "ld1 {v2.4s, v3.4s}, [%[din1]]\n" - "ld1 {v4.4s, v5.4s}, [%[din2]]\n" - "ld1 {v6.4s, v7.4s}, [%[din3]]\n" - - "bif v0.16b, %[zero].16b, %[mask1].16b\n" // d0_1234 - "bif v1.16b, %[zero].16b, %[mask2].16b\n" // d0_1234 - - "bif v2.16b, %[zero].16b, %[mask1].16b\n" // d1_1234 - "bif v3.16b, %[zero].16b, %[mask2].16b\n" // d1_1234 - - "bif v4.16b, %[zero].16b, %[mask1].16b\n" // d2_1234 - "bif v5.16b, %[zero].16b, %[mask2].16b\n" // d2_1234 - - "bif v6.16b, %[zero].16b, %[mask1].16b\n" // d3_1234 - "bif v7.16b, %[zero].16b, %[mask2].16b\n" // d3_1234 - - "ext v8.16b, v0.16b, v1.16b, #4\n" // d1_2345 - "ext v9.16b, v0.16b, v1.16b, #8\n" // d1_3450 - - "and v12.16b, %[vbias].16b, %[vbias].16b \n" // v12 = vbias - "and v13.16b, %[vbias].16b, %[vbias].16b \n" // v13 = vbias - - // r0 - "fmul v10.4s, v0.4s, %[wr0].s[0]\n" // d0_1234 * w0[0] - "fmul v11.4s, v8.4s, %[wr0].s[1]\n" // d1_2345 * w0[1] - "fmla v12.4s, v9.4s, %[wr0].s[2]\n" // d0_3456 * w0[2] - - "ext v8.16b, v2.16b, v3.16b, #4\n" // d1_2345 - "ext v9.16b, v2.16b, v3.16b, #8\n" // d1_3450 - - // r1 - "fmul v14.4s, v2.4s, %[wr0].s[0]\n" // d0_1234 * w0[0] - "fmla v10.4s, v2.4s, %[wr1].s[0]\n" // d0_1234 * w0[0] - - "fmul v15.4s, v8.4s, %[wr0].s[1]\n" // d1_2345 * w0[1] - "fmla v11.4s, v8.4s, %[wr1].s[1]\n" // d1_2345 * w0[1] - - "fmla v13.4s, v9.4s, %[wr0].s[2]\n" // d0_3456 * w0[2] - "fmla v12.4s, v9.4s, %[wr1].s[2]\n" // d0_3456 * w0[2] - - "ext v8.16b, v4.16b, v5.16b, #4\n" // d1_2345 - "ext v9.16b, v4.16b, v5.16b, #8\n" // d1_3450 - - // r2 - "fmla v14.4s, v4.4s, %[wr1].s[0]\n" // 
d0_1234 * w0[0] - "fmla v10.4s, v4.4s, %[wr2].s[0]\n" // d0_1234 * w0[0] - - "fmla v15.4s, v8.4s, %[wr1].s[1]\n" // d1_2345 * w0[1] - "fmla v11.4s, v8.4s, %[wr2].s[1]\n" // d1_2345 * w0[1] - - "fmla v13.4s, v9.4s, %[wr1].s[2]\n" // d0_3456 * w0[2] - "fmla v12.4s, v9.4s, %[wr2].s[2]\n" // d0_3456 * w0[2] - - "ext v8.16b, v6.16b, v7.16b, #4\n" // d1_2345 - "ext v9.16b, v6.16b, v7.16b, #8\n" // d1_3450 - - // r3 - "fmla v14.4s, v6.4s, %[wr2].s[0]\n" // d0_1234 * w0[0] - - "fmla v15.4s, v8.4s, %[wr2].s[1]\n" // d1_2345 * w0[1] - - "fadd v12.4s, v12.4s, v10.4s\n" - - "fmla v13.4s, v9.4s, %[wr2].s[2]\n" // d0_3456 * w0[2] - - "fadd v12.4s, v12.4s, v11.4s\n" // out1 - "fadd v13.4s, v13.4s, v14.4s\n" // out2 - "fadd v13.4s, v13.4s, v15.4s\n" // out2 - - "prfm pldl1keep, [%[out1]]\n" - "prfm pldl1keep, [%[out2]]\n" - "fmax v12.4s, v12.4s, %[zero].4s \n" - "fmax v13.4s, v13.4s, %[zero].4s \n" - - "st1 {v12.4s}, [%[out1]]\n" - "st1 {v13.4s}, [%[out2]]\n" - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vbias] "w"(wbias), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [zero] "w"(vzero), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15"); -#else - unsigned int* vmask_ptr = vmask; - float bias_val = flag_bias ? bias[i] : 0.f; - asm volatile( - "pld [%[din0]]\n" - "pld [%[din1]]\n" - "pld [%[din2]]\n" - "pld [%[din3]]\n" - - "vld1.32 {d16-d18}, [%[din0]] @ load din r0\n" - "vld1.32 {d20-d22}, [%[din1]] @ load din r1\n" - "vld1.32 {d24-d26}, [%[din2]] @ load din r2\n" - "vld1.32 {d28-d30}, [%[din3]] @ load din r3\n" - - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - "vdup.32 q5, %[bias_val] @ and \n" // q5 - // = - // vbias - - "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" - - "vld1.32 {d27}, [%[vmask]]! 
@ load din r0\n" - - "vbif d16, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d20, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - - "vbif d17, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d21, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - - "vbif d18, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - "vbif d22, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "vbif d24, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d25, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d26, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vbif d28, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d29, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d30, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vmul.f32 q8, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmul.f32 q10, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vmul.f32 q9, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmul.f32 q11, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q8, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q10, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q9, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q11, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vmla.f32 q8, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - "vadd.f32 q4, q4, q10 @ q4 += q10 \n" - - "pld [%[out1]]\n" - "pld [%[out2]]\n" - - "vmla.f32 q9, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - "vadd.f32 q4, q4, q11 @ q4 += q10 \n" - - "vadd.f32 q5, q5, q8 @ q4 += q10 \n" - "vadd.f32 q5, q5, q9 @ q4 += q10 \n" - "vmax.f32 q4, q4, %q[vzero] @ relu \n" - "vmax.f32 q5, q5, %q[vzero] @ relu \n" - - "vst1.32 {d8-d9}, [%[out1]] @ store result, add pointer\n" - "vst1.32 {d10-d11}, [%[out2]] @ store result, add pointer\n" - - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [bias_val] "r"(bias_val), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif // __aarch64__ - for (int w = 0; w < w_out; ++w) { - *doutr0++ = out_buf1[w]; - *doutr1++ = out_buf2[w]; - } - // doutr0 = doutr1; - // doutr1 += w_out; - } // end of processing heights - } // end of processing channels - } // end of processing batchs -} - -/** - * \brief depthwise convolution kernel 3x3, stride 2, width <= 7 - */ -void conv_depthwise_3x3s2p0_bias_s_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int 
ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - float zeros[8] = {0.0f}; - - uint32x4_t vmask_rp1 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx)); // 0 2 4 6 - uint32x4_t vmask_rp2 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - unsigned int dmask[8]; - vst1q_u32(dmask, vmask_rp1); - vst1q_u32(dmask + 4, vmask_rp2); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - - float bias_c = 0.f; - - if (flag_bias) { - bias_c = bias[i]; - } - float32x4_t vbias = vdupq_n_f32(bias_c); - float out_buf[4]; - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - for (int j = 0; j < h_out; ++j) { - const float* din0_ptr = dr0; - const float* din1_ptr = dr1; - const float* din2_ptr = dr2; - - dr0 = dr2; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - - unsigned int* mask_ptr = dmask; -#ifdef __aarch64__ - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. - "movi v9.4s, #0 \n" - "ld1 {v6.4s, v7.4s}, [%[mask_ptr]] \n" - - "ld2 {v10.4s, v11.4s}, [%[din0_ptr]], #32 \n" // v10={0,2,4,6} - // v11={1,3,5,7} - "ld2 {v12.4s, v13.4s}, [%[din1_ptr]], #32 \n" // v13={0,2,4,6} - // v12={1,3,5,7} - "ld2 {v14.4s, v15.4s}, [%[din2_ptr]], #32 \n" // v14={0,2,4,6} - // v15={1,3,5,7} - "and v4.16b, %[bias].16b, %[bias].16b \n" // v10 = vbias - - "bif v10.16b, v9.16b, v6.16b \n" - "bif v11.16b, v9.16b, v7.16b \n" - "bif v12.16b, v9.16b, v6.16b \n" - "bif v13.16b, v9.16b, v7.16b \n" - "bif v14.16b, v9.16b, v6.16b \n" - "bif v15.16b, v9.16b, v7.16b \n" - - "ext v6.16b, v10.16b, v9.16b, #4 \n" // v6 = - // {2,4,6,8} - "ext v7.16b, v12.16b, v9.16b, #4 \n" // v6 = - // {2,4,6,8} - "ext v8.16b, v14.16b, v9.16b, #4 \n" // v6 = - // {2,4,6,8} - - "fmla v4.4s, v10.4s, %[wr0].s[0] \n" // 0246 * w00 - "fmul v5.4s, v11.4s, %[wr0].s[1] \n" // 1357 * w01 - "fmul v16.4s, v6.4s, %[wr0].s[2] \n" // 2468 * w02 - - "fmla v4.4s, v12.4s, %[wr1].s[0] \n" // v12 * w11 - "fmla v5.4s, v13.4s, %[wr1].s[1] \n" // v13 * w12 - "fmla v16.4s, v7.4s, %[wr1].s[2] \n" // v7 * w10 - - "fmla v4.4s, v14.4s, %[wr2].s[0] \n" // v14 * w20 - "fmla v5.4s, v15.4s, %[wr2].s[1] \n" // v15 * w21 - "fmla v16.4s, v8.4s, %[wr2].s[2] \n" // v8 * w22 - - "fadd v4.4s, v4.4s, v5.4s \n" - "fadd v4.4s, v4.4s, v16.4s \n" - "fmax v4.4s, v4.4s, v9.4s \n" - - // "fadd v4.4s, v4.4s, %[bias].4s \n" - "st1 {v4.4s}, [%[out]] \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "w"(vbias), - [out] "r"(out_buf), - [mask_ptr] "r"(mask_ptr) - : "cc", - "memory", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16"); - -#else - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. 
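-            // A hedged intrinsics sketch of what the v7 block below computes
-            // (names reuse the C++ setup above; illustrative, not a drop-in):
-            //   float32x4x2_t r0 = vld2q_f32(din0_ptr);  // even lanes {0,2,4,6}, odd lanes {1,3,5,7}
-            //   r0.val[0] = vbslq_f32(vmask_rp1, r0.val[0], vdupq_n_f32(0.f));  // zero lanes past w_in
-            //   r0.val[1] = vbslq_f32(vmask_rp2, r0.val[1], vdupq_n_f32(0.f));
-            //   float32x4_t sh0 = vextq_f32(r0.val[0], vdupq_n_f32(0.f), 1);    // {2,4,6,0}
-            //   float32x4_t acc = vdupq_n_f32(bias_c);
-            //   acc = vmlaq_n_f32(acc, r0.val[0], weight_ptr[0]);  // w00 * {0,2,4,6}
-            //   acc = vmlaq_n_f32(acc, r0.val[1], weight_ptr[1]);  // w01 * {1,3,5,7}
-            //   acc = vmlaq_n_f32(acc, sh0, weight_ptr[2]);        // w02 * {2,4,6,0}
-            //   // rows 1 and 2 accumulate likewise with weight_ptr[3..8],
-            //   // then acc = vmaxq_f32(acc, vdupq_n_f32(0.f));    // fused ReLU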
- "vmov.u32 q9, #0 \n" - "vld1.f32 {d12-d15}, [%[mask_ptr]] @ load mask\n" - "vdup.32 q3, %[bias] @ and \n" // q3 = - // vbias - - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" // q10={0,2,4,6} q11={1,3,5,7} - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // q13={0,2,4,6} q12={1,3,5,7} - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" // q14={0,2,4,6} q15={1,3,5,7} - - "vbif q10, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q11, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q12, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q13, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q14, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q15, q9, q7 @ bit select, deal " - "with right pad\n" - - "vext.32 q6, q10, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,0} - "vext.32 q7, q12, q9, #1 @ shift left 1 \n" // q7 = {2,4,6,0} - "vext.32 q8, q14, q9, #1 @ shift left 1 \n" // q8 = {2,4,6,0} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // {0,2,4,6} - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // {1,3,5,7} - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // {2,4,6,0} - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q12 * w11 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q13 * w12 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q7 * w10 - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q14 * w20 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q15 * w21 - "vmla.f32 q3, q8, %f[wr2][0] @ mul weight 2, " - "out0\n" // q8 * w22 - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vmax.f32 q3, q3, q9 @ relu \n" - - "vst1.32 {d6-d7}, [%[out]] \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c), - [out] "r"(out_buf), - [mask_ptr] "r"(mask_ptr) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif // __aarch64__ - for (int w = 0; w < w_out; ++w) { - *dout_channel++ = out_buf[w]; - } - } - } - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_depthwise_3x3p1.cc b/lite/backends/arm/math/conv_depthwise_3x3p1.cc deleted file mode 100644 index b5de99d7f5..0000000000 --- a/lite/backends/arm/math/conv_depthwise_3x3p1.cc +++ /dev/null @@ -1,4850 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/backends/arm/math/conv_depthwise.h" -#include - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void conv_depthwise_3x3s1p1_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! for input width <= 4 -void conv_depthwise_3x3s1p1_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s2p1_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! for input width <= 4 -void conv_depthwise_3x3s2p1_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s1p1_bias_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! for input width <= 4 -void conv_depthwise_3x3s1p1_bias_s_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s2p1_bias_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! for input width <= 4 -void conv_depthwise_3x3s2p1_bias_s_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3p1(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - int stride, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - if (stride == 1) { - if (flag_relu) { - if (w_in > 4) { - conv_depthwise_3x3s1p1_bias_relu(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s1p1_bias_s_relu(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } else { - if (w_in > 4) { - conv_depthwise_3x3s1p1_bias(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s1p1_bias_s(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } - } else { //! 
-    if (flag_relu) {
-      if (w_in > 7) {
-        conv_depthwise_3x3s2p1_bias_relu(dout, din, weights, bias, flag_bias, num, ch_in, h_in, w_in, h_out, w_out, ctx);
-      } else {
-        conv_depthwise_3x3s2p1_bias_s_relu(dout, din, weights, bias, flag_bias, num, ch_in, h_in, w_in, h_out, w_out, ctx);
-      }
-    } else {
-      if (w_in > 7) {
-        conv_depthwise_3x3s2p1_bias(dout, din, weights, bias, flag_bias, num, ch_in, h_in, w_in, h_out, w_out, ctx);
-      } else {
-        conv_depthwise_3x3s2p1_bias_s(dout, din, weights, bias, flag_bias, num, ch_in, h_in, w_in, h_out, w_out, ctx);
-      }
-    }
-  }
-}
-/**
- * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias,
- * width > 4
- */
-// 4line
-void conv_depthwise_3x3s1p1_bias(float* dout, const float* din, const float* weights, const float* bias, bool flag_bias, const int num, const int ch_in, const int h_in, const int w_in, const int h_out, const int w_out, ARMContext* ctx) {
-  //! pad is done implicit
-  const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
-  //! for 4x6 convolution window
-  const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0};
-
-  float* zero_ptr = ctx->workspace_data<float>();
-  memset(zero_ptr, 0, w_in * sizeof(float));
-  float* write_ptr = zero_ptr + w_in;
-
-  // printf("conv3x3_dw start \n");
-
-  int size_in_channel = w_in * h_in;
-  int size_out_channel = w_out * h_out;
-  int w_stride = 9;
-
-  int tile_w = (w_in + 3) >> 2;
-  int cnt_col = tile_w - 2;
-
-  unsigned int size_pad_right = (unsigned int)(1 + (tile_w << 2) - w_in);
-
-  uint32x4_t vmask_rp1 = vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right));
-  uint32x4_t vmask_rp2 = vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right));
-  uint32x4_t vmask_result = vcgtq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right));
-
-  unsigned int vmask[8];
-  vst1q_u32(vmask, vmask_rp1);
-  vst1q_u32(vmask + 4, vmask_rp2);
-
-  unsigned int rmask[4];
-  vst1q_u32(rmask, vmask_result);
-
-  float32x4_t vzero = vdupq_n_f32(0.f);
-
-  for (int n = 0; n < num; ++n) {
-    const float* din_batch = din + n * ch_in * size_in_channel;
-    float* dout_batch = dout + n * ch_in * size_out_channel;
-#pragma omp parallel for
-#ifdef __aarch64__
-    for (int c = 0; c < ch_in; c++) {
-      float* dout_ptr = dout_batch + c * size_out_channel;
-      const float* din_ch_ptr = din_batch + c * size_in_channel;
-      float bias_val = flag_bias ? bias[c] : 0.f;
-      float vbias[4] = {bias_val, bias_val, bias_val, bias_val};
-      const float* wei_ptr = weights + c * w_stride;
-      float32x4_t wr0 = vld1q_f32(wei_ptr);
-      float32x4_t wr1 = vld1q_f32(wei_ptr + 3);
-      float32x4_t wr2 = vld1q_f32(wei_ptr + 6);
-      float* doutr0 = dout_ptr;
-      float* doutr1 = doutr0 + w_out;
-      float* doutr2 = doutr1 + w_out;
-      float* doutr3 = doutr2 + w_out;
-      const float* dr0 = din_ch_ptr;
-      const float* dr1 = dr0 + w_in;
-      const float* dr2 = dr1 + w_in;
-      const float* dr3 = dr2 + w_in;
-      const float* dr4 = dr3 + w_in;
-      const float* dr5 = dr4 + w_in;
-      const float* din_ptr0 = dr0;
-      const float* din_ptr1 = dr1;
-      const float* din_ptr2 = dr2;
-      const float* din_ptr3 = dr3;
-      const float* din_ptr4 = dr4;
-      const float* din_ptr5 = dr5;
-      for (int i = 0; i < h_in; i += 4) {
-        //! 
process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - din_ptr3 = dr3; - din_ptr4 = dr4; - din_ptr5 = dr5; - - doutr0 = dout_ptr; - doutr1 = doutr0 + w_out; - doutr2 = doutr1 + w_out; - doutr3 = doutr2 + w_out; - if (i == 0) { - din_ptr0 = zero_ptr; - din_ptr1 = dr0; - din_ptr2 = dr1; - din_ptr3 = dr2; - din_ptr4 = dr3; - din_ptr5 = dr4; - dr0 = dr3; - dr1 = dr4; - dr2 = dr5; - } else { - dr0 = dr4; - dr1 = dr5; - dr2 = dr1 + w_in; - } - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - dr5 = dr4 + w_in; - - //! process bottom pad - if (i + 5 > h_in) { - switch (i + 5 - h_in) { - case 5: - din_ptr1 = zero_ptr; - case 4: - din_ptr2 = zero_ptr; - case 3: - din_ptr3 = zero_ptr; - case 2: - din_ptr4 = zero_ptr; - case 1: - din_ptr5 = zero_ptr; - default: - break; - } - } - //! process bottom remain - if (i + 4 > h_out) { - switch (i + 4 - h_out) { - case 3: - doutr1 = write_ptr; - case 2: - doutr2 = write_ptr; - case 1: - doutr3 = write_ptr; - default: - break; - } - } - - int cnt = cnt_col; - asm volatile( - "PRFM PLDL1KEEP, [%[din_ptr0]] \n" - "PRFM PLDL1KEEP, [%[din_ptr1]] \n" - "PRFM PLDL1KEEP, [%[din_ptr2]] \n" - "PRFM PLDL1KEEP, [%[din_ptr3]] \n" - "PRFM PLDL1KEEP, [%[din_ptr4]] \n" - "PRFM PLDL1KEEP, [%[din_ptr5]] \n" - "movi v21.4s, #0x0\n" /* out0 = 0 */ - - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "ext v16.16b, %[vzero].16b, v0.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ - - // left - // r0 - "fmla v12.4s, v0.4s, %[w0].s[1]\n" /* outr00 += din0_0123 * - w0[1]*/ - - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "sub %[din_ptr0], %[din_ptr0], #4 \n" /* din_ptr0-- */ - "sub %[din_ptr1], %[din_ptr1], #4 \n" /* din_ptr0-- */ - - "fmla v12.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din0_0012 * - w0[0]*/ - - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - "sub %[din_ptr2], %[din_ptr2], #4 \n" /* din_ptr0-- */ - "sub %[din_ptr3], %[din_ptr3], #4 \n" /* din_ptr0-- */ - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_1234 * - w0[2]*/ - - "ext v16.16b, %[vzero].16b, v2.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234 */ - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[1]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v2.4s, %[w1].s[1]\n" /* outr00 += din1_0123 * - w1[1]*/ - "sub %[din_ptr4], %[din_ptr4], #4 \n" /* din_ptr0-- */ - "sub %[din_ptr5], %[din_ptr5], #4 \n" /* din_ptr0-- */ - - "fmla v13.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla 
v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v4.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[1]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v13.4s , v4.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - "fmla v12.4s , v4.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w2[1]*/ - - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * - w1[1]*/ - - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v6.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[1]\n" /*outr00 += din2_0123 * - w0[1]*/ - "fmla v14.4s , v6.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - "fmla v13.4s , v6.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w2[1]*/ - - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * - w1[1]*/ - - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v8.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234 */ - - // r4 - "fmla v15.4s , v8.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - "fmla v14.4s , v8.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w2[1]*/ - - "st1 {v12.4s}, [%[doutr0]], #16 \n" /* vst1q_f32() */ - "st1 {v13.4s}, [%[doutr1]], #16 \n" /* vst1q_f32() */ - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * - w1[1]*/ - - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v10.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234 */ - - // r5 - "fmla v15.4s , v10.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" /* vst1q_f32() */ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v14.4s}, [%[bias_val]] \n" 
/*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - - "st1 {v15.4s}, [%[doutr3]], #16 \n" /* vst1q_f32() */ - "cmp %[cnt], #1 \n" - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "blt 3f \n" - // mid - "1: \n" - // r0 - "fmla v12.4s , v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v12.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "st1 {v12.4s}, [%[doutr0]], #16 \n" - - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += 
din0_0123 * - w0[0]*/ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "st1 {v13.4s}, [%[doutr1]], #16 \n" - - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "st1 {v14.4s}, [%[doutr2]], #16 \n" - - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - - "subs %[cnt], %[cnt], #1 \n" - - "st1 {v15.4s}, [%[doutr3]], #16 \n" - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "bne 1b \n" - - // right - "3: \n" - "ld1 {v18.4s, v19.4s}, [%[vmask]] \n" - "ld1 {v22.4s}, [%[doutr0]] \n" - "ld1 {v23.4s}, [%[doutr1]] \n" - "ld1 {v24.4s}, [%[doutr2]] \n" - "ld1 {v25.4s}, [%[doutr3]] \n" - - "bif v0.16b, %[vzero].16b, v18.16b \n" - "bif v1.16b, %[vzero].16b, v19.16b \n" - "bif v2.16b, %[vzero].16b, v18.16b \n" - "bif v3.16b, %[vzero].16b, v19.16b \n" - - "bif v4.16b, %[vzero].16b, v18.16b \n" - "bif v5.16b, %[vzero].16b, v19.16b \n" - "bif v6.16b, %[vzero].16b, v18.16b \n" - "bif v7.16b, %[vzero].16b, v19.16b \n" - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - - // r0 - "fmla v12.4s, v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v8.16b, %[vzero].16b, v18.16b \n" - "bif v9.16b, %[vzero].16b, v19.16b \n" - "bif v10.16b, %[vzero].16b, v18.16b \n" - "bif v11.16b, %[vzero].16b, v19.16b \n" - - "fmla v12.4s, v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v18.4s}, [%[rmask]] \n" - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, 
%[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v12.16b, v22.16b, v18.16b \n" - - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v12.4s}, [%[doutr0]], #16 \n" - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v13.16b, v23.16b, v18.16b \n" - - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v13.4s}, [%[doutr1]], #16 \n" - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v14.16b, v24.16b, v18.16b \n" - - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "bif v15.16b, v25.16b, v18.16b \n" - - "st1 {v15.4s}, [%[doutr3]], #16 \n" - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - dout_ptr = dout_ptr + 4 * w_out; - } - } -#else - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float bias_val = flag_bias ? 
bias[i] : 0.f; - - float* dout_channel = dout_batch + i * size_out_channel; - - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - - const float* din0_ptr = nullptr; - const float* din1_ptr = nullptr; - const float* din2_ptr = nullptr; - const float* din3_ptr = nullptr; - - float* doutr0 = nullptr; - float* doutr1 = nullptr; - - float* ptr_zero = const_cast(zero); - - for (int i = 0; i < h_in; i += 2) { - //! process top pad pad_h = 1 - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - din3_ptr = dr3; - - doutr0 = dout_channel; - doutr1 = dout_channel + w_out; - // unsigned int* rst_mask = rmask; - - if (i == 0) { - din0_ptr = zero_ptr; - din1_ptr = dr0; - din2_ptr = dr1; - din3_ptr = dr2; - dr0 = dr1; - dr1 = dr2; - dr2 = dr3; - dr3 = dr2 + w_in; - } else { - dr0 = dr2; - dr1 = dr3; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - } - //! process bottom pad - if (i + 3 > h_in) { - switch (i + 3 - h_in) { - case 3: - din1_ptr = zero_ptr; - case 2: - din2_ptr = zero_ptr; - case 1: - din3_ptr = zero_ptr; - default: - break; - } - } - //! process bottom remain - if (i + 2 > h_out) { - doutr1 = write_ptr; - } - int cnt = cnt_col; - unsigned int* rmask_ptr = rmask; - unsigned int* vmask_ptr = vmask; - asm volatile( - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vld1.32 {d16-d18}, [%[din0_ptr]]! @ load din r0\n" - "vld1.32 {d20-d22}, [%[din1_ptr]]! @ load din r1\n" - "vld1.32 {d24-d26}, [%[din2_ptr]]! @ load din r2\n" - "vld1.32 {d28-d30}, [%[din3_ptr]]! @ load din r3\n" - - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - "vdup.32 q5, %[bias_val] @ and \n" // q5 - // = - // vbias - - "vext.32 q6, %q[vzero], q8, #3 @ 0012\n" - "vext.32 q7, q8, q9, #1 @ 1234\n" - - // left - // r0 - "vmla.f32 q4, q8, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "sub %[din0_ptr], #12 @ 1pad + 2 float data overlap\n" - "sub %[din1_ptr], #12 @ 1pad + 2 float data overlap\n" - "sub %[din2_ptr], #12 @ 1pad + 2 float data overlap\n" - "sub %[din3_ptr], #12 @ 1pad + 2 float data overlap\n" - - "vmla.f32 q4, q6, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, %q[vzero], q10, #3 @ 0012\n" - "vext.32 q7, q10, q11, #1 @ 1234\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q10, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" - "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q6, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[2]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, %q[vzero], q12, #3 @ 0012\n" - "vext.32 q7, q12, q13, #1 @ 1234\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q12, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d24-d25}, [%[din2_ptr]]! 
@ load din r0\n" - - "vmla.f32 q5, q6, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" - - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[2]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, %q[vzero], q14, #3 @ 0012\n" - "vext.32 q7, q14, q15, #1 @ 1234\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmla.f32 q5, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" - - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - "cmp %[cnt], #1 @ check whether has " - "mid cols\n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add " - "pointer\n" - - "vdup.32 q5, %[bias_val] @ and \n" // q5 - // = - // vbias - "blt 3f @ jump to main loop start " - "point\n" - - // mid - "1: @ right pad entry\n" - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d24-d25}, [%[din2_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add " - "pointer\n" - - "subs %[cnt], #1 @ loop count minus 1\n" - - "vdup.32 q5, %[bias_val] @ and \n" // q4 - // = - // vbias - - "bne 1b @ jump to main loop start " - "point\n" - - // right - "3: @ right pad entry\n" - "vld1.32 {d19}, [%[vmask]]! 
@ load din r0\n" - "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" - - "vld1.32 {d27}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d31}, [%[vmask]]! @ load din r0\n" - - "vbif d16, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d17, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d18, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vbif d20, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d21, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d22, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "vbif d24, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d25, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d26, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vbif d28, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d29, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d30, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d19}, [%[rmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[rmask]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[dout_ptr1]] @ load din r0\n" - "vld1.32 {d20-d21}, [%[dout_ptr2]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vbif d8, d16, d19 @ bit select, deal with right pad\n" - "vbif d9, d17, d23 @ bit select, deal with right pad\n" - - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - - "vbif d10, d20, d19 @ bit select, deal with right " - "pad\n" - "vbif d11, d21, d23 @ bit select, deal with right " - "pad\n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add " - "pointer\n" - - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [din3_ptr] "+r"(din3_ptr), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - dout_channel += 2 * w_out; - } //! 
end of processing mid rows
-    }
-#endif
-  }
-}
-
-/**
- * \brief depthwise convolution kernel 3x3, stride 2
- */
-// w_in > 7
-void conv_depthwise_3x3s2p1_bias(float* dout, const float* din, const float* weights, const float* bias, bool flag_bias, const int num, const int ch_in, const int h_in, const int w_in, const int h_out, const int w_out, ARMContext* ctx) {
-  int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
-  int out_pad_idx[4] = {0, 1, 2, 3};
-  int size_pad_bottom = h_out * 2 - h_in;
-
-  int cnt_col = (w_out >> 2) - 2;
-  int size_right_remain = w_in - (7 + cnt_col * 8);
-  if (size_right_remain >= 9) {
-    cnt_col++;
-    size_right_remain -= 8;
-  }
-  int cnt_remain = (size_right_remain == 8) ? 4 : (w_out % 4);
-
-  int size_right_pad = w_out * 2 - w_in;
-
-  uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain), vld1q_s32(right_pad_idx));      // 0 2 4 6
-  uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain), vld1q_s32(right_pad_idx + 4));  // 1 3 5 7
-  uint32x4_t wmask = vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx));                   // 0 1 2 3
-  int size_in_channel = w_in * h_in;
-  int size_out_channel = w_out * h_out;
-
-  float* zero_ptr = ctx->workspace_data<float>();
-  memset(zero_ptr, 0, w_in * sizeof(float));
-  float* write_ptr = zero_ptr + w_in;
-
-  unsigned int dmask[12];
-
-  vst1q_u32(dmask, vmask_rp1);
-  vst1q_u32(dmask + 4, vmask_rp2);
-  vst1q_u32(dmask + 8, wmask);
-
-  for (int n = 0; n < num; ++n) {
-    const float* din_batch = din + n * ch_in * size_in_channel;
-    float* dout_batch = dout + n * ch_in * size_out_channel;
-#pragma omp parallel for
-    for (int i = 0; i < ch_in; ++i) {
-      const float* din_channel = din_batch + i * size_in_channel;
-      float* dout_channel = dout_batch + i * size_out_channel;
-      const float* weight_ptr = weights + i * 9;
-      float32x4_t wr0 = vld1q_f32(weight_ptr);
-      float32x4_t wr1 = vld1q_f32(weight_ptr + 3);
-      float32x4_t wr2 = vld1q_f32(weight_ptr + 6);
-      float32x4_t vzero = vdupq_n_f32(0.f);
-      float32x4_t wbias;
-      float bias_c = 0.f;
-      if (flag_bias) {
-        wbias = vdupq_n_f32(bias[i]);
-        bias_c = bias[i];
-      } else {
-        wbias = vdupq_n_f32(0.f);
-      }
-      const float* dr0 = din_channel;
-      const float* dr1 = dr0 + w_in;
-      const float* dr2 = dr1 + w_in;
-      const float* dr3 = dr2 + w_in;
-      const float* dr4 = dr3 + w_in;
-      const float* din0_ptr = dr0;
-      const float* din1_ptr = dr1;
-      const float* din2_ptr = dr2;
-      const float* din3_ptr = dr3;
-      const float* din4_ptr = dr4;
-      float* doutr0 = dout_channel;
-      float* doutr0_ptr = nullptr;
-      float* doutr1_ptr = nullptr;
-#ifdef __aarch64__
-      for (int i = 0; i < h_in; i += 4) {
-        din0_ptr = dr0;
-        din1_ptr = dr1;
-        din2_ptr = dr2;
-        din3_ptr = dr3;
-        din4_ptr = dr4;
-        doutr0_ptr = doutr0;
-        doutr1_ptr = doutr0 + w_out;
-        if (i == 0) {
-          din0_ptr = zero_ptr;
-          din1_ptr = dr0;
-          din2_ptr = dr1;
-          din3_ptr = dr2;
-          din4_ptr = dr3;
-          dr0 = dr3;
-          dr1 = dr4;
-        } else {
-          dr0 = dr4;
-          dr1 = dr0 + w_in;
-        }
-        dr2 = dr1 + w_in;
-        dr3 = dr2 + w_in;
-        dr4 = dr3 + w_in;
-        //! process bottom pad
-        if (i + 4 > h_in) {
-          switch (i + 4 - h_in) {
-            case 4: din1_ptr = zero_ptr;
-            case 3: din2_ptr = zero_ptr;
-            case 2: din3_ptr = zero_ptr;
-            case 1: din4_ptr = zero_ptr;
-            default: break;
-          }
-        }
-        //! process output pad
-        if (i / 2 + 2 > h_out) {
-          doutr1_ptr = write_ptr;
-        }
-        int cnt = cnt_col;
-        asm volatile(
-            // top
-            // Load up 12 elements (3 vectors) from each of 8 sources.
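-            // Worked example of the column bookkeeping above, on a
-            // hypothetical w_in = 18 (k = 3, s = 2, p = 1 gives w_out = 9):
-            // cnt_col starts at (9 >> 2) - 2 = 0, size_right_remain =
-            // 18 - 7 = 11 >= 9, so cnt_col becomes 1 and size_right_remain 3;
-            // cnt_remain = 9 % 4 = 1. The 9 outputs split as 4 (left block,
-            // implicit left pad) + 4 * cnt_col (main loop) + 1 (masked tail).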
- "0: \n" - "prfm pldl1keep, [%[inptr0]] \n" - "prfm pldl1keep, [%[inptr1]] \n" - "prfm pldl1keep, [%[inptr2]] \n" - "prfm pldl1keep, [%[inptr3]] \n" - "prfm pldl1keep, [%[inptr4]] \n" - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "ext v10.16b, %[vzero].16b, v1.16b, #12 \n" // v10 = {0,1,3,5} - - // r0 - "fmul v11.4s, v0.4s, %[w0].s[1] \n" // {0,2,4,6} * w01 - "fmul v12.4s, v1.4s, %[w0].s[2] \n" // {1,3,5,7} * w02 - "fmla v16.4s, v10.4s, %[w0].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v3.16b, #12 \n" // v10 = {0,1,3,5} - - "sub %[inptr0], %[inptr0], #4 \n" - "sub %[inptr1], %[inptr1], #4 \n" - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[1] \n" // {0,2,4,6} * w01 - "fmla v12.4s, v3.4s, %[w1].s[2] \n" // {1,3,5,7} * w02 - "fmla v16.4s, v10.4s, %[w1].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v5.16b, #12 \n" // v10 = {0,1,3,5} - - "sub %[inptr2], %[inptr2], #4 \n" - "sub %[inptr3], %[inptr3], #4 \n" - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[1] \n" // {0,2,4,6} * w01 - "fmla v11.4s, v4.4s, %[w2].s[1] \n" // {0,2,4,6} * w01 - - "fmul v14.4s, v5.4s, %[w0].s[2] \n" // {1,3,5,7} * w02 - "fmla v12.4s, v5.4s, %[w2].s[2] \n" // {1,3,5,7} * w02 - - "fmla v17.4s, v10.4s, %[w0].s[0] \n" // {0,1,3,5} * w00 - "fmla v16.4s, v10.4s, %[w2].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v7.16b, #12 \n" // v10 = {0,1,3,5} - - "sub %[inptr4], %[inptr4], #4 \n" - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[1] \n" // {0,2,4,6} * w01 - "fmla v14.4s, v7.4s, %[w1].s[2] \n" // {1,3,5,7} * w02 - "fmla v17.4s, v10.4s, %[w1].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v9.16b, #12 \n" // v10 = {0,1,3,5} - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[1] \n" // {0,2,4,6} * w01 - "fmla v14.4s, v9.4s, %[w2].s[2] \n" // {1,3,5,7} * w02 - "fmla v17.4s, v10.4s, %[w2].s[0] \n" // {0,1,3,5} * w00 - - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - - "fadd v17.4s, v17.4s, v13.4s \n" - - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - "ld1 {v15.4s}, [%[inptr0]] \n" - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - - "fadd v17.4s, v17.4s, v14.4s \n" - - "ld1 {v18.4s}, [%[inptr1]] \n" - "ld1 {v19.4s}, [%[inptr2]] \n" - - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - - "cmp %[cnt], #1 \n" - - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "blt 1f \n" - // mid - "2: \n" - // r0 - "fmul v11.4s, v0.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmul v12.4s, v1.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v2.16b, v18.16b, #4 \n" // v10 = {2,4,6,8} - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v12.4s, v3.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w1].s[2] \n" 
// {2,4,6,8} * w02 - - "ext v10.16b, v4.16b, v19.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmla v11.4s, v4.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - - "fmul v14.4s, v5.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v12.4s, v5.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - - "fmla v17.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - "fmla v16.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v6.16b, v20.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v7.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v8.16b, v21.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v9.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "fadd v17.4s, v17.4s, v13.4s \n" - - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - - "fadd v17.4s, v17.4s, v14.4s \n" - - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - "subs %[cnt], %[cnt], #1 \n" - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "bne 2b \n" - - // right - "1: \n" - "cmp %[remain], #1 \n" - "blt 4f \n" - "3: \n" - "bif v0.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v1.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "bif v2.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v3.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "bif v4.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v5.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "ext v10.16b, v0.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - "bif v6.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v7.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - // r0 - "fmul v11.4s, v0.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmul v12.4s, v1.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v2.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - "bif v8.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v9.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v12.4s, v3.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v4.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmla v11.4s, v4.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - - "fmul v14.4s, v5.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v12.4s, v5.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - - "fmla v17.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - "fmla v16.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v6.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v7.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, 
v8.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - "ld1 {v0.4s}, [%[outptr0]] \n" - - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - "ld1 {v1.4s}, [%[outptr1]] \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v9.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "bif v16.16b, v0.16b, %[wmask].16b \n" // pipei - - "fadd v17.4s, v17.4s, v13.4s \n" - - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "fadd v17.4s, v17.4s, v14.4s \n" - - "bif v17.16b, v1.16b, %[wmask].16b \n" // pipei - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - "4: \n" - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); - doutr0 = doutr0 + 2 * w_out; - } -#else - for (int i = 0; i < h_in; i += 2) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - - doutr0_ptr = doutr0; - - if (i == 0) { - din0_ptr = zero_ptr; - din1_ptr = dr0; - din2_ptr = dr1; - dr0 = dr1; - dr1 = dr2; - dr2 = dr1 + w_in; - } else { - dr0 = dr2; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - } - - //! process bottom pad - if (i + 2 > h_in) { - switch (i + 2 - h_in) { - case 2: - din1_ptr = zero_ptr; - case 1: - din2_ptr = zero_ptr; - default: - break; - } - } - int cnt = cnt_col; - unsigned int* mask_ptr = dmask; - asm volatile( - // top - // Load up 12 elements (3 vectors) from each of 8 sources. - "0: \n" - "vmov.u32 q9, #0 \n" - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r1\n" // v11={0,2,4,6} v12={1,3,5,7}, q10, q11 - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // v11={0,2,4,6} v12={1,3,5,7}, q12, q13 - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r1\n" // v13={0,2,4,6} v14={1,3,5,7}, q14, q15 - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - - "vext.32 q6, q9, q11, #3 @ shift right 1 " - "data\n" // q2 = {0,1,3,5} - "vext.32 q7, q9, q13, #3 @ shift right 1 " - "data\n" // q6 = {0,1,3,5} - "vext.32 q8, q9, q15, #3 @ shift right 1 " - "data\n" // q6 = {0,1,3,5} - - "vmul.f32 q4, q10, %e[wr0][1] @ mul weight 1, " - "out0\n" // q11 * w01 - "vmul.f32 q5, q11, %f[wr0][0] @ mul weight 1, " - "out0\n" // q12 * w02 - "vmla.f32 q3, q6, %e[wr0][0] @ mul weight 1, " - "out0\n" // q6 * w00 - - "sub %[din0_ptr], #4 @ inpitr0 - 1\n" - "sub %[din1_ptr], #4 @ inpitr1 - 1\n" - "sub %[din2_ptr], #4 @ inpitr2 - 1\n" - - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q12, %e[wr1][1] @ mul weight 1, " - "out0\n" // q11 * w01 - "vmla.f32 q5, q13, %f[wr1][0] @ mul weight 1, " - "out0\n" // q12 * w02 - "vmla.f32 q3, q7, %e[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w00 - - "vld2.32 {d24-d27}, [%[din1_ptr]]! 
@ load din r1\n" // v4={0,2,4,6} v5={1,3,5,7} - - "vmla.f32 q4, q14, %e[wr2][1] @ mul weight 1, " - "out1\n" // q0 * w01 - "vmla.f32 q5, q15, %f[wr2][0] @ mul weight 1, " - "out1\n" // q1 * w02 - "vmla.f32 q3, q8, %e[wr2][0] @ mul weight 1, " - "out1\n" // q2 * w00 - - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r1\n" // v4={0,2,4,6} v5={1,3,5,7} - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vst1.32 {d6-d7}, [%[outptr]]! \n" - "cmp %[cnt], #1 \n" - "blt 1f \n" - // mid - "2: \n" - "vld1.32 {d16}, [%[din0_ptr]] @ load din r0\n" // q2={8,10,12,14} - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - "vext.32 q6, q10, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.32 {d16}, [%[din1_ptr]] @ load din r1\n" // q2={8,10,12,14} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // q0 * w00 - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w02 - - "vext.32 q7, q12, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.32 {d16}, [%[din2_ptr]] @ load din r1\n" // q2={8,10,12,14} - - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w02 - - "vext.32 q6, q14, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, " - "out0\n" // q6 * w02 - - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" // v4={0,2,4,6} v5={1,3,5,7} - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "subs %[cnt], #1 \n" - - "vst1.32 {d6-d7}, [%[outptr]]! \n" - "bne 2b \n" - - // right - "1: \n" - "cmp %[remain], #1 \n" - "blt 3f \n" - - "vld1.f32 {d12-d15}, [%[mask_ptr]]! 
@ load mask\n" - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - - "vbif q10, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q11, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q12, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q13, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q14, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q15, q9, q7 @ bit select, deal " - "with right pad\n" - - "vext.32 q6, q10, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vext.32 q7, q12, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // q0 * w00 - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w02 - - "vext.32 q6, q14, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.f32 {d20-d21}, [%[outptr]] @ load output\n" - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w02 - - "vld1.f32 {d22-d23}, [%[mask_ptr]] @ load mask\n" - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, " - "out0\n" // q6 * w02 - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vbif.f32 q3, q10, q11 @ write mask\n" - - "vst1.32 {d6-d7}, [%[outptr]]! \n" - "3: \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [outptr] "+r"(doutr0_ptr), - [cnt] "+r"(cnt), - [mask_ptr] "+r"(mask_ptr) - : [remain] "r"(cnt_remain), - [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - - doutr0 = doutr0 + w_out; - } -#endif - } - } -} - -// 4line -void conv_depthwise_3x3s1p1_bias_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - //! pad is done implicit - const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; - //! 
for 4x6 convolution window - const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; - - // printf("conv3x3_dw start \n"); - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - int w_stride = 9; - - int tile_w = (w_in + 3) >> 2; - int tile_h = (h_in + 3) >> 2; - int cnt_col = tile_w - 2; - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - - unsigned int size_pad_right = (unsigned int)(1 + (tile_w << 2) - w_in); - int size_pad_bottom = (unsigned int)(1 + (tile_h << 2) - h_in); - - uint32x4_t vmask_rp1 = - vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_rp2 = - vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_result = - vcgtq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); - - unsigned int vmask[8]; - vst1q_u32(vmask, vmask_rp1); - vst1q_u32(vmask + 4, vmask_rp2); - - unsigned int rmask[4]; - vst1q_u32(rmask, vmask_result); - - float32x4_t vzero = vdupq_n_f32(0.f); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for -#ifdef __aarch64__ - for (int c = 0; c < ch_in; c++) { - float* dout_ptr = dout_batch + c * size_out_channel; - - const float* din_ch_ptr = din_batch + c * size_in_channel; - - float bias_val = flag_bias ? bias[c] : 0.f; - float vbias[4] = {bias_val, bias_val, bias_val, bias_val}; - - const float* wei_ptr = weights + c * w_stride; - - float32x4_t wr0 = vld1q_f32(wei_ptr); - float32x4_t wr1 = vld1q_f32(wei_ptr + 3); - float32x4_t wr2 = vld1q_f32(wei_ptr + 6); - - float* doutr0 = dout_ptr; - float* doutr1 = doutr0 + w_out; - float* doutr2 = doutr1 + w_out; - float* doutr3 = doutr2 + w_out; - - const float* dr0 = din_ch_ptr; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - const float* dr4 = dr3 + w_in; - const float* dr5 = dr4 + w_in; - - const float* din_ptr0 = dr0; - const float* din_ptr1 = dr1; - const float* din_ptr2 = dr2; - const float* din_ptr3 = dr3; - const float* din_ptr4 = dr4; - const float* din_ptr5 = dr5; - - for (int i = 0; i < h_in; i += 4) { - //! process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - din_ptr3 = dr3; - din_ptr4 = dr4; - din_ptr5 = dr5; - - doutr0 = dout_ptr; - doutr1 = doutr0 + w_out; - doutr2 = doutr1 + w_out; - doutr3 = doutr2 + w_out; - if (i == 0) { - din_ptr0 = zero_ptr; - din_ptr1 = dr0; - din_ptr2 = dr1; - din_ptr3 = dr2; - din_ptr4 = dr3; - din_ptr5 = dr4; - dr0 = dr3; - dr1 = dr4; - dr2 = dr5; - } else { - dr0 = dr4; - dr1 = dr5; - dr2 = dr1 + w_in; - } - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - dr5 = dr4 + w_in; - - //! process bottom pad - if (i + 5 > h_in) { - switch (i + 5 - h_in) { - case 5: - din_ptr1 = zero_ptr; - case 4: - din_ptr2 = zero_ptr; - case 3: - din_ptr3 = zero_ptr; - case 2: - din_ptr4 = zero_ptr; - case 1: - din_ptr5 = zero_ptr; - default: - break; - } - } - //! 
process bottom remain - if (i + 4 > h_out) { - switch (i + 4 - h_out) { - case 3: - doutr1 = write_ptr; - case 2: - doutr2 = write_ptr; - case 1: - doutr3 = write_ptr; - default: - break; - } - } - - int cnt = cnt_col; - asm volatile( - "PRFM PLDL1KEEP, [%[din_ptr0]] \n" - "PRFM PLDL1KEEP, [%[din_ptr1]] \n" - "PRFM PLDL1KEEP, [%[din_ptr2]] \n" - "PRFM PLDL1KEEP, [%[din_ptr3]] \n" - "PRFM PLDL1KEEP, [%[din_ptr4]] \n" - "PRFM PLDL1KEEP, [%[din_ptr5]] \n" - "movi v21.4s, #0x0\n" /* out0 = 0 */ - - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "ext v16.16b, %[vzero].16b, v0.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ - - // left - // r0 - "fmla v12.4s, v0.4s, %[w0].s[1]\n" /* outr00 += din0_0123 * - w0[1]*/ - - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "sub %[din_ptr0], %[din_ptr0], #4 \n" /* din_ptr0-- */ - "sub %[din_ptr1], %[din_ptr1], #4 \n" /* din_ptr0-- */ - - "fmla v12.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din0_0012 * - w0[0]*/ - - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - "sub %[din_ptr2], %[din_ptr2], #4 \n" /* din_ptr0-- */ - "sub %[din_ptr3], %[din_ptr3], #4 \n" /* din_ptr0-- */ - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_1234 * - w0[2]*/ - - "ext v16.16b, %[vzero].16b, v2.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234 */ - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[1]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v2.4s, %[w1].s[1]\n" /* outr00 += din1_0123 * - w1[1]*/ - "sub %[din_ptr4], %[din_ptr4], #4 \n" /* din_ptr0-- */ - "sub %[din_ptr5], %[din_ptr5], #4 \n" /* din_ptr0-- */ - - "fmla v13.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v4.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[1]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v13.4s , v4.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - "fmla v12.4s , v4.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w2[1]*/ - - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[0]\n" /* 
outr00 += din2_0123 * - w1[1]*/ - - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v6.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[1]\n" /*outr00 += din2_0123 * - w0[1]*/ - "fmla v14.4s , v6.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - "fmla v13.4s , v6.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w2[1]*/ - - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * - w1[1]*/ - - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v8.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234 */ - - // r4 - "fmla v15.4s , v8.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - "fmla v14.4s , v8.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w2[1]*/ - - "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ - "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ - - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * - w1[1]*/ - - "st1 {v12.4s}, [%[doutr0]], #16 \n" /* vst1q_f32() */ - "st1 {v13.4s}, [%[doutr1]], #16 \n" /* vst1q_f32() */ - - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v10.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234 */ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - // r5 - "fmla v15.4s , v10.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - - "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ - - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" /* vst1q_f32() */ - - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - - "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ - - "st1 {v15.4s}, [%[doutr3]], #16 \n" /* vst1q_f32() */ - "cmp %[cnt], #1 \n" - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "blt 3f \n" - // mid - "1: \n" - // r0 - "fmla v12.4s , v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v12.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - 
w0[1]*/ - - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ - - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v12.4s}, [%[doutr0]], #16 \n" - - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ - - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v13.4s}, [%[doutr1]], #16 \n" - - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" 
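/* Scheduling note for this "mid" block: four output rows are computed per
   pass (accumulators v12-v15) from six input rows. Each weight row is
   applied to the base vector (columns 0123) and to the two windows built
   with `ext` (columns 1234 and 2345); the `ld1` loads for the next
   iteration are interleaved between the `fmla`s, and every accumulator is
   re-seeded from [%[bias_val]] right after its `st1`, so load latency
   hides behind the arithmetic. The fused ReLU is the `fmax` against
   %[vzero] issued just before each store. */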
/* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ - - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" - - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - - "subs %[cnt], %[cnt], #1 \n" - - "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ - - "st1 {v15.4s}, [%[doutr3]], #16 \n" - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "bne 1b \n" - - // right - "3: \n" - "ld1 {v18.4s, v19.4s}, [%[vmask]] \n" - "ld1 {v22.4s}, [%[doutr0]] \n" - "ld1 {v23.4s}, [%[doutr1]] \n" - "ld1 {v24.4s}, [%[doutr2]] \n" - "ld1 {v25.4s}, [%[doutr3]] \n" - - "bif v0.16b, %[vzero].16b, v18.16b \n" - "bif v1.16b, %[vzero].16b, v19.16b \n" - "bif v2.16b, %[vzero].16b, v18.16b \n" - "bif v3.16b, %[vzero].16b, v19.16b \n" - - "bif v4.16b, %[vzero].16b, v18.16b \n" - "bif v5.16b, %[vzero].16b, v19.16b \n" - "bif v6.16b, %[vzero].16b, v18.16b \n" - "bif v7.16b, %[vzero].16b, v19.16b \n" - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - - // r0 - "fmla v12.4s, v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v8.16b, %[vzero].16b, v18.16b \n" - "bif v9.16b, %[vzero].16b, v19.16b \n" - "bif v10.16b, %[vzero].16b, v18.16b \n" - "bif v11.16b, %[vzero].16b, v19.16b \n" - - "fmla v12.4s, v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v18.4s}, [%[rmask]] \n" - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ - "ext 
v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ - - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "bif v12.16b, v22.16b, v18.16b \n" - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "st1 {v12.4s}, [%[doutr0]], #16 \n" - "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ - - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "bif v13.16b, v23.16b, v18.16b \n" - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ - - "st1 {v13.4s}, [%[doutr1]], #16 \n" - - // r3 - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ - - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "bif v14.16b, v24.16b, v18.16b \n" - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" - - "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ - - "bif v15.16b, v25.16b, v18.16b \n" - - "st1 {v15.4s}, [%[doutr3]], #16 \n" - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - dout_ptr = dout_ptr + 4 * w_out; - } - } -#else - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float bias_val = flag_bias ? 
bias[i] : 0.f; - - float* dout_channel = dout_batch + i * size_out_channel; - - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - - const float* din0_ptr = nullptr; - const float* din1_ptr = nullptr; - const float* din2_ptr = nullptr; - const float* din3_ptr = nullptr; - - float* doutr0 = nullptr; - float* doutr1 = nullptr; - - float* ptr_zero = const_cast(zero); - - for (int i = 0; i < h_in; i += 2) { - //! process top pad pad_h = 1 - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - din3_ptr = dr3; - - doutr0 = dout_channel; - doutr1 = dout_channel + w_out; - // unsigned int* rst_mask = rmask; - - if (i == 0) { - din0_ptr = zero_ptr; - din1_ptr = dr0; - din2_ptr = dr1; - din3_ptr = dr2; - dr0 = dr1; - dr1 = dr2; - dr2 = dr3; - dr3 = dr2 + w_in; - } else { - dr0 = dr2; - dr1 = dr3; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - } - //! process bottom pad - if (i + 3 > h_in) { - switch (i + 3 - h_in) { - case 3: - din1_ptr = zero_ptr; - case 2: - din2_ptr = zero_ptr; - case 1: - din3_ptr = zero_ptr; - default: - break; - } - } - //! process bottom remain - if (i + 2 > h_out) { - doutr1 = write_ptr; - } - int cnt = cnt_col; - unsigned int* rmask_ptr = rmask; - unsigned int* vmask_ptr = vmask; - asm volatile( - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vld1.32 {d16-d18}, [%[din0_ptr]]! @ load din r0\n" - "vld1.32 {d20-d22}, [%[din1_ptr]]! @ load din r1\n" - "vld1.32 {d24-d26}, [%[din2_ptr]]! @ load din r2\n" - "vld1.32 {d28-d30}, [%[din3_ptr]]! @ load din r3\n" - - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - "vdup.32 q5, %[bias_val] @ and \n" // q5 - // = - // vbias - - "vext.32 q6, %q[vzero], q8, #3 @ 0012\n" - "vext.32 q7, q8, q9, #1 @ 1234\n" - - // left - // r0 - "vmla.f32 q4, q8, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "sub %[din0_ptr], #12 @ 1pad + 2 float data overlap\n" - "sub %[din1_ptr], #12 @ 1pad + 2 float data overlap\n" - "sub %[din2_ptr], #12 @ 1pad + 2 float data overlap\n" - "sub %[din3_ptr], #12 @ 1pad + 2 float data overlap\n" - - "vmla.f32 q4, q6, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, %q[vzero], q10, #3 @ 0012\n" - "vext.32 q7, q10, q11, #1 @ 1234\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q10, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" - "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q6, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[2]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, %q[vzero], q12, #3 @ 0012\n" - "vext.32 q7, q12, q13, #1 @ 1234\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q12, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d24-d25}, [%[din2_ptr]]! 
@ load din r0\n" - - "vmla.f32 q5, q6, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" - - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[2]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, %q[vzero], q14, #3 @ 0012\n" - "vext.32 q7, q14, q15, #1 @ 1234\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" - "vmax.f32 q4, q4, %q[vzero] @ relu \n" - - "vmla.f32 q5, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" - - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - - "vmax.f32 q5, q5, %q[vzero] @ relu \n" - - "cmp %[cnt], #1 @ check whether has " - "mid cols\n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add " - "pointer\n" - - "vdup.32 q5, %[bias_val] @ and \n" // q5 - // = - // vbias - "blt 3f @ jump to main loop start " - "point\n" - - // mid - "1: @ right pad entry\n" - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d24-d25}, [%[din2_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" - "vmax.f32 q4, q4, %q[vzero] @ relu \n" - - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - - "vmax.f32 q5, q5, %q[vzero] @ relu \n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! 
@ store result, add " - "pointer\n" - - "subs %[cnt], #1 @ loop count minus 1\n" - - "vdup.32 q5, %[bias_val] @ and \n" // q4 - // = - // vbias - - "bne 1b @ jump to main loop start " - "point\n" - - // right - "3: @ right pad entry\n" - "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" - - "vld1.32 {d27}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d31}, [%[vmask]]! @ load din r0\n" - - "vbif d16, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d17, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d18, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vbif d20, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d21, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d22, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "vbif d24, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d25, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d26, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vbif d28, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d29, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d30, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d19}, [%[rmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[rmask]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[dout_ptr1]] @ load din r0\n" - "vld1.32 {d20-d21}, [%[dout_ptr2]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vmax.f32 q4, q4, %q[vzero] @ relu \n" - - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vbif d8, d16, d19 @ bit select, deal with right pad\n" - "vbif d9, d17, d23 @ bit select, deal with right pad\n" - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmax.f32 q5, q5, %q[vzero] @ relu \n" - - "vbif d10, d20, d19 @ bit select, deal with right " - "pad\n" - "vbif d11, d21, d23 @ bit select, deal with right " - "pad\n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! 
@ store result, add " - "pointer\n" - - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [din3_ptr] "+r"(din3_ptr), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - dout_channel += 2 * w_out; - } //! end of processing mid rows - } -#endif - } -} -/** - * \brief depthwise convolution kernel 3x3, stride 2, with reulu - */ -// w_in > 7 -void conv_depthwise_3x3s2p1_bias_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - int size_pad_bottom = h_out * 2 - h_in; - - int cnt_col = (w_out >> 2) - 2; - int size_right_remain = w_in - (7 + cnt_col * 8); - if (size_right_remain >= 9) { - cnt_col++; - size_right_remain -= 8; - } - int cnt_remain = (size_right_remain == 8) ? 4 : (w_out % 4); // - - int size_right_pad = w_out * 2 - w_in; - - uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain), - vld1q_s32(right_pad_idx)); // 0 2 4 6 - uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain), - vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - uint32x4_t wmask = - vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx)); // 0 1 2 3 - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - - unsigned int dmask[12]; - - vst1q_u32(dmask, vmask_rp1); - vst1q_u32(dmask + 4, vmask_rp2); - vst1q_u32(dmask + 8, wmask); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - - float32x4_t vzero = vdupq_n_f32(0.f); - - float32x4_t wbias; - float bias_c = 0.f; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - bias_c = bias[i]; - } else { - wbias = vdupq_n_f32(0.f); - } - - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - const float* dr4 = dr3 + w_in; - - const float* din0_ptr = dr0; - const float* din1_ptr = dr1; - const float* din2_ptr = dr2; - const float* din3_ptr = dr3; - const float* din4_ptr = dr4; - - float* doutr0 = dout_channel; - float* doutr0_ptr = nullptr; - float* doutr1_ptr = nullptr; - -#ifdef __aarch64__ - for (int i = 0; i < h_in; i += 4) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - din3_ptr = dr3; - din4_ptr = dr4; - - doutr0_ptr = doutr0; - doutr1_ptr = doutr0 + w_out; - - if (i == 0) { - din0_ptr = zero_ptr; - din1_ptr = dr0; - din2_ptr = dr1; - din3_ptr = dr2; - din4_ptr = dr3; - dr0 = dr3; - dr1 = dr4; - } else { - dr0 = dr4; - dr1 = dr0 + w_in; - } - dr2 = dr1 + 
w_in; - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - - //! process bottom pad - if (i + 4 > h_in) { - switch (i + 4 - h_in) { - case 4: - din1_ptr = zero_ptr; - case 3: - din2_ptr = zero_ptr; - case 2: - din3_ptr = zero_ptr; - case 1: - din4_ptr = zero_ptr; - default: - break; - } - } - //! process output pad - if (i / 2 + 2 > h_out) { - doutr1_ptr = write_ptr; - } - int cnt = cnt_col; - asm volatile( - // top - // Load up 12 elements (3 vectors) from each of 8 sources. - "0: \n" - "prfm pldl1keep, [%[inptr0]] \n" - "prfm pldl1keep, [%[inptr1]] \n" - "prfm pldl1keep, [%[inptr2]] \n" - "prfm pldl1keep, [%[inptr3]] \n" - "prfm pldl1keep, [%[inptr4]] \n" - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "ext v10.16b, %[vzero].16b, v1.16b, #12 \n" // v10 = {0,1,3,5} - - // r0 - "fmul v11.4s, v0.4s, %[w0].s[1] \n" // {0,2,4,6} * w01 - "fmul v12.4s, v1.4s, %[w0].s[2] \n" // {1,3,5,7} * w02 - "fmla v16.4s, v10.4s, %[w0].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v3.16b, #12 \n" // v10 = {0,1,3,5} - - "sub %[inptr0], %[inptr0], #4 \n" - "sub %[inptr1], %[inptr1], #4 \n" - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[1] \n" // {0,2,4,6} * w01 - "fmla v12.4s, v3.4s, %[w1].s[2] \n" // {1,3,5,7} * w02 - "fmla v16.4s, v10.4s, %[w1].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v5.16b, #12 \n" // v10 = {0,1,3,5} - - "sub %[inptr2], %[inptr2], #4 \n" - "sub %[inptr3], %[inptr3], #4 \n" - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[1] \n" // {0,2,4,6} * w01 - "fmla v11.4s, v4.4s, %[w2].s[1] \n" // {0,2,4,6} * w01 - - "fmul v14.4s, v5.4s, %[w0].s[2] \n" // {1,3,5,7} * w02 - "fmla v12.4s, v5.4s, %[w2].s[2] \n" // {1,3,5,7} * w02 - - "fmla v17.4s, v10.4s, %[w0].s[0] \n" // {0,1,3,5} * w00 - "fmla v16.4s, v10.4s, %[w2].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v7.16b, #12 \n" // v10 = {0,1,3,5} - - "sub %[inptr4], %[inptr4], #4 \n" - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[1] \n" // {0,2,4,6} * w01 - "fmla v14.4s, v7.4s, %[w1].s[2] \n" // {1,3,5,7} * w02 - "fmla v17.4s, v10.4s, %[w1].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v9.16b, #12 \n" // v10 = {0,1,3,5} - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[1] \n" // {0,2,4,6} * w01 - "fmla v14.4s, v9.4s, %[w2].s[2] \n" // {1,3,5,7} * w02 - "fmla v17.4s, v10.4s, %[w2].s[0] \n" // {0,1,3,5} * w00 - - "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ - - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - - "fadd v17.4s, v17.4s, v13.4s \n" - - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - "ld1 {v15.4s}, [%[inptr0]] \n" - - "fadd v17.4s, v17.4s, v14.4s \n" - - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - - "ld1 {v18.4s}, [%[inptr1]] \n" - "ld1 {v19.4s}, [%[inptr2]] \n" - - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - - "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ - - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - - "cmp %[cnt], #1 \n" - - 
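/* The left-edge block above has just stored its two output rows.
   `cmp %[cnt], #1` decides whether any full four-wide middle blocks
   remain; the `blt 1f` below skips straight to the right-edge code at
   label 1 when none do. The `and` of %[vbias] with itself is in effect a
   vector register copy that re-seeds the v17 accumulator with the bias
   for the next block. */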
"and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "blt 1f \n" - // mid - "2: \n" - // r0 - "fmul v11.4s, v0.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmul v12.4s, v1.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v2.16b, v18.16b, #4 \n" // v10 = {2,4,6,8} - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v12.4s, v3.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v4.16b, v19.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmla v11.4s, v4.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - - "fmul v14.4s, v5.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v12.4s, v5.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - - "fmla v17.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - "fmla v16.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v6.16b, v20.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v7.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v8.16b, v21.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v9.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ - - "fadd v17.4s, v17.4s, v13.4s \n" - - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "fadd v17.4s, v17.4s, v14.4s \n" - - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - "subs %[cnt], %[cnt], #1 \n" - - "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "bne 2b \n" - - // right - "1: \n" - "cmp %[remain], #1 \n" - "blt 4f \n" - "3: \n" - "bif v0.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v1.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "bif v2.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v3.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "bif v4.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v5.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "ext v10.16b, v0.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - "bif v6.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v7.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - // r0 - "fmul v11.4s, v0.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmul v12.4s, v1.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v2.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - "bif v8.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v9.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v12.4s, v3.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext 
v10.16b, v4.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmla v11.4s, v4.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - - "fmul v14.4s, v5.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v12.4s, v5.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - - "fmla v17.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - "fmla v16.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v6.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v7.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v8.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - "ld1 {v0.4s}, [%[outptr0]] \n" - - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - "ld1 {v1.4s}, [%[outptr1]] \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v9.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ - - "fadd v17.4s, v17.4s, v13.4s \n" - - "bif v16.16b, v0.16b, %[wmask].16b \n" // pipei - - "fadd v17.4s, v17.4s, v14.4s \n" - - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ - - "bif v17.16b, v1.16b, %[wmask].16b \n" // pipei - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - "4: \n" - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); - doutr0 = doutr0 + 2 * w_out; - } -#else - - for (int i = 0; i < h_in; i += 2) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - - doutr0_ptr = doutr0; - - if (i == 0) { - din0_ptr = zero_ptr; - din1_ptr = dr0; - din2_ptr = dr1; - dr0 = dr1; - dr1 = dr2; - dr2 = dr1 + w_in; - } else { - dr0 = dr2; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - } - - //! process bottom pad - if (i + 2 > h_in) { - switch (i + 2 - h_in) { - case 2: - din1_ptr = zero_ptr; - case 1: - din2_ptr = zero_ptr; - default: - break; - } - } - int cnt = cnt_col; - - unsigned int* mask_ptr = dmask; - asm volatile( - // top - // Load up 12 elements (3 vectors) from each of 8 sources. - "0: \n" - "vmov.u32 q9, #0 \n" - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r1\n" // v11={0,2,4,6} v12={1,3,5,7}, q10, q11 - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // v11={0,2,4,6} v12={1,3,5,7}, q12, q13 - "vld2.32 {d28-d31}, [%[din2_ptr]]! 
@ load din r1\n" // v13={0,2,4,6} v14={1,3,5,7}, q14, q15 - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - - "vext.32 q6, q9, q11, #3 @ shift right 1 " - "data\n" // q2 = {0,1,3,5} - "vext.32 q7, q9, q13, #3 @ shift right 1 " - "data\n" // q6 = {0,1,3,5} - "vext.32 q8, q9, q15, #3 @ shift right 1 " - "data\n" // q6 = {0,1,3,5} - - "vmul.f32 q4, q10, %e[wr0][1] @ mul weight 1, " - "out0\n" // q11 * w01 - "vmul.f32 q5, q11, %f[wr0][0] @ mul weight 1, " - "out0\n" // q12 * w02 - "vmla.f32 q3, q6, %e[wr0][0] @ mul weight 1, " - "out0\n" // q6 * w00 - - "sub %[din0_ptr], #4 @ inpitr0 - 1\n" - "sub %[din1_ptr], #4 @ inpitr1 - 1\n" - "sub %[din2_ptr], #4 @ inpitr2 - 1\n" - - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q12, %e[wr1][1] @ mul weight 1, " - "out0\n" // q11 * w01 - "vmla.f32 q5, q13, %f[wr1][0] @ mul weight 1, " - "out0\n" // q12 * w02 - "vmla.f32 q3, q7, %e[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w00 - - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // v4={0,2,4,6} v5={1,3,5,7} - - "vmla.f32 q4, q14, %e[wr2][1] @ mul weight 1, " - "out1\n" // q0 * w01 - "vmla.f32 q5, q15, %f[wr2][0] @ mul weight 1, " - "out1\n" // q1 * w02 - "vmla.f32 q3, q8, %e[wr2][0] @ mul weight 1, " - "out1\n" // q2 * w00 - - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r1\n" // v4={0,2,4,6} v5={1,3,5,7} - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vmax.f32 q3, q3, q9 @ relu \n" - - "vst1.32 {d6-d7}, [%[outptr]]! \n" - "cmp %[cnt], #1 \n" - "blt 1f \n" - // mid - "2: \n" - "vld1.32 {d16}, [%[din0_ptr]] @ load din r0\n" // q2={8,10,12,14} - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - "vext.32 q6, q10, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.32 {d16}, [%[din1_ptr]] @ load din r1\n" // q2={8,10,12,14} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // q0 * w00 - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w02 - - "vext.32 q7, q12, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.32 {d16}, [%[din2_ptr]] @ load din r1\n" // q2={8,10,12,14} - - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w02 - - "vext.32 q6, q14, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, " - "out0\n" // q6 * w02 - - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" // v4={0,2,4,6} v5={1,3,5,7} - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vmax.f32 q3, q3, q9 @ relu \n" - - "subs %[cnt], #1 \n" - - "vst1.32 {d6-d7}, [%[outptr]]! \n" - "bne 2b \n" - - // right - "1: \n" - "cmp %[remain], #1 \n" - "blt 3f \n" - - "vld1.f32 {d12-d15}, [%[mask_ptr]]! 
@ load mask\n" - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - - "vbif q10, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q11, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q12, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q13, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q14, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q15, q9, q7 @ bit select, deal " - "with right pad\n" - - "vext.32 q6, q10, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vext.32 q7, q12, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // q0 * w00 - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w02 - - "vext.32 q6, q14, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.f32 {d20-d21}, [%[outptr]] @ load output\n" - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w02 - - "vld1.f32 {d22-d23}, [%[mask_ptr]] @ load mask\n" - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, " - "out0\n" // q6 * w02 - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vmax.f32 q3, q3, q9 @ relu \n" - - "vbif.f32 q3, q10, q11 @ write mask\n" - - "vst1.32 {d6-d7}, [%[outptr]]! \n" - "3: \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [outptr] "+r"(doutr0_ptr), - [cnt] "+r"(cnt), - [mask_ptr] "+r"(mask_ptr) - : [remain] "r"(cnt_remain), - [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - - doutr0 = doutr0 + w_out; - } -#endif - } - } -} -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, - * width <= 4 - */ -void conv_depthwise_3x3s1p1_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - //! 3x3s1 convolution, implemented by direct algorithm - //! pad is done implicit - //! 
for 4x6 convolution window - const int right_pad_idx[4] = {3, 2, 1, 0}; - const float zero[4] = {0.f, 0.f, 0.f, 0.f}; - - float32x4_t vzero = vdupq_n_f32(0.f); - uint32x4_t vmask_rp = - vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(4 - w_in)); - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - float* dout_channel = dout_batch + i * size_out_channel; - const float* din_channel = din_batch + i * size_in_channel; - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float32x4_t wbias; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - } else { - wbias = vdupq_n_f32(0.f); - } - - int hs = -1; - int he = 3; - - float out_buf1[4]; - float out_buf2[4]; - float trash_buf[4]; - - int h_cnt = (h_out + 1) >> 1; - float* doutr0 = dout_channel; - float* doutr1 = dout_channel + w_out; - - for (int j = 0; j < h_cnt; ++j) { - const float* dr0 = din_channel + hs * w_in; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - - if (hs == -1) { - dr0 = zero; - } - - switch (he - h_in) { - case 2: - dr2 = zero; - doutr1 = trash_buf; - case 1: - dr3 = zero; - default: - break; - } -#ifdef __aarch64__ - asm volatile( - "prfm pldl1keep, [%[din0]]\n" - "prfm pldl1keep, [%[din1]]\n" - "prfm pldl1keep, [%[din2]]\n" - "prfm pldl1keep, [%[din3]]\n" - - "ld1 {v0.4s}, [%[din0]], #16\n" - "ld1 {v1.4s}, [%[din1]], #16\n" - "ld1 {v2.4s}, [%[din2]], #16\n" - "ld1 {v3.4s}, [%[din3]], #16\n" - - "bif v0.16b, %[zero].16b, %[mask].16b\n" // d0_1234 - "bif v1.16b, %[zero].16b, %[mask].16b\n" // d1_1234 - "bif v2.16b, %[zero].16b, %[mask].16b\n" // d2_1234 - "bif v3.16b, %[zero].16b, %[mask].16b\n" // d3_1234 - - "ext v4.16b, %[zero].16b, v0.16b, #12\n" // d0_0123 - "ext v5.16b, %[zero].16b, v1.16b, #12\n" // d1_0123 - "ext v6.16b, %[zero].16b, v2.16b, #12\n" // d2_0123 - "ext v7.16b, %[zero].16b, v3.16b, #12\n" // d3_0123 - - "ext v8.16b, v0.16b, %[zero].16b, #4\n" // d0_2340 - "ext v9.16b, v1.16b, %[zero].16b, #4\n" // d1_2340 - "ext v10.16b, v2.16b, %[zero].16b, #4\n" // d2_2340 - "ext v11.16b, v3.16b, %[zero].16b, #4\n" // d3_2340 - - "fmul v12.4s, v0.4s, %[wr0].s[1]\n" - "fmul v13.4s, v1.4s, %[wr0].s[1]\n" - - "fmul v14.4s, v1.4s, %[wr1].s[1]\n" - "fmul v15.4s, v2.4s, %[wr1].s[1]\n" - - "fmul v16.4s, v2.4s, %[wr2].s[1]\n" - "fmul v17.4s, v3.4s, %[wr2].s[1]\n" - - "fmla v12.4s, v4.4s, %[wr0].s[0]\n" - "fmla v13.4s, v5.4s, %[wr0].s[0]\n" - - "fmla v14.4s, v5.4s, %[wr1].s[0]\n" - "fmla v15.4s, v6.4s, %[wr1].s[0]\n" - - "fmla v16.4s, v6.4s, %[wr2].s[0]\n" - "fmla v17.4s, v7.4s, %[wr2].s[0]\n" - - "fmla v12.4s, v8.4s, %[wr0].s[2]\n" - "fmla v13.4s, v9.4s, %[wr0].s[2]\n" - - "fmla v14.4s, v9.4s, %[wr1].s[2]\n" - "fmla v15.4s, v10.4s, %[wr1].s[2]\n" - - "fmla v16.4s, v10.4s, %[wr2].s[2]\n" - "fmla v17.4s, v11.4s, %[wr2].s[2]\n" - - "fadd v12.4s, v12.4s, v14.4s\n" - "fadd v12.4s, v12.4s, v16.4s\n" - - "fadd v13.4s, v13.4s, v15.4s\n" // out1 - "fadd v13.4s, v13.4s, v17.4s\n" // out2 - - "fadd v12.4s, v12.4s, %[bias].4s\n" // out1 add bias - "fadd v13.4s, v13.4s, %[bias].4s\n" // out2 add bias - - "prfm pldl1keep, [%[out1]]\n" - "prfm pldl1keep, [%[out2]]\n" - - "st1 {v12.4s}, [%[out1]]\n" - "st1 {v13.4s}, 
[%[out2]]\n" - - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [zero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17"); -#else - asm volatile( - "pld [%[din0]]\n" - "pld [%[din1]]\n" - "pld [%[din2]]\n" - "pld [%[din3]]\n" - - "vld1.32 {d12-d13}, [%[din0]]!\n" - "vld1.32 {d14-d15}, [%[din1]]!\n" - "vld1.32 {d16-d17}, [%[din2]]!\n" - "vld1.32 {d18-d19}, [%[din3]]!\n" - - "vbif q6, %q[zero], %q[mask]\n" // d0_1234 - "vbif q7, %q[zero], %q[mask]\n" // d1_1234 - "vbif q8, %q[zero], %q[mask]\n" // d2_1234 - "vbif q9, %q[zero], %q[mask]\n" // d3_1234 - - "vmul.f32 q14, q6, %e[wr0][1]\n" - "vmul.f32 q15, q7, %e[wr0][1]\n" - - "vmla.f32 q14, q7, %e[wr1][1]\n" - "vmla.f32 q15, q8, %e[wr1][1]\n" - - "vmla.f32 q14, q8, %e[wr2][1]\n" - "vmla.f32 q15, q9, %e[wr2][1]\n" - - "vext.32 q10, %q[zero], q6, #3\n" // d0_0123 - "vext.32 q11, %q[zero], q7, #3\n" // d1_0123 - "vext.32 q12, %q[zero], q8, #3\n" // d2_0123 - "vext.32 q13, %q[zero], q9, #3\n" // d3_0123 - - "vmla.f32 q14, q10, %e[wr0][0]\n" - "vmla.f32 q15, q11, %e[wr0][0]\n" - - "vmla.f32 q14, q11, %e[wr1][0]\n" - "vmla.f32 q15, q12, %e[wr1][0]\n" - - "vmla.f32 q14, q12, %e[wr2][0]\n" - "vmla.f32 q15, q13, %e[wr2][0]\n" - - "vext.32 q10, q6, %q[zero], #1\n" // d0_2340 - "vext.32 q11, q7, %q[zero], #1\n" // d1_2340 - "vext.32 q12, q8, %q[zero], #1\n" // d2_2340 - "vext.32 q13, q9, %q[zero], #1\n" // d3_2340 - - "vmla.f32 q14, q10, %f[wr0][0]\n" - "vmla.f32 q15, q11, %f[wr0][0]\n" - - "vmla.f32 q14, q11, %f[wr1][0]\n" - "vmla.f32 q15, q12, %f[wr1][0]\n" - - "vmla.f32 q14, q12, %f[wr2][0]\n" // out1 - "vmla.f32 q15, q13, %f[wr2][0]\n" // out2 - - "vadd.f32 q14, q14, %q[bias]\n" // out1 add bias - "vadd.f32 q15, q15, %q[bias]\n" // out2 add bias - - "pld [%[out1]]\n" - "pld [%[out2]]\n" - - "vst1.32 {d28-d29}, [%[out1]]\n" - "vst1.32 {d30-d31}, [%[out2]]\n" - - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [zero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif // __aarch64__ - for (int w = 0; w < w_out; ++w) { - *doutr0++ = out_buf1[w]; - *doutr1++ = out_buf2[w]; - } - doutr0 = doutr1; - doutr1 += w_out; - hs += 2; - he += 2; - } // end of processing heights - } // end of processing channels - } // end of processing batchs -} -/** - * \brief depthwise convolution kernel 3x3, stride 2, width <= 4 - */ - -void conv_depthwise_3x3s2p1_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - float zeros[8] = {0.0f}; - - uint32x4_t vmask_rp1 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx)); // 0 2 4 6 - uint32x4_t vmask_rp2 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - unsigned int dmask[8]; - 
-/**
- * \brief depthwise convolution kernel 3x3, stride 2, width <= 7
- */
-
-void conv_depthwise_3x3s2p1_bias_s(float* dout,
-                                   const float* din,
-                                   const float* weights,
-                                   const float* bias,
-                                   bool flag_bias,
-                                   const int num,
-                                   const int ch_in,
-                                   const int h_in,
-                                   const int w_in,
-                                   const int h_out,
-                                   const int w_out,
-                                   ARMContext* ctx) {
-  int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
-  int out_pad_idx[4] = {0, 1, 2, 3};
-  float zeros[8] = {0.0f};
-
-  uint32x4_t vmask_rp1 =
-      vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx));  // 0 2 4 6
-  uint32x4_t vmask_rp2 =
-      vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx + 4));  // 1 3 5 7
-
-  int size_in_channel = w_in * h_in;
-  int size_out_channel = w_out * h_out;
-
-  unsigned int dmask[8];
-  vst1q_u32(dmask, vmask_rp1);
-  vst1q_u32(dmask + 4, vmask_rp2);
-
-  for (int n = 0; n < num; ++n) {
-    const float* din_batch = din + n * ch_in * size_in_channel;
-    float* dout_batch = dout + n * ch_in * size_out_channel;
-#pragma omp parallel for
-    for (int i = 0; i < ch_in; ++i) {
-      const float* din_channel = din_batch + i * size_in_channel;
-      float* dout_channel = dout_batch + i * size_out_channel;
-
-      const float* weight_ptr = weights + i * 9;
-      float32x4_t wr0 = vld1q_f32(weight_ptr);
-      float32x4_t wr1 = vld1q_f32(weight_ptr + 3);
-      float32x4_t wr2 = vld1q_f32(weight_ptr + 6);
-
-      float bias_c = 0.f;
-
-      if (flag_bias) {
-        bias_c = bias[i];
-      }
-      float32x4_t vbias = vdupq_n_f32(bias_c);
-      int hs = -1;
-      int he = 2;
-      float out_buf[4];
-      for (int j = 0; j < h_out; ++j) {
-        const float* dr0 = din_channel + hs * w_in;
-        const float* dr1 = dr0 + w_in;
-        const float* dr2 = dr1 + w_in;
-        if (hs == -1) {
-          dr0 = zeros;
-        }
-        if (he > h_in) {
-          dr2 = zeros;
-        }
-        const float* din0_ptr = dr0;
-        const float* din1_ptr = dr1;
-        const float* din2_ptr = dr2;
-
-        unsigned int* mask_ptr = dmask;
-#ifdef __aarch64__
-        asm volatile(
-            // load the mask, then 8 elements (2 vectors) from each of 3 rows
-            "movi v9.4s, #0\n"
-            "ld1 {v6.4s, v7.4s}, [%[mask_ptr]], #32\n"
-
-            "ld2 {v10.4s, v11.4s}, [%[din0_ptr]], #32\n"  // v10={0,2,4,6} v11={1,3,5,7}
-            "ld2 {v12.4s, v13.4s}, [%[din1_ptr]], #32\n"  // v12={0,2,4,6} v13={1,3,5,7}
-            "ld2 {v14.4s, v15.4s}, [%[din2_ptr]], #32\n"  // v14={0,2,4,6} v15={1,3,5,7}
-
-            "bif v10.16b, v9.16b, v6.16b\n"
-            "bif v11.16b, v9.16b, v7.16b\n"
-            "bif v12.16b, v9.16b, v6.16b\n"
-            "bif v13.16b, v9.16b, v7.16b\n"
-            "bif v14.16b, v9.16b, v6.16b\n"
-            "bif v15.16b, v9.16b, v7.16b\n"
-
-            "ext v6.16b, v9.16b, v11.16b, #12\n"  // v6 = {0,1,3,5}
-            "ext v7.16b, v9.16b, v13.16b, #12\n"  // v7 = {0,1,3,5}
-            "ext v8.16b, v9.16b, v15.16b, #12\n"  // v8 = {0,1,3,5}
-
-            "fmul v4.4s, v10.4s, %[wr0].s[1]\n"  // v10 * w01
-            "fmul v5.4s, v11.4s, %[wr0].s[2]\n"  // v11 * w02
-            "fmul v6.4s, v6.4s, %[wr0].s[0]\n"   // v6 * w00
-
-            "fmla v4.4s, v12.4s, %[wr1].s[1]\n"  // v12 * w11
-            "fmla v5.4s, v13.4s, %[wr1].s[2]\n"  // v13 * w12
-            "fmla v6.4s, v7.4s, %[wr1].s[0]\n"   // v7 * w10
-
-            "fmla v4.4s, v14.4s, %[wr2].s[1]\n"  // v14 * w21
-            "fmla v5.4s, v15.4s, %[wr2].s[2]\n"  // v15 * w22
-            "fmla v6.4s, v8.4s, %[wr2].s[0]\n"   // v8 * w20
-
-            "fadd v4.4s, v4.4s, v5.4s\n"
-            "fadd v4.4s, v4.4s, v6.4s\n"
-
-            "fadd v4.4s, v4.4s, %[bias].4s\n"
-
-            "st1 {v4.4s}, [%[out]]\n"
-            : [din0_ptr] "+r"(din0_ptr),
-              [din1_ptr] "+r"(din1_ptr),
-              [din2_ptr] "+r"(din2_ptr),
-              [mask_ptr] "+r"(mask_ptr)
-            : [wr0] "w"(wr0),
-              [wr1] "w"(wr1),
-              [wr2] "w"(wr2),
-              [bias] "w"(vbias),
-              [out] "r"(out_buf)
-            : "cc", "memory",
-              "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13",
-              "v14", "v15");
-
-#else
-        asm volatile(
-            // load the mask, then 8 elements (2 vectors) from each of 3 rows
-            "vmov.u32 q9, #0\n"
-            "vld1.f32 {d12-d15}, [%[mask_ptr]]!  @ load mask\n"
-            "vdup.32 q3, %[bias]                 @ dup bias\n"  // q3 = vbias
-
-            "vld2.32 {d20-d23}, [%[din0_ptr]]!   @ load din r0\n"  // q10={0,2,4,6} q11={1,3,5,7}
-            "vld2.32 {d24-d27}, [%[din1_ptr]]!   @ load din r1\n"  // q12={0,2,4,6} q13={1,3,5,7}
-            "vld2.32 {d28-d31}, [%[din2_ptr]]!   @ load din r2\n"  // q14={0,2,4,6} q15={1,3,5,7}
-
-            "vbif q10, q9, q6  @ bit select, deal with right pad\n"
-            "vbif q11, q9, q7  @ bit select, deal with right pad\n"
-            "vbif q12, q9, q6  @ bit select, deal with right pad\n"
-            "vbif q13, q9, q7  @ bit select, deal with right pad\n"
-            "vbif q14, q9, q6  @ bit select, deal with right pad\n"
-            "vbif q15, q9, q7  @ bit select, deal with right pad\n"
-
-            "vext.32 q6, q9, q11, #3  @ shift left 1\n"  // q6 = {0,1,3,5}
-            "vext.32 q7, q9, q13, #3  @ shift left 1\n"  // q7 = {0,1,3,5}
-            "vext.32 q8, q9, q15, #3  @ shift left 1\n"  // q8 = {0,1,3,5}
-
-            "vmul.f32 q4, q10, %e[wr0][1]  @ mul weight 0, out0\n"  // q10 * w01
-            "vmul.f32 q5, q11, %f[wr0][0]  @ mul weight 0, out0\n"  // q11 * w02
-            "vmla.f32 q3, q6, %e[wr0][0]   @ mul weight 0, out0\n"  // q6 * w00
-
-            "vmla.f32 q4, q12, %e[wr1][1]  @ mul weight 1, out0\n"  // q12 * w11
-            "vmla.f32 q5, q13, %f[wr1][0]  @ mul weight 1, out0\n"  // q13 * w12
-            "vmla.f32 q3, q7, %e[wr1][0]   @ mul weight 1, out0\n"  // q7 * w10
-
-            "vmla.f32 q4, q14, %e[wr2][1]  @ mul weight 2, out0\n"  // q14 * w21
-            "vmla.f32 q5, q15, %f[wr2][0]  @ mul weight 2, out0\n"  // q15 * w22
-            "vmla.f32 q3, q8, %e[wr2][0]   @ mul weight 2, out0\n"  // q8 * w20
-
-            "vadd.f32 q3, q3, q4  @ add\n"
-            "vadd.f32 q3, q3, q5  @ add\n"
-
-            "vst1.32 {d6-d7}, [%[out]]\n"
-            : [din0_ptr] "+r"(din0_ptr),
-              [din1_ptr] "+r"(din1_ptr),
-              [din2_ptr] "+r"(din2_ptr),
-              [mask_ptr] "+r"(mask_ptr)
-            : [wr0] "w"(wr0),
-              [wr1] "w"(wr1),
-              [wr2] "w"(wr2),
-              [bias] "r"(bias_c),
-              [out] "r"(out_buf)
-            : "cc", "memory",
-              "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12",
-              "q13", "q14", "q15");
-#endif  // __aarch64__
-        for (int w = 0; w < w_out; ++w) {
-          *dout_channel++ = out_buf[w];
-        }
-        hs += 2;
-        he += 2;
-      }
-    }
-  }
-}
-/**
- * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias,
- * width <= 4
- */
-void conv_depthwise_3x3s1p1_bias_s_relu(float* dout,
-                                        const float* din,
-                                        const float* weights,
-                                        const float* bias,
-                                        bool flag_bias,
-                                        const int num,
-                                        const int ch_in,
-                                        const int h_in,
-                                        const int w_in,
-                                        const int h_out,
-                                        const int w_out,
-                                        ARMContext* ctx) {
-  //! 3x3s1 convolution, implemented by direct algorithm
-  //! pad is done implicit
-  //!
for 4x6 convolution window - const int right_pad_idx[4] = {3, 2, 1, 0}; - const float zero[4] = {0.f, 0.f, 0.f, 0.f}; - - float32x4_t vzero = vdupq_n_f32(0.f); - uint32x4_t vmask_rp = - vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(4 - w_in)); - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - float* dout_channel = dout_batch + i * size_out_channel; - const float* din_channel = din_batch + i * size_in_channel; - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float32x4_t wbias; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - } else { - wbias = vdupq_n_f32(0.f); - } - - int hs = -1; - int he = 3; - - float out_buf1[4]; - float out_buf2[4]; - float trash_buf[4]; - - int h_cnt = (h_out + 1) >> 1; - float* doutr0 = dout_channel; - float* doutr1 = dout_channel + w_out; - - for (int j = 0; j < h_cnt; ++j) { - const float* dr0 = din_channel + hs * w_in; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - - if (hs == -1) { - dr0 = zero; - } - - switch (he - h_in) { - case 2: - dr2 = zero; - doutr1 = trash_buf; - case 1: - dr3 = zero; - default: - break; - } -#ifdef __aarch64__ - asm volatile( - "prfm pldl1keep, [%[din0]]\n" - "prfm pldl1keep, [%[din1]]\n" - "prfm pldl1keep, [%[din2]]\n" - "prfm pldl1keep, [%[din3]]\n" - - "ld1 {v0.4s}, [%[din0]], #16\n" - "ld1 {v1.4s}, [%[din1]], #16\n" - "ld1 {v2.4s}, [%[din2]], #16\n" - "ld1 {v3.4s}, [%[din3]], #16\n" - - "bif v0.16b, %[zero].16b, %[mask].16b\n" // d0_1234 - "bif v1.16b, %[zero].16b, %[mask].16b\n" // d1_1234 - "bif v2.16b, %[zero].16b, %[mask].16b\n" // d2_1234 - "bif v3.16b, %[zero].16b, %[mask].16b\n" // d3_1234 - - "ext v4.16b, %[zero].16b, v0.16b, #12\n" // d0_0123 - "ext v5.16b, %[zero].16b, v1.16b, #12\n" // d1_0123 - "ext v6.16b, %[zero].16b, v2.16b, #12\n" // d2_0123 - "ext v7.16b, %[zero].16b, v3.16b, #12\n" // d3_0123 - - "ext v8.16b, v0.16b, %[zero].16b, #4\n" // d0_2340 - "ext v9.16b, v1.16b, %[zero].16b, #4\n" // d1_2340 - "ext v10.16b, v2.16b, %[zero].16b, #4\n" // d2_2340 - "ext v11.16b, v3.16b, %[zero].16b, #4\n" // d3_2340 - - "fmul v12.4s, v0.4s, %[wr0].s[1]\n" - "fmul v13.4s, v1.4s, %[wr0].s[1]\n" - - "fmul v14.4s, v1.4s, %[wr1].s[1]\n" - "fmul v15.4s, v2.4s, %[wr1].s[1]\n" - - "fmul v16.4s, v2.4s, %[wr2].s[1]\n" - "fmul v17.4s, v3.4s, %[wr2].s[1]\n" - - "fmla v12.4s, v4.4s, %[wr0].s[0]\n" - "fmla v13.4s, v5.4s, %[wr0].s[0]\n" - - "fmla v14.4s, v5.4s, %[wr1].s[0]\n" - "fmla v15.4s, v6.4s, %[wr1].s[0]\n" - - "fmla v16.4s, v6.4s, %[wr2].s[0]\n" - "fmla v17.4s, v7.4s, %[wr2].s[0]\n" - - "fmla v12.4s, v8.4s, %[wr0].s[2]\n" - "fmla v13.4s, v9.4s, %[wr0].s[2]\n" - - "fmla v14.4s, v9.4s, %[wr1].s[2]\n" - "fmla v15.4s, v10.4s, %[wr1].s[2]\n" - - "fmla v16.4s, v10.4s, %[wr2].s[2]\n" - "fmla v17.4s, v11.4s, %[wr2].s[2]\n" - - "fadd v12.4s, v12.4s, v14.4s\n" - "fadd v12.4s, v12.4s, v16.4s\n" - - "fadd v13.4s, v13.4s, v15.4s\n" // out1 - "fadd v13.4s, v13.4s, v17.4s\n" // out2 - - "fadd v12.4s, v12.4s, %[bias].4s\n" // out1 add bias - "fadd v13.4s, v13.4s, %[bias].4s\n" // out2 add bias - - "prfm pldl1keep, [%[out1]]\n" - "prfm pldl1keep, [%[out2]]\n" - - "fmax v12.4s, v12.4s, %[zero].4s\n" // out1 -> relu - 
"fmax v13.4s, v13.4s, %[zero].4s\n" // out2 -> relu - - "st1 {v12.4s}, [%[out1]]\n" - "st1 {v13.4s}, [%[out2]]\n" - - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [zero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17"); -#else - asm volatile( - "pld [%[din0]]\n" - "pld [%[din1]]\n" - "pld [%[din2]]\n" - "pld [%[din3]]\n" - - "vld1.32 {d12-d13}, [%[din0]]!\n" - "vld1.32 {d14-d15}, [%[din1]]!\n" - "vld1.32 {d16-d17}, [%[din2]]!\n" - "vld1.32 {d18-d19}, [%[din3]]!\n" - - "vbif q6, %q[zero], %q[mask]\n" // d0_1234 - "vbif q7, %q[zero], %q[mask]\n" // d1_1234 - "vbif q8, %q[zero], %q[mask]\n" // d2_1234 - "vbif q9, %q[zero], %q[mask]\n" // d3_1234 - - "vmul.f32 q14, q6, %e[wr0][1]\n" - "vmul.f32 q15, q7, %e[wr0][1]\n" - - "vmla.f32 q14, q7, %e[wr1][1]\n" - "vmla.f32 q15, q8, %e[wr1][1]\n" - - "vmla.f32 q14, q8, %e[wr2][1]\n" - "vmla.f32 q15, q9, %e[wr2][1]\n" - - "vext.32 q10, %q[zero], q6, #3\n" // d0_0123 - "vext.32 q11, %q[zero], q7, #3\n" // d1_0123 - "vext.32 q12, %q[zero], q8, #3\n" // d2_0123 - "vext.32 q13, %q[zero], q9, #3\n" // d3_0123 - - "vmla.f32 q14, q10, %e[wr0][0]\n" - "vmla.f32 q15, q11, %e[wr0][0]\n" - - "vmla.f32 q14, q11, %e[wr1][0]\n" - "vmla.f32 q15, q12, %e[wr1][0]\n" - - "vmla.f32 q14, q12, %e[wr2][0]\n" - "vmla.f32 q15, q13, %e[wr2][0]\n" - - "vext.32 q10, q6, %q[zero], #1\n" // d0_2340 - "vext.32 q11, q7, %q[zero], #1\n" // d1_2340 - "vext.32 q12, q8, %q[zero], #1\n" // d2_2340 - "vext.32 q13, q9, %q[zero], #1\n" // d3_2340 - - "vmla.f32 q14, q10, %f[wr0][0]\n" - "vmla.f32 q15, q11, %f[wr0][0]\n" - - "vmla.f32 q14, q11, %f[wr1][0]\n" - "vmla.f32 q15, q12, %f[wr1][0]\n" - - "vmla.f32 q14, q12, %f[wr2][0]\n" // out1 - "vmla.f32 q15, q13, %f[wr2][0]\n" // out2 - - "vadd.f32 q14, q14, %q[bias]\n" // out1 add bias - "vadd.f32 q15, q15, %q[bias]\n" // out2 add bias - - "pld [%[out1]]\n" - "pld [%[out2]]\n" - - "vmax.f32 q14, q14, %q[zero]\n" // out1 -> relu - "vmax.f32 q15, q15, %q[zero]\n" // out2 -> relu - - "vst1.32 {d28-d29}, [%[out1]]\n" - "vst1.32 {d30-d31}, [%[out2]]\n" - - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [zero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif // __aarch64__ - for (int w = 0; w < w_out; ++w) { - *doutr0++ = out_buf1[w]; - *doutr1++ = out_buf2[w]; - } - doutr0 = doutr1; - doutr1 += w_out; - hs += 2; - he += 2; - } // end of processing heights - } // end of processing channels - } // end of processing batchs -} - -/** - * \brief depthwise convolution kernel 3x3, stride 2, width <= 7 - */ -void conv_depthwise_3x3s2p1_bias_s_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - float zeros[8] = {0.0f}; - - uint32x4_t vmask_rp1 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx)); // 0 2 4 6 - 
uint32x4_t vmask_rp2 =
-      vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx + 4));  // 1 3 5 7
-
-  int size_in_channel = w_in * h_in;
-  int size_out_channel = w_out * h_out;
-
-  unsigned int dmask[8];
-  vst1q_u32(dmask, vmask_rp1);
-  vst1q_u32(dmask + 4, vmask_rp2);
-
-  for (int n = 0; n < num; ++n) {
-    const float* din_batch = din + n * ch_in * size_in_channel;
-    float* dout_batch = dout + n * ch_in * size_out_channel;
-#pragma omp parallel for
-    for (int i = 0; i < ch_in; ++i) {
-      const float* din_channel = din_batch + i * size_in_channel;
-      float* dout_channel = dout_batch + i * size_out_channel;
-
-      const float* weight_ptr = weights + i * 9;
-      float32x4_t wr0 = vld1q_f32(weight_ptr);
-      float32x4_t wr1 = vld1q_f32(weight_ptr + 3);
-      float32x4_t wr2 = vld1q_f32(weight_ptr + 6);
-
-      float bias_c = 0.f;
-
-      if (flag_bias) {
-        bias_c = bias[i];
-      }
-      float32x4_t vbias = vdupq_n_f32(bias_c);
-      int hs = -1;
-      int he = 2;
-      float out_buf[4];
-      for (int j = 0; j < h_out; ++j) {
-        const float* dr0 = din_channel + hs * w_in;
-        const float* dr1 = dr0 + w_in;
-        const float* dr2 = dr1 + w_in;
-        if (hs == -1) {
-          dr0 = zeros;
-        }
-        if (he > h_in) {
-          dr2 = zeros;
-        }
-        const float* din0_ptr = dr0;
-        const float* din1_ptr = dr1;
-        const float* din2_ptr = dr2;
-
-        unsigned int* mask_ptr = dmask;
-#ifdef __aarch64__
-        asm volatile(
-            // load the mask, then 8 elements (2 vectors) from each of 3 rows
-            "movi v9.4s, #0\n"
-            "ld1 {v6.4s, v7.4s}, [%[mask_ptr]], #32\n"
-
-            "ld2 {v10.4s, v11.4s}, [%[din0_ptr]], #32\n"  // v10={0,2,4,6} v11={1,3,5,7}
-            "ld2 {v12.4s, v13.4s}, [%[din1_ptr]], #32\n"  // v12={0,2,4,6} v13={1,3,5,7}
-            "ld2 {v14.4s, v15.4s}, [%[din2_ptr]], #32\n"  // v14={0,2,4,6} v15={1,3,5,7}
-
-            "bif v10.16b, v9.16b, v6.16b\n"
-            "bif v11.16b, v9.16b, v7.16b\n"
-            "bif v12.16b, v9.16b, v6.16b\n"
-            "bif v13.16b, v9.16b, v7.16b\n"
-            "bif v14.16b, v9.16b, v6.16b\n"
-            "bif v15.16b, v9.16b, v7.16b\n"
-
-            "ext v6.16b, v9.16b, v11.16b, #12\n"  // v6 = {0,1,3,5}
-            "ext v7.16b, v9.16b, v13.16b, #12\n"  // v7 = {0,1,3,5}
-            "ext v8.16b, v9.16b, v15.16b, #12\n"  // v8 = {0,1,3,5}
-
-            "fmul v4.4s, v10.4s, %[wr0].s[1]\n"  // v10 * w01
-            "fmul v5.4s, v11.4s, %[wr0].s[2]\n"  // v11 * w02
-            "fmul v6.4s, v6.4s, %[wr0].s[0]\n"   // v6 * w00
-
-            "fmla v4.4s, v12.4s, %[wr1].s[1]\n"  // v12 * w11
-            "fmla v5.4s, v13.4s, %[wr1].s[2]\n"  // v13 * w12
-            "fmla v6.4s, v7.4s, %[wr1].s[0]\n"   // v7 * w10
-
-            "fmla v4.4s, v14.4s, %[wr2].s[1]\n"  // v14 * w21
-            "fmla v5.4s, v15.4s, %[wr2].s[2]\n"  // v15 * w22
-            "fmla v6.4s, v8.4s, %[wr2].s[0]\n"   // v8 * w20
-
-            "fadd v4.4s, v4.4s, v5.4s\n"
-            "fadd v4.4s, v4.4s, v6.4s\n"
-
-            "fadd v4.4s, v4.4s, %[bias].4s\n"  // out add bias
-            "fmax v4.4s, v4.4s, v9.4s\n"
-
-            "st1 {v4.4s}, [%[out]]\n"
-            : [din0_ptr] "+r"(din0_ptr),
-              [din1_ptr] "+r"(din1_ptr),
-              [din2_ptr] "+r"(din2_ptr),
-              [mask_ptr] "+r"(mask_ptr)
-            : [wr0] "w"(wr0),
-              [wr1] "w"(wr1),
-              [wr2] "w"(wr2),
-              [bias] "w"(vbias),
-              [out] "r"(out_buf)
-            : "cc", "memory",
-              "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13",
-              "v14", "v15");
-
-#else
-        asm volatile(
-            // load the mask, then 8 elements (2 vectors) from each of 3 rows
-            "vmov.u32 q9, #0\n"
-            "vld1.f32 {d12-d15}, [%[mask_ptr]]!  @ load mask\n"
-            "vdup.32 q3, %[bias]                 @ dup bias\n"  // q3 = vbias
-
-            "vld2.32 {d20-d23}, [%[din0_ptr]]!   @ load din r0\n"  // q10={0,2,4,6} q11={1,3,5,7}
-            "vld2.32 {d24-d27}, [%[din1_ptr]]!   @ load din r1\n"  // q12={0,2,4,6} q13={1,3,5,7}
-            "vld2.32 {d28-d31}, [%[din2_ptr]]!   @ load din r2\n"  // q14={0,2,4,6} q15={1,3,5,7}
-
-            "vbif q10, q9, q6  @ bit select, deal with right pad\n"
-            "vbif q11, q9, q7  @ bit select, deal with right pad\n"
-            "vbif q12, q9, q6  @ bit select, deal with right pad\n"
-            "vbif q13, q9, q7  @ bit select, deal with right pad\n"
-            "vbif q14, q9, q6  @ bit select, deal with right pad\n"
-            "vbif q15, q9, q7  @ bit select, deal with right pad\n"
-
-            "vext.32 q6, q9, q11, #3  @ shift left 1\n"  // q6 = {0,1,3,5}
-            "vext.32 q7, q9, q13, #3  @ shift left 1\n"  // q7 = {0,1,3,5}
-            "vext.32 q8, q9, q15, #3  @ shift left 1\n"  // q8 = {0,1,3,5}
-
-            "vmul.f32 q4, q10, %e[wr0][1]  @ mul weight 0, out0\n"  // q10 * w01
-            "vmul.f32 q5, q11, %f[wr0][0]  @ mul weight 0, out0\n"  // q11 * w02
-            "vmla.f32 q3, q6, %e[wr0][0]   @ mul weight 0, out0\n"  // q6 * w00
-
-            "vmla.f32 q4, q12, %e[wr1][1]  @ mul weight 1, out0\n"  // q12 * w11
-            "vmla.f32 q5, q13, %f[wr1][0]  @ mul weight 1, out0\n"  // q13 * w12
-            "vmla.f32 q3, q7, %e[wr1][0]   @ mul weight 1, out0\n"  // q7 * w10
-
-            "vmla.f32 q4, q14, %e[wr2][1]  @ mul weight 2, out0\n"  // q14 * w21
-            "vmla.f32 q5, q15, %f[wr2][0]  @ mul weight 2, out0\n"  // q15 * w22
-            "vmla.f32 q3, q8, %e[wr2][0]   @ mul weight 2, out0\n"  // q8 * w20
-
-            "vadd.f32 q3, q3, q4  @ add\n"
-            "vadd.f32 q3, q3, q5  @ add\n"
-
-            "vmax.f32 q3, q3, q9  @ relu\n"
-
-            "vst1.32 {d6-d7}, [%[out]]\n"
-            : [din0_ptr] "+r"(din0_ptr),
-              [din1_ptr] "+r"(din1_ptr),
-              [din2_ptr] "+r"(din2_ptr),
-              [mask_ptr] "+r"(mask_ptr)
-            : [wr0] "w"(wr0),
-              [wr1] "w"(wr1),
-              [wr2] "w"(wr2),
-              [bias] "r"(bias_c),
-              [out] "r"(out_buf)
-            : "cc", "memory",
-              "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12",
-              "q13", "q14", "q15");
-#endif  // __aarch64__
-        for (int w = 0; w < w_out; ++w) {
-          *dout_channel++ = out_buf[w];
-        }
-        hs += 2;
-        he += 2;
-      }
-    }
-  }
-}
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/conv_depthwise_5x5s1.cc b/lite/backends/arm/math/conv_depthwise_5x5s1.cc
deleted file mode 100644
index 2b9744665c..0000000000
--- a/lite/backends/arm/math/conv_depthwise_5x5s1.cc
+++ /dev/null
@@ -1,9615 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/backends/arm/math/conv_depthwise.h"
-#include <arm_neon.h>
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-//! weights layout
-//!        *-----------------------*-----*
-//! w0 <-- | W0   W1   W2   W3     | W4  |
-//!        *-----------------------*     |
-//! w1 <-- | W5   W6   W7   W8     | W9  |
-//!        *-----------------------*     | --> w5
-//! w2 <-- | W10  W11  W12  W13    | W14 |
-//!        *-----------------------*     |
-//! w3 <-- | W15  W16  W17  W18    | W19 |
-//!        *-----------------------*-----*
-//! w4 <-- | W20  W21  W22  W23    | W24 | --> w6[0]
-//!        *-----------------------*-----*
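Concretely, this packing turns one 5x5 output into five 4-wide multiply-accumulates (rows, columns 0-3), one vector for the fifth column (rows 0-3), and a lone corner tap W24, finished by a pairwise horizontal add, which is what the faddp chains in the kernels below compute. A sketch of that reduction for a single output pixel (conv5x5_one_out is an illustrative helper, assuming <arm_neon.h>; the real kernels batch four output rows per asm block):

#include <arm_neon.h>

static inline float conv5x5_one_out(const float* r0, const float* r1,
                                    const float* r2, const float* r3,
                                    const float* r4,
                                    float32x4_t w0, float32x4_t w1,
                                    float32x4_t w2, float32x4_t w3,
                                    float32x4_t w4, float32x4_t w5,
                                    float w24, float bias) {
  // rows x columns 0-3: one fused multiply-accumulate per filter row
  float32x4_t acc = vmulq_f32(w0, vld1q_f32(r0));
  acc = vmlaq_f32(acc, w1, vld1q_f32(r1));
  acc = vmlaq_f32(acc, w2, vld1q_f32(r2));
  acc = vmlaq_f32(acc, w3, vld1q_f32(r3));
  acc = vmlaq_f32(acc, w4, vld1q_f32(r4));
  // column 4, rows 0-3: gather the fifth input column, as the asm does with
  // single-lane ld1 loads
  const float c4[4] = {r0[4], r1[4], r2[4], r3[4]};
  acc = vmlaq_f32(acc, w5, vld1q_f32(c4));
  // horizontal sum (the faddp chain), then the corner tap and the bias
  float32x2_t s = vadd_f32(vget_low_f32(acc), vget_high_f32(acc));
  float sum = vget_lane_f32(vpadd_f32(s, s), 0);
  return sum + r4[4] * w24 + bias;
}

The relu variants below differ only in a final fmax against a zeroed register before the store.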
-
-void conv_depthwise_5x5s1_impl(const float* din,
-                               float* dout,
-                               int num,
-                               int ch_out,
-                               int h_out,
-                               int w_out,
-                               int ch_in,
-                               int h_in,
-                               int w_in,
-                               const float* weights,
-                               const float* bias,
-                               int pad,
-                               bool flag_bias,
-                               bool flag_relu,
-                               ARMContext* ctx);
-
-void conv_depthwise_5x5s1_small_impl(const float* din,
-                                     float* dout,
-                                     int num,
-                                     int ch_out,
-                                     int h_out,
-                                     int w_out,
-                                     int ch_in,
-                                     int h_in,
-                                     int w_in,
-                                     const float* weights,
-                                     const float* bias,
-                                     int pad,
-                                     bool flag_bias,
-                                     bool flag_relu,
-                                     ARMContext* ctx);
-
-void conv_depthwise_5x5s1_relu_impl(const float* din,
-                                    float* dout,
-                                    int num,
-                                    int ch_out,
-                                    int h_out,
-                                    int w_out,
-                                    int ch_in,
-                                    int h_in,
-                                    int w_in,
-                                    const float* weights,
-                                    const float* bias,
-                                    int pad,
-                                    bool flag_bias,
-                                    bool flag_relu,
-                                    ARMContext* ctx);
-
-void conv_depthwise_5x5s1_small_relu_impl(const float* din,
-                                          float* dout,
-                                          int num,
-                                          int ch_out,
-                                          int h_out,
-                                          int w_out,
-                                          int ch_in,
-                                          int h_in,
-                                          int w_in,
-                                          const float* weights,
-                                          const float* bias,
-                                          int pad,
-                                          bool flag_bias,
-                                          bool flag_relu,
-                                          ARMContext* ctx);
-
-static float* prepad_input(
-    const float* input, int num, int ch_in, int h_in, int w_in, int pad) {
-  int h_new = h_in + 2 * pad;
-  int w_new = w_in + 2 * pad;
-  float* new_input =
-      static_cast<float*>(malloc(h_new * w_new * ch_in * num * sizeof(float)));
-  float* new_input_ptr = new_input;
-  for (int c = 0; c < num * ch_in; ++c) {
-    memset(new_input_ptr, 0x00, w_new * pad * sizeof(float));
-    new_input_ptr += w_new * pad;
-    for (int i = 0; i < h_in; ++i) {
-      memset(new_input_ptr, 0x00, pad * sizeof(float));
-      new_input_ptr += pad;
-      memcpy(new_input_ptr, input, w_in * sizeof(float));
-      new_input_ptr += w_in;
-      input += w_in;
-      memset(new_input_ptr, 0x00, pad * sizeof(float));
-      new_input_ptr += pad;
-    }
-    memset(new_input_ptr, 0x00, w_new * pad * sizeof(float));
-    new_input_ptr += w_new * pad;
-  }
-  return new_input;
-}
-
-#ifdef __aarch64__
-
-//! kernel for one out without extracting data mid
-//! deal with four lines out
-void compute_one_out_without_extract(const float* din0,
-                                     const float* din1,
-                                     const float* din2,
-                                     const float* din3,
-                                     const float* din4,
-                                     const float* din5,
-                                     const float* din6,
-                                     const float* din7,
-                                     float* dout0,
-                                     float* dout1,
-                                     float* dout2,
-                                     float* dout3,
-                                     float32x4_t w0,
-                                     float32x4_t w1,
-                                     float32x4_t w2,
-                                     float32x4_t w3,
-                                     float32x4_t w4,
-                                     float32x4_t w5,
-                                     float32x4_t w6,
-                                     const float* bias) {
-  //! din0 - din7: 0-4 v8-v15
-  //! din0 - din7: 5 v20, v21
-  //!
dout0 - dout3: v16-v19 - asm volatile( - "ld1 {v8.4s}, [%[din0]], #16 \n" - "ld1 {v9.4s}, [%[din1]], #16 \n" - "ld1 {v10.4s}, [%[din2]], #16 \n" - "ld1 {v11.4s}, [%[din3]], #16 \n" - "ld1 {v12.4s}, [%[din4]], #16 \n" - "ld1 {v13.4s}, [%[din5]], #16 \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]], #16 \n" - "ld1 {v15.4s}, [%[din7]], #16 \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - "ld1 {v20.s}[0], [%[din0]] \n" - "ld1 {v21.s}[0], [%[din4]] \n" - "ld1 {v20.s}[1], [%[din1]] \n" - "ld1 {v21.s}[1], [%[din5]] \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - "ld1 {v20.s}[2], [%[din2]] \n" - "ld1 {v21.s}[2], [%[din6]] \n" - "ld1 {v20.s}[3], [%[din3]] \n" - "ld1 {v21.s}[3], [%[din7]] \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // ext - "ext v22.16b, v20.16b, v21.16b, #4 \n" // 1 2 3 4 - "ext v23.16b, v20.16b, v21.16b, #8 \n" // 2 3 4 5 - "ext v24.16b, v20.16b, v21.16b, #12 \n" // 3 4 5 6 - - // in col5 - "fmla v16.4s, %[w5].4s, v20.4s \n" - "fmla v17.4s, %[w5].4s, v22.4s \n" - "fmla v18.4s, %[w5].4s, v23.4s \n" - "fmla v19.4s, %[w5].4s, v24.4s \n" - - "ld1 {v31.4s}, [%[bias]] \n" - - // add to out register v25 - "faddp v25.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v25.4s, v25.4s, v26.4s \n" - - // in[24] * w6[0] - "fmla v25.4s, v21.4s, %[w6].s[0]\n" - "fadd v25.4s, v25.4s, v31.4s \n" - - // write output - "st1 {v25.s}[0], [%[dout0]] \n" - "st1 {v25.s}[1], [%[dout1]] \n" - "st1 {v25.s}[2], [%[dout2]] \n" - "st1 {v25.s}[3], [%[dout3]] \n" - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7) - : [dout0] "r"(dout0), - [dout1] "r"(dout1), - [dout2] "r"(dout2), - [dout3] "r"(dout3), - [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [w5] "w"(w5), - [w6] "w"(w6), - [bias] "r"(bias) - : "memory", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "v26", - "v31"); -} - -//! kernel for one out without extracting data mid -//! deal with four lines out -void compute_one_out_without_extract_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - float32x4_t w0, - float32x4_t w1, - float32x4_t w2, - float32x4_t w3, - float32x4_t w4, - float32x4_t w5, - float32x4_t w6, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! din0 - din7: 5 v20, v21 - //! 
dout0 - dout3: v16-v19 - asm volatile( - "ld1 {v8.4s}, [%[din0]], #16 \n" - "ld1 {v9.4s}, [%[din1]], #16 \n" - "ld1 {v10.4s}, [%[din2]], #16 \n" - "ld1 {v11.4s}, [%[din3]], #16 \n" - "ld1 {v12.4s}, [%[din4]], #16 \n" - "ld1 {v13.4s}, [%[din5]], #16 \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]], #16 \n" - "ld1 {v15.4s}, [%[din7]], #16 \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - "ld1 {v20.s}[0], [%[din0]] \n" - "ld1 {v21.s}[0], [%[din4]] \n" - "ld1 {v20.s}[1], [%[din1]] \n" - "ld1 {v21.s}[1], [%[din5]] \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - "ld1 {v20.s}[2], [%[din2]] \n" - "ld1 {v21.s}[2], [%[din6]] \n" - "ld1 {v20.s}[3], [%[din3]] \n" - "ld1 {v21.s}[3], [%[din7]] \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // ext - "ext v22.16b, v20.16b, v21.16b, #4 \n" // 1 2 3 4 - "ext v23.16b, v20.16b, v21.16b, #8 \n" // 2 3 4 5 - "ext v24.16b, v20.16b, v21.16b, #12 \n" // 3 4 5 6 - - // in col5 - "fmla v16.4s, %[w5].4s, v20.4s \n" - "fmla v17.4s, %[w5].4s, v22.4s \n" - "fmla v18.4s, %[w5].4s, v23.4s \n" - "fmla v19.4s, %[w5].4s, v24.4s \n" - - "ld1 {v31.4s}, [%[bias]] \n" - "movi v30.4s, #0 \n" - - // add to out register v25 - "faddp v25.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v25.4s, v25.4s, v26.4s \n" - - // in[24] * w6[0] - "fmla v25.4s, v21.4s, %[w6].s[0] \n" - "fadd v25.4s, v25.4s, v31.4s \n" - "fmax v25.4s, v25.4s, v30.4s \n" - - // write output - "st1 {v25.s}[0], [%[dout0]] \n" - "st1 {v25.s}[1], [%[dout1]] \n" - "st1 {v25.s}[2], [%[dout2]] \n" - "st1 {v25.s}[3], [%[dout3]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7) - : [dout0] "r"(dout0), - [dout1] "r"(dout1), - [dout2] "r"(dout2), - [dout3] "r"(dout3), - [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [w5] "w"(w5), - [w6] "w"(w6), - [bias] "r"(bias) - : "memory", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "v26", - "v30", - "v31"); -} - -//! kernel for one out with extracting data pre -//! deal with four lines out -//! need extra load weights -void compute_one_out_extract_pre(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - const float* weights, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! dout0 - dout3: v16-v19 - //! 
weights: v0-v4 - asm volatile( - // load weights - "add %[wh], %[wh], #4 \n" - "ldr q0, [%[wh]], #20 \n" - "ldr q1, [%[wh]], #20 \n" - "ldr q2, [%[wh]], #20 \n" - "ldr q3, [%[wh]], #20 \n" - "ldr q4, [%[wh]], #20 \n" - - "ld1 {v31.4s}, [%[bias]] \n" - "ld1 {v8.4s}, [%[din0]], #16 \n" - "ld1 {v9.4s}, [%[din1]], #16 \n" - "ld1 {v10.4s}, [%[din2]], #16 \n" - "ld1 {v11.4s}, [%[din3]], #16 \n" - "ld1 {v12.4s}, [%[din4]], #16 \n" - "ld1 {v13.4s}, [%[din5]], #16 \n" - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - "fmul v19.4s, v0.4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]], #16 \n" - "ld1 {v15.4s}, [%[din7]], #16 \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, v3.4s, v14.4s \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v25 - "faddp v25.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v25.4s, v25.4s, v26.4s \n" - "fadd v25.4s, v25.4s, v31.4s \n" - - // write output - "st1 {v25.s}[0], [%[dout0]] \n" - "st1 {v25.s}[1], [%[dout1]] \n" - "st1 {v25.s}[2], [%[dout2]] \n" - "st1 {v25.s}[3], [%[dout3]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7), - [wh] "+r"(weights) - : [dout0] "r"(dout0), - [dout1] "r"(dout1), - [dout2] "r"(dout2), - [dout3] "r"(dout3), - [bias] "r"(bias) - : "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v25", - "v26", - "v31"); -} - -//! kernel for one out with extracting data pre -//! deal with four lines out -//! need extra load weights -void compute_one_out_extract_pre_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - const float* weights, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! dout0 - dout3: v16-v19 - //! 
weights: v0-v4 - asm volatile( - // load weights - "add %[wh], %[wh], #4 \n" - "ldr q0, [%[wh]], #20 \n" - "ldr q1, [%[wh]], #20 \n" - "ldr q2, [%[wh]], #20 \n" - "ldr q3, [%[wh]], #20 \n" - "ldr q4, [%[wh]], #20 \n" - - "ld1 {v8.4s}, [%[din0]], #16 \n" - "ld1 {v9.4s}, [%[din1]], #16 \n" - "ld1 {v10.4s}, [%[din2]], #16 \n" - "ld1 {v11.4s}, [%[din3]], #16 \n" - "ld1 {v12.4s}, [%[din4]], #16 \n" - "ld1 {v13.4s}, [%[din5]], #16 \n" - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - "fmul v19.4s, v0.4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]], #16 \n" - "ld1 {v15.4s}, [%[din7]], #16 \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, v3.4s, v14.4s \n" - - "ld1 {v31.4s}, [%[bias]] \n" - "movi v30.4s, #0 \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v25 - "faddp v25.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v25.4s, v25.4s, v26.4s \n" - "fadd v25.4s, v25.4s, v31.4s \n" - "fmax v25.4s, v25.4s, v30.4s \n" - - // write output - "st1 {v25.s}[0], [%[dout0]] \n" - "st1 {v25.s}[1], [%[dout1]] \n" - "st1 {v25.s}[2], [%[dout2]] \n" - "st1 {v25.s}[3], [%[dout3]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7), - [wh] "+r"(weights) - : [dout0] "r"(dout0), - [dout1] "r"(dout1), - [dout2] "r"(dout2), - [dout3] "r"(dout3), - [bias] "r"(bias) - : "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v25", - "v26", - "v30", - "v31"); -} - -//! kernel for one out with extracting data post -//! deal with four lines out -void compute_one_out_extract_post(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - float32x4_t w0, - float32x4_t w1, - float32x4_t w2, - float32x4_t w3, - float32x4_t w4, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! 
dout0 - dout3: v16-v19 - asm volatile( - "ld1 {v31.4s}, [%[bias]] \n" - "ld1 {v8.4s}, [%[din0]], #16 \n" - "ld1 {v9.4s}, [%[din1]], #16 \n" - "ld1 {v10.4s}, [%[din2]], #16 \n" - "ld1 {v11.4s}, [%[din3]], #16 \n" - "ld1 {v12.4s}, [%[din4]], #16 \n" - "ld1 {v13.4s}, [%[din5]], #16 \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]], #16 \n" - "ld1 {v15.4s}, [%[din7]], #16 \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v25 - "faddp v25.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v25.4s, v25.4s, v26.4s \n" - "fadd v25.4s, v25.4s, v31.4s \n" - - // write output - "st1 {v25.s}[0], [%[dout0]] \n" - "st1 {v25.s}[1], [%[dout1]] \n" - "st1 {v25.s}[2], [%[dout2]] \n" - "st1 {v25.s}[3], [%[dout3]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7) - : [dout0] "r"(dout0), - [dout1] "r"(dout1), - [dout2] "r"(dout2), - [dout3] "r"(dout3), - [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [bias] "r"(bias) - : "memory", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v25", - "v26", - "v31"); -} - -//! kernel for one out with extracting data post -//! deal with four lines out -void compute_one_out_extract_post_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - float32x4_t w0, - float32x4_t w1, - float32x4_t w2, - float32x4_t w3, - float32x4_t w4, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! 
dout0 - dout3: v16-v19 - asm volatile( - "ld1 {v8.4s}, [%[din0]], #16 \n" - "ld1 {v9.4s}, [%[din1]], #16 \n" - "ld1 {v10.4s}, [%[din2]], #16 \n" - "ld1 {v11.4s}, [%[din3]], #16 \n" - "ld1 {v12.4s}, [%[din4]], #16 \n" - "ld1 {v13.4s}, [%[din5]], #16 \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]], #16 \n" - "ld1 {v15.4s}, [%[din7]], #16 \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - "ld1 {v31.4s}, [%[bias]] \n" - "movi v30.4s, #0 \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v25 - "faddp v25.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v25.4s, v25.4s, v26.4s \n" - "fadd v25.4s, v25.4s, v31.4s \n" - "fmax v25.4s, v25.4s, v30.4s \n" - - // write output - "st1 {v25.s}[0], [%[dout0]] \n" - "st1 {v25.s}[1], [%[dout1]] \n" - "st1 {v25.s}[2], [%[dout2]] \n" - "st1 {v25.s}[3], [%[dout3]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7) - : [dout0] "r"(dout0), - [dout1] "r"(dout1), - [dout2] "r"(dout2), - [dout3] "r"(dout3), - [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [bias] "r"(bias) - : "memory", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v25", - "v26", - "v30", - "v31"); -} - -//! kernel for two out with extracting data pre -//! deal with four lines out -//! need extra load weights -void compute_two_out_extract_pre(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - const float* weights, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! dout0 - dout3: v16-v19 - //! 
weights: v0-v4 - asm volatile( - // load weights - "movi v31.4s, #0 \n" - "add %[wh], %[wh], #4 \n" - "ldr q0, [%[wh]], #20 \n" // 1, 2, 3, 4 - "ldr q1, [%[wh]], #20 \n" // 6, 7, 8, 9 - "ldr q2, [%[wh]], #20 \n" // 11, 12, 13, 14 - "ldr q3, [%[wh]], #20 \n" // 16, 17, 18, 19 - "ldr q4, [%[wh]], #20 \n" // 21, 22, 23, 24 - - // load inputs - "ld1 {v20.4s}, [%[bias]] \n" - "ld1 {v8.4s}, [%[din0]], #16 \n" - "ld1 {v9.4s}, [%[din1]], #16 \n" - "ld1 {v10.4s}, [%[din2]], #16 \n" - "ld1 {v11.4s}, [%[din3]], #16 \n" - "ld1 {v12.4s}, [%[din4]], #16 \n" - "ld1 {v13.4s}, [%[din5]], #16 \n" - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - "fmul v19.4s, v0.4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]], #16 \n" - "ld1 {v15.4s}, [%[din7]], #16 \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, v3.4s, v14.4s \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v5 - "faddp v5.4s, v16.4s, v17.4s \n" - "faddp v6.4s, v18.4s, v19.4s \n" - "faddp v5.4s, v5.4s, v6.4s \n" - - // ext weights - "ext v0.16b, v0.16b, v31.16b, #4 \n" // 2, 3, 4 - "ext v1.16b, v1.16b, v31.16b, #4 \n" // 7, 8, 9 - "ext v2.16b, v2.16b, v31.16b, #4 \n" // 12, 13, 14 - "ext v3.16b, v3.16b, v31.16b, #4 \n" // 17, 18, 19 - "ext v4.16b, v4.16b, v31.16b, #4 \n" // 22, 23, 24 - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - "fmul v19.4s, v0.4s, v11.4s \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, v3.4s, v14.4s \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v7 - "faddp v7.4s, v16.4s, v17.4s \n" - "faddp v8.4s, v18.4s, v19.4s \n" - "faddp v7.4s, v7.4s, v8.4s \n" - - // zip - "zip1 v6.4s, v7.4s, v5.4s \n" - "zip2 v8.4s, v7.4s, v5.4s \n" - "fadd v6.4s, v6.4s, v20.4s \n" - "fadd v8.4s, v8.4s, v20.4s \n" - "ext v7.16b, v6.16b, v31.16b, #8 \n" - "ext v9.16b, v8.16b, v31.16b, #8 \n" - - // write output - "str d6, [%[dout0]] \n" - "str d7, [%[dout1]] \n" - "str d8, [%[dout2]] \n" - "str d9, [%[dout3]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7), - [wh] "+r"(weights) - : [dout0] "r"(dout0), - [dout1] "r"(dout1), - [dout2] "r"(dout2), - [dout3] "r"(dout3), - [bias] "r"(bias) - : "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - 
"v31"); -} - -//! kernel for two out with extracting data pre -//! deal with four lines out -//! need extra load weights -void compute_two_out_extract_pre_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - const float* weights, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! dout0 - dout3: v16-v19 - //! weights: v0-v4 - asm volatile( - // load weights - "movi v31.4s, #0 \n" - "add %[wh], %[wh], #4 \n" - "ldr q0, [%[wh]], #20 \n" // 1, 2, 3, 4 - "ldr q1, [%[wh]], #20 \n" // 6, 7, 8, 9 - "ldr q2, [%[wh]], #20 \n" // 11, 12, 13, 14 - "ldr q3, [%[wh]], #20 \n" // 16, 17, 18, 19 - "ldr q4, [%[wh]], #20 \n" // 21, 22, 23, 24 - - // load inputs - "ld1 {v20.4s}, [%[bias]] \n" - "ld1 {v8.4s}, [%[din0]], #16 \n" - "ld1 {v9.4s}, [%[din1]], #16 \n" - "ld1 {v10.4s}, [%[din2]], #16 \n" - "ld1 {v11.4s}, [%[din3]], #16 \n" - "ld1 {v12.4s}, [%[din4]], #16 \n" - "ld1 {v13.4s}, [%[din5]], #16 \n" - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - "fmul v19.4s, v0.4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]], #16 \n" - "ld1 {v15.4s}, [%[din7]], #16 \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, v3.4s, v14.4s \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v5 - "faddp v5.4s, v16.4s, v17.4s \n" - "faddp v6.4s, v18.4s, v19.4s \n" - "faddp v5.4s, v5.4s, v6.4s \n" - - // ext weights - "ext v0.16b, v0.16b, v31.16b, #4 \n" // 2, 3, 4 - "ext v1.16b, v1.16b, v31.16b, #4 \n" // 7, 8, 9 - "ext v2.16b, v2.16b, v31.16b, #4 \n" // 12, 13, 14 - "ext v3.16b, v3.16b, v31.16b, #4 \n" // 17, 18, 19 - "ext v4.16b, v4.16b, v31.16b, #4 \n" // 22, 23, 24 - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - "fmul v19.4s, v0.4s, v11.4s \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, v3.4s, v14.4s \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v7 - "faddp v7.4s, v16.4s, v17.4s \n" - "faddp v8.4s, v18.4s, v19.4s \n" - "faddp v7.4s, v7.4s, v8.4s \n" - - // zip - "zip1 v6.4s, v7.4s, v5.4s \n" - "zip2 v8.4s, v7.4s, v5.4s \n" - - // add bias - "fadd v6.4s, v6.4s, v20.4s \n" - "fadd v8.4s, v8.4s, v20.4s \n" - - // relu - "fmax v6.4s, v6.4s, v31.4s \n" - "fmax v8.4s, v8.4s, v31.4s \n" - - "ext v7.16b, v6.16b, v31.16b, #8 \n" - "ext v9.16b, v8.16b, v31.16b, #8 \n" - - // write output - "str 
d6, [%[dout0]] \n" - "str d7, [%[dout1]] \n" - "str d8, [%[dout2]] \n" - "str d9, [%[dout3]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7), - [wh] "+r"(weights) - : [dout0] "r"(dout0), - [dout1] "r"(dout1), - [dout2] "r"(dout2), - [dout3] "r"(dout3), - [bias] "r"(bias) - : "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v31"); -} - -//! kernel for two out with extracting data post -//! deal with four lines out -void compute_two_out_extract_post(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - float32x4_t w0, - float32x4_t w1, - float32x4_t w2, - float32x4_t w3, - float32x4_t w4, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! dout0 - dout3: v16-v19 - asm volatile( - "movi v31.4s, #0 \n" - - // load inputs - "ld1 {v20.4s}, [%[bias]] \n" - "ld1 {v8.4s}, [%[din0]], #16 \n" - "ld1 {v9.4s}, [%[din1]], #16 \n" - "ld1 {v10.4s}, [%[din2]], #16 \n" - "ld1 {v11.4s}, [%[din3]], #16 \n" - "ld1 {v12.4s}, [%[din4]], #16 \n" - "ld1 {v13.4s}, [%[din5]], #16 \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]], #16 \n" - "ld1 {v15.4s}, [%[din7]], #16 \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v5 - "faddp v5.4s, v16.4s, v17.4s \n" - "faddp v6.4s, v18.4s, v19.4s \n" - "faddp v5.4s, v5.4s, v6.4s \n" - - // ext input - "ext v8.16b, v8.16b, v31.16b, #4 \n" - "ext v9.16b, v9.16b, v31.16b, #4 \n" - "ext v10.16b, v10.16b, v31.16b, #4 \n" - "ext v11.16b, v11.16b, v31.16b, #4 \n" - "ext v12.16b, v12.16b, v31.16b, #4 \n" - "ext v13.16b, v13.16b, v31.16b, #4 \n" - "ext v14.16b, v14.16b, v31.16b, #4 \n" - "ext v15.16b, v15.16b, v31.16b, #4 \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, 
v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v7 - "faddp v7.4s, v16.4s, v17.4s \n" - "faddp v8.4s, v18.4s, v19.4s \n" - "faddp v7.4s, v7.4s, v8.4s \n" - - // zip - "zip1 v6.4s, v5.4s, v7.4s \n" - "zip2 v8.4s, v5.4s, v7.4s \n" - "fadd v6.4s, v6.4s, v20.4s \n" - "fadd v8.4s, v8.4s, v20.4s \n" - "ext v7.16b, v6.16b, v31.16b, #8 \n" - "ext v9.16b, v8.16b, v31.16b, #8 \n" - - // write output - "str d6, [%[dout0]] \n" - "str d7, [%[dout1]] \n" - "str d8, [%[dout2]] \n" - "str d9, [%[dout3]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7) - : [dout0] "r"(dout0), - [dout1] "r"(dout1), - [dout2] "r"(dout2), - [dout3] "r"(dout3), - [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [bias] "r"(bias) - : "memory", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v31"); -} - -//! kernel for two out with extracting data post -//! deal with four lines out -void compute_two_out_extract_post_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - float32x4_t w0, - float32x4_t w1, - float32x4_t w2, - float32x4_t w3, - float32x4_t w4, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! dout0 - dout3: v16-v19 - asm volatile( - "movi v31.4s, #0 \n" - - // load inputs - "ld1 {v20.4s}, [%[bias]] \n" - "ld1 {v8.4s}, [%[din0]], #16 \n" - "ld1 {v9.4s}, [%[din1]], #16 \n" - "ld1 {v10.4s}, [%[din2]], #16 \n" - "ld1 {v11.4s}, [%[din3]], #16 \n" - "ld1 {v12.4s}, [%[din4]], #16 \n" - "ld1 {v13.4s}, [%[din5]], #16 \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]], #16 \n" - "ld1 {v15.4s}, [%[din7]], #16 \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v5 - "faddp v5.4s, v16.4s, v17.4s \n" - "faddp v6.4s, v18.4s, v19.4s \n" - "faddp v5.4s, v5.4s, v6.4s \n" - - // ext input - "ext v8.16b, v8.16b, v31.16b, #4 \n" - "ext v9.16b, v9.16b, v31.16b, #4 \n" - "ext v10.16b, v10.16b, v31.16b, #4 \n" - "ext v11.16b, v11.16b, v31.16b, #4 \n" - "ext v12.16b, v12.16b, v31.16b, #4 \n" - "ext v13.16b, v13.16b, v31.16b, #4 \n" - "ext v14.16b, v14.16b, v31.16b, #4 \n" - "ext v15.16b, v15.16b, v31.16b, #4 \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, 
v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v7 - "faddp v7.4s, v16.4s, v17.4s \n" - "faddp v8.4s, v18.4s, v19.4s \n" - "faddp v7.4s, v7.4s, v8.4s \n" - - // zip - "zip1 v6.4s, v5.4s, v7.4s \n" - "zip2 v8.4s, v5.4s, v7.4s \n" - - // add bias - "fadd v6.4s, v6.4s, v20.4s \n" - "fadd v8.4s, v8.4s, v20.4s \n" - - // relu - "fmax v6.4s, v6.4s, v31.4s \n" - "fmax v8.4s, v8.4s, v31.4s \n" - "ext v7.16b, v6.16b, v31.16b, #8 \n" - "ext v9.16b, v8.16b, v31.16b, #8 \n" - - // write output - "str d6, [%[dout0]] \n" - "str d7, [%[dout1]] \n" - "str d8, [%[dout2]] \n" - "str d9, [%[dout3]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7) - : [dout0] "r"(dout0), - [dout1] "r"(dout1), - [dout2] "r"(dout2), - [dout3] "r"(dout3), - [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [bias] "r"(bias) - : "memory", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v31"); -} - -//! kernel for three out with extracting data pre -//! deal with four lines out -//! need extra load weights -void compute_three_out_extract_pre(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - const float* weights, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! dout0 - dout3: v16-v19 - //! 
weights: v0-v4 - asm volatile( - // load weights - "movi v31.4s, #0 \n" - "add %[wh], %[wh], #4 \n" - "ldr q0, [%[wh]], #20 \n" // 1, 2, 3, 4 - "ldr q1, [%[wh]], #20 \n" // 6, 7, 8, 9 - "ldr q2, [%[wh]], #20 \n" // 11, 12, 13, 14 - "ldr q3, [%[wh]], #20 \n" // 16, 17, 18, 19 - "ldr q4, [%[wh]], #20 \n" // 21, 22, 23, 24 - - // load inputs - "ld1 {v20.4s}, [%[bias]] \n" - "ld1 {v8.4s}, [%[din0]], #16 \n" - "ld1 {v9.4s}, [%[din1]], #16 \n" - "ld1 {v10.4s}, [%[din2]], #16 \n" - "ld1 {v11.4s}, [%[din3]], #16 \n" - "ld1 {v12.4s}, [%[din4]], #16 \n" - "ld1 {v13.4s}, [%[din5]], #16 \n" - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - "fmul v19.4s, v0.4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]], #16 \n" - "ld1 {v15.4s}, [%[din7]], #16 \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, v3.4s, v14.4s \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v5 - "faddp v5.4s, v16.4s, v17.4s \n" - "faddp v6.4s, v18.4s, v19.4s \n" - "faddp v5.4s, v5.4s, v6.4s \n" - - // ext weights - "ext v0.16b, v0.16b, v31.16b, #4 \n" // 2, 3, 4 - "ext v1.16b, v1.16b, v31.16b, #4 \n" // 7, 8, 9 - "ext v2.16b, v2.16b, v31.16b, #4 \n" // 12, 13, 14 - "ext v3.16b, v3.16b, v31.16b, #4 \n" // 17, 18, 19 - "ext v4.16b, v4.16b, v31.16b, #4 \n" // 22, 23, 24 - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - "fmul v19.4s, v0.4s, v11.4s \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, v3.4s, v14.4s \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v7 - "faddp v7.4s, v16.4s, v17.4s \n" - "faddp v6.4s, v18.4s, v19.4s \n" - "faddp v7.4s, v7.4s, v6.4s \n" - - // ext weights - "ext v0.16b, v0.16b, v31.16b, #4 \n" // 3, 4 - "ext v1.16b, v1.16b, v31.16b, #4 \n" // 8, 9 - "ext v2.16b, v2.16b, v31.16b, #4 \n" // 13, 14 - "ext v3.16b, v3.16b, v31.16b, #4 \n" // 18, 19 - "ext v4.16b, v4.16b, v31.16b, #4 \n" // 23, 24 - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - "fmul v19.4s, v0.4s, v11.4s \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, 
v3.4s, v14.4s \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v25 - "faddp v25.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v25.4s, v25.4s, v26.4s \n" - "fadd v25.4s, v25.4s, v20.4s \n" - - // zip - "zip1 v6.4s, v7.4s, v5.4s \n" - "zip2 v8.4s, v7.4s, v5.4s \n" - "fadd v6.4s, v6.4s, v20.4s \n" - "fadd v8.4s, v8.4s, v20.4s \n" - "ext v7.16b, v6.16b, v31.16b, #8 \n" - "ext v9.16b, v8.16b, v31.16b, #8 \n" - - // write output - "st1 {v25.s}[0], [%[dout0]], #4 \n" - "st1 {v25.s}[1], [%[dout1]], #4 \n" - "st1 {v25.s}[2], [%[dout2]], #4 \n" - "st1 {v25.s}[3], [%[dout3]], #4 \n" - - "str d6, [%[dout0]] \n" - "str d7, [%[dout1]] \n" - "str d8, [%[dout2]] \n" - "str d9, [%[dout3]] \n" - - : [dout0] "+r"(dout0), - [dout1] "+r"(dout1), - [dout2] "+r"(dout2), - [dout3] "+r"(dout3), - [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7), - [wh] "+r"(weights) - : [bias] "r"(bias) - : "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v25", - "v26", - "v31"); -} - -//! kernel for three out with extracting data pre -//! deal with four lines out -//! need extra load weights -void compute_three_out_extract_pre_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - const float* weights, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! dout0 - dout3: v16-v19 - //! 
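// ---------------------------------------------------------------------
// The "extract pre" kernels handle the left-padding outputs by shifting
// the *weights* instead of the inputs: the 5-tap rows are reloaded from
// element 1 and then shifted toward zero with ext, so each padded output
// uses only the taps that overlap real data. A scalar sketch of one
// row's contribution (ours, illustrative only):
static float edge_out_pre(const float* w_row,   // 5 taps of one kernel row
                          const float* in_row,  // input row, column 0 first
                          int p) {              // 1 <= p <= 4 taps cut off
  float s = 0.f;
  for (int k = p; k < 5; ++k)      // taps 0..p-1 fall on the zero padding
    s += w_row[k] * in_row[k - p];
  return s;
}
// ---------------------------------------------------------------------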
weights: v0-v4 - asm volatile( - // load weights - "movi v31.4s, #0 \n" - "add %[wh], %[wh], #4 \n" - "ldr q0, [%[wh]], #20 \n" // 1, 2, 3, 4 - "ldr q1, [%[wh]], #20 \n" // 6, 7, 8, 9 - "ldr q2, [%[wh]], #20 \n" // 11, 12, 13, 14 - "ldr q3, [%[wh]], #20 \n" // 16, 17, 18, 19 - "ldr q4, [%[wh]], #20 \n" // 21, 22, 23, 24 - - // load inputs - "ld1 {v20.4s}, [%[bias]] \n" - "ld1 {v8.4s}, [%[din0]], #16 \n" - "ld1 {v9.4s}, [%[din1]], #16 \n" - "ld1 {v10.4s}, [%[din2]], #16 \n" - "ld1 {v11.4s}, [%[din3]], #16 \n" - "ld1 {v12.4s}, [%[din4]], #16 \n" - "ld1 {v13.4s}, [%[din5]], #16 \n" - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - "fmul v19.4s, v0.4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]], #16 \n" - "ld1 {v15.4s}, [%[din7]], #16 \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, v3.4s, v14.4s \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v5 - "faddp v5.4s, v16.4s, v17.4s \n" - "faddp v6.4s, v18.4s, v19.4s \n" - "faddp v5.4s, v5.4s, v6.4s \n" - - // ext weights - "ext v0.16b, v0.16b, v31.16b, #4 \n" // 2, 3, 4 - "ext v1.16b, v1.16b, v31.16b, #4 \n" // 7, 8, 9 - "ext v2.16b, v2.16b, v31.16b, #4 \n" // 12, 13, 14 - "ext v3.16b, v3.16b, v31.16b, #4 \n" // 17, 18, 19 - "ext v4.16b, v4.16b, v31.16b, #4 \n" // 22, 23, 24 - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - "fmul v19.4s, v0.4s, v11.4s \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, v3.4s, v14.4s \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v7 - "faddp v7.4s, v16.4s, v17.4s \n" - "faddp v6.4s, v18.4s, v19.4s \n" - "faddp v7.4s, v7.4s, v6.4s \n" - - // ext weights - "ext v0.16b, v0.16b, v31.16b, #4 \n" // 3, 4 - "ext v1.16b, v1.16b, v31.16b, #4 \n" // 8, 9 - "ext v2.16b, v2.16b, v31.16b, #4 \n" // 13, 14 - "ext v3.16b, v3.16b, v31.16b, #4 \n" // 18, 19 - "ext v4.16b, v4.16b, v31.16b, #4 \n" // 23, 24 - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - "fmul v19.4s, v0.4s, v11.4s \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, 
v3.4s, v14.4s \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v25 - "faddp v25.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v25.4s, v25.4s, v26.4s \n" - "fadd v25.4s, v25.4s, v20.4s \n" - "fmax v25.4s, v25.4s, v31.4s \n" - - // zip - "zip1 v6.4s, v7.4s, v5.4s \n" - "zip2 v8.4s, v7.4s, v5.4s \n" - - // add bias - "fadd v6.4s, v6.4s, v20.4s \n" - "fadd v8.4s, v8.4s, v20.4s \n" - - // relu - "fmax v6.4s, v6.4s, v31.4s \n" - "fmax v8.4s, v8.4s, v31.4s \n" - - "ext v7.16b, v6.16b, v31.16b, #8 \n" - "ext v9.16b, v8.16b, v31.16b, #8 \n" - - // write output - "st1 {v25.s}[0], [%[dout0]], #4 \n" - "st1 {v25.s}[1], [%[dout1]], #4 \n" - "st1 {v25.s}[2], [%[dout2]], #4 \n" - "st1 {v25.s}[3], [%[dout3]], #4 \n" - - "str d6, [%[dout0]] \n" - "str d7, [%[dout1]] \n" - "str d8, [%[dout2]] \n" - "str d9, [%[dout3]] \n" - - : [dout0] "+r"(dout0), - [dout1] "+r"(dout1), - [dout2] "+r"(dout2), - [dout3] "+r"(dout3), - [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7), - [wh] "+r"(weights) - : [bias] "r"(bias) - : "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v25", - "v26", - "v31"); -} - -//! kernel for three out with extracting data post -//! deal with four lines out -void compute_three_out_extract_post(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - float32x4_t w0, - float32x4_t w1, - float32x4_t w2, - float32x4_t w3, - float32x4_t w4, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! 
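// ---------------------------------------------------------------------
// The _relu variants differ from their plain twins only in the fmax
// instructions against the all-zero register v31: bias add and ReLU are
// fused into the store path rather than run as a second pass over the
// output. Equivalent one-liner in intrinsics (a sketch, ours):
#include <arm_neon.h>
static inline float32x4_t bias_relu(float32x4_t acc, float32x4_t bias) {
  return vmaxq_f32(vaddq_f32(acc, bias), vdupq_n_f32(0.f));  // max(x+b, 0)
}
// ---------------------------------------------------------------------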
dout0 - dout3: v6, v8, v25 - asm volatile( - "movi v31.4s, #0 \n" - // load inputs - "ld1 {v20.4s}, [%[bias]] \n" - "ld1 {v8.4s}, [%[din0]], #16 \n" - "ld1 {v9.4s}, [%[din1]], #16 \n" - "ld1 {v10.4s}, [%[din2]], #16 \n" - "ld1 {v11.4s}, [%[din3]], #16 \n" - "ld1 {v12.4s}, [%[din4]], #16 \n" - "ld1 {v13.4s}, [%[din5]], #16 \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]], #16 \n" - "ld1 {v15.4s}, [%[din7]], #16 \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v5 - "faddp v5.4s, v16.4s, v17.4s \n" - "faddp v6.4s, v18.4s, v19.4s \n" - "faddp v5.4s, v5.4s, v6.4s \n" - - // ext input - "ext v8.16b, v8.16b, v31.16b, #4 \n" - "ext v9.16b, v9.16b, v31.16b, #4 \n" - "ext v10.16b, v10.16b, v31.16b, #4 \n" - "ext v11.16b, v11.16b, v31.16b, #4 \n" - "ext v12.16b, v12.16b, v31.16b, #4 \n" - "ext v13.16b, v13.16b, v31.16b, #4 \n" - "ext v14.16b, v14.16b, v31.16b, #4 \n" - "ext v15.16b, v15.16b, v31.16b, #4 \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v7 - "faddp v7.4s, v16.4s, v17.4s \n" - "faddp v6.4s, v18.4s, v19.4s \n" - "faddp v7.4s, v7.4s, v6.4s \n" - - // ext input - "ext v8.16b, v8.16b, v31.16b, #4 \n" - "ext v9.16b, v9.16b, v31.16b, #4 \n" - "ext v10.16b, v10.16b, v31.16b, #4 \n" - "ext v11.16b, v11.16b, v31.16b, #4 \n" - "ext v12.16b, v12.16b, v31.16b, #4 \n" - "ext v13.16b, v13.16b, v31.16b, #4 \n" - "ext v14.16b, v14.16b, v31.16b, #4 \n" - "ext v15.16b, v15.16b, v31.16b, #4 \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - 
"fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v25 - "faddp v25.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v25.4s, v25.4s, v26.4s \n" - "fadd v25.4s, v25.4s, v20.4s \n" - - // zip - "zip1 v6.4s, v5.4s, v7.4s \n" - "zip2 v8.4s, v5.4s, v7.4s \n" - "fadd v6.4s, v6.4s, v20.4s \n" - "fadd v8.4s, v8.4s, v20.4s \n" - "ext v7.16b, v6.16b, v31.16b, #8 \n" - "ext v9.16b, v8.16b, v31.16b, #8 \n" - - // write output - "str d6, [%[dout0]], #8 \n" - "str d7, [%[dout1]], #8 \n" - "str d8, [%[dout2]], #8 \n" - "str d9, [%[dout3]], #8 \n" - - "st1 {v25.s}[0], [%[dout0]] \n" - "st1 {v25.s}[1], [%[dout1]] \n" - "st1 {v25.s}[2], [%[dout2]] \n" - "st1 {v25.s}[3], [%[dout3]] \n" - - : [dout0] "+r"(dout0), - [dout1] "+r"(dout1), - [dout2] "+r"(dout2), - [dout3] "+r"(dout3), - [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7) - : [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [bias] "r"(bias) - : "memory", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v25", - "v26", - "v31"); -} - -//! kernel for three out with extracting data post -//! deal with four lines out -void compute_three_out_extract_post_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - float32x4_t w0, - float32x4_t w1, - float32x4_t w2, - float32x4_t w3, - float32x4_t w4, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! 
dout0 - dout3: v6, v8, v25 - asm volatile( - "movi v31.4s, #0 \n" - - // load inputs - "ld1 {v20.4s}, [%[bias]] \n" - "ld1 {v8.4s}, [%[din0]], #16 \n" - "ld1 {v9.4s}, [%[din1]], #16 \n" - "ld1 {v10.4s}, [%[din2]], #16 \n" - "ld1 {v11.4s}, [%[din3]], #16 \n" - "ld1 {v12.4s}, [%[din4]], #16 \n" - "ld1 {v13.4s}, [%[din5]], #16 \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]], #16 \n" - "ld1 {v15.4s}, [%[din7]], #16 \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v5 - "faddp v5.4s, v16.4s, v17.4s \n" - "faddp v6.4s, v18.4s, v19.4s \n" - "faddp v5.4s, v5.4s, v6.4s \n" - - // ext input - "ext v8.16b, v8.16b, v31.16b, #4 \n" - "ext v9.16b, v9.16b, v31.16b, #4 \n" - "ext v10.16b, v10.16b, v31.16b, #4 \n" - "ext v11.16b, v11.16b, v31.16b, #4 \n" - "ext v12.16b, v12.16b, v31.16b, #4 \n" - "ext v13.16b, v13.16b, v31.16b, #4 \n" - "ext v14.16b, v14.16b, v31.16b, #4 \n" - "ext v15.16b, v15.16b, v31.16b, #4 \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v7 - "faddp v7.4s, v16.4s, v17.4s \n" - "faddp v6.4s, v18.4s, v19.4s \n" - "faddp v7.4s, v7.4s, v6.4s \n" - - // ext input - "ext v8.16b, v8.16b, v31.16b, #4 \n" - "ext v9.16b, v9.16b, v31.16b, #4 \n" - "ext v10.16b, v10.16b, v31.16b, #4 \n" - "ext v11.16b, v11.16b, v31.16b, #4 \n" - "ext v12.16b, v12.16b, v31.16b, #4 \n" - "ext v13.16b, v13.16b, v31.16b, #4 \n" - "ext v14.16b, v14.16b, v31.16b, #4 \n" - "ext v15.16b, v15.16b, v31.16b, #4 \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - 
"fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v25 - "faddp v25.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v25.4s, v25.4s, v26.4s \n" - "fadd v25.4s, v25.4s, v20.4s \n" - "fmax v25.4s, v25.4s, v31.4s \n" - - // zip - "zip1 v6.4s, v5.4s, v7.4s \n" - "zip2 v8.4s, v5.4s, v7.4s \n" - - // add bias - "fadd v6.4s, v6.4s, v20.4s \n" - "fadd v8.4s, v8.4s, v20.4s \n" - - // relu - "fmax v6.4s, v6.4s, v31.4s \n" - "fmax v8.4s, v8.4s, v31.4s \n" - - "ext v7.16b, v6.16b, v31.16b, #8 \n" - "ext v9.16b, v8.16b, v31.16b, #8 \n" - - // write output - "str d6, [%[dout0]], #8 \n" - "str d7, [%[dout1]], #8 \n" - "str d8, [%[dout2]], #8 \n" - "str d9, [%[dout3]], #8 \n" - - "st1 {v25.s}[0], [%[dout0]] \n" - "st1 {v25.s}[1], [%[dout1]] \n" - "st1 {v25.s}[2], [%[dout2]] \n" - "st1 {v25.s}[3], [%[dout3]] \n" - - : [dout0] "+r"(dout0), - [dout1] "+r"(dout1), - [dout2] "+r"(dout2), - [dout3] "+r"(dout3), - [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7) - : [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [bias] "r"(bias) - : "memory", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v25", - "v26", - "v31"); -} - -//! kernel for four out with extracting data pre -//! deal with four lines out -//! need extra load weights -void compute_four_out_extract_pre(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - const float* weights, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! dout0 - dout3: v0-v3 - //! 
weights: v0-v4, v5, v6 - asm volatile( - // load weights - "movi v31.4s, #0 \n" - "mov x0, #20 \n" - "add %[wh], %[wh], #4 \n" - "ldr q0, [%[wh]], #20 \n" // 1, 2, 3, 4 - "ldr q1, [%[wh]], #20 \n" // 6, 7, 8, 9 - "ldr q2, [%[wh]], #20 \n" // 11, 12, 13, 14 - "ldr q3, [%[wh]], #20 \n" // 16, 17, 18, 19 - "ldr q4, [%[wh]] \n" // 21, 22, 23, 24 - "sub %[wh], %[wh], #68 \n" - - // load inputs - "ld1 {v8.4s}, [%[din0]] \n" - "ld1 {v9.4s}, [%[din1]] \n" - "ld1 {v10.4s}, [%[din2]] \n" - "ld1 {v11.4s}, [%[din3]] \n" - "ld1 {v12.4s}, [%[din4]] \n" - "ld1 {v13.4s}, [%[din5]] \n" - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - "fmul v19.4s, v0.4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]] \n" - "ld1 {v15.4s}, [%[din7]] \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, v3.4s, v14.4s \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v25 - "faddp v25.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v25.4s, v25.4s, v26.4s \n" - - // load weights col5 - "ld1 {v5.s}[0], [%[wh]], x0 \n" - "ld1 {v5.s}[1], [%[wh]], x0 \n" - "ld1 {v5.s}[2], [%[wh]], x0 \n" - "ld1 {v5.s}[3], [%[wh]], x0 \n" - "ld1 {v6.s}[0], [%[wh]] \n" - - // ext weights - "ext v0.16b, v0.16b, v31.16b, #4 \n" // 2, 3, 4 - "ext v1.16b, v1.16b, v31.16b, #4 \n" // 7, 8, 9 - "ext v2.16b, v2.16b, v31.16b, #4 \n" // 12, 13, 14 - "ext v3.16b, v3.16b, v31.16b, #4 \n" // 17, 18, 19 - "ext v4.16b, v4.16b, v31.16b, #4 \n" // 22, 23, 24 - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - "fmul v19.4s, v0.4s, v11.4s \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, v3.4s, v14.4s \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v27 - "faddp v27.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v27.4s, v27.4s, v26.4s \n" - - // load in col5 - "ld1 {v20.s}[0], [%[din0]] \n" - "ld1 {v20.s}[1], [%[din1]] \n" - "ld1 {v20.s}[2], [%[din2]] \n" - "ld1 {v20.s}[3], [%[din3]] \n" - - // ext weights - "ext v0.16b, v0.16b, v31.16b, #4 \n" // 3, 4 - "ext v1.16b, v1.16b, v31.16b, #4 \n" // 8, 9 - "ext v2.16b, v2.16b, v31.16b, #4 \n" // 13, 14 - "ext v3.16b, v3.16b, v31.16b, #4 \n" // 18, 19 - "ext v4.16b, v4.16b, v31.16b, #4 \n" // 23, 24 - - "ld1 {v21.s}[0], [%[din4]] \n" - "ld1 {v21.s}[1], [%[din5]] \n" - "ld1 {v21.s}[2], [%[din6]] \n" - "ld1 {v21.s}[3], [%[din7]] \n" - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - 
"fmul v19.4s, v0.4s, v11.4s \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, v3.4s, v14.4s \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v26 - "faddp v26.4s, v16.4s, v17.4s \n" - "faddp v28.4s, v18.4s, v19.4s \n" - "faddp v26.4s, v26.4s, v28.4s \n" - - // ext input col5 - "ext v22.16b, v20.16b, v21.16b, #4 \n" - "ext v23.16b, v20.16b, v21.16b, #8 \n" - "ext v24.16b, v20.16b, v21.16b, #12 \n" - - // in col5 - "fmul v16.4s, v5.4s, v20.4s \n" - "fmul v17.4s, v5.4s, v22.4s \n" - "fmul v18.4s, v5.4s, v23.4s \n" - "fmul v19.4s, v5.4s, v24.4s \n" - - // add to out register v28 - "faddp v28.4s, v16.4s, v17.4s \n" - "faddp v29.4s, v18.4s, v19.4s \n" - "faddp v28.4s, v28.4s, v29.4s \n" - "fmla v28.4s, v21.4s, v6.s[0] \n" - - "ld1 {v8.4s}, [%[bias]] \n" - - // zip - "zip1 v0.4s, v28.4s, v26.4s \n" - "zip2 v2.4s, v28.4s, v26.4s \n" - "zip1 v4.4s, v27.4s, v25.4s \n" - "zip2 v6.4s, v27.4s, v25.4s \n" - - "fadd v0.4s, v0.4s, v8.4s \n" - "fadd v2.4s, v2.4s, v8.4s \n" - "fadd v4.4s, v4.4s, v8.4s \n" - "fadd v6.4s, v6.4s, v8.4s \n" - - "ext v1.16b, v0.16b, v31.16b, #8 \n" - "ext v3.16b, v2.16b, v31.16b, #8 \n" - "ext v5.16b, v4.16b, v31.16b, #8 \n" - "ext v7.16b, v6.16b, v31.16b, #8 \n" - - // write output - "str d0, [%[dout0]], #8 \n" - "str d1, [%[dout1]], #8 \n" - "str d2, [%[dout2]], #8 \n" - "str d3, [%[dout3]], #8 \n" - - "str d4, [%[dout0]] \n" - "str d5, [%[dout1]] \n" - "str d6, [%[dout2]] \n" - "str d7, [%[dout3]] \n" - - : [dout0] "+r"(dout0), - [dout1] "+r"(dout1), - [dout2] "+r"(dout2), - [dout3] "+r"(dout3), - [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7), - [wh] "+r"(weights) - : [bias] "r"(bias) - : "memory", - "x0", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "v26", - "v27", - "v28", - "v29", - "v31"); -} - -//! kernel for four out with extracting data pre -//! deal with four lines out -//! need extra load weights -void compute_four_out_extract_pre_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - const float* weights, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! dout0 - dout3: v0-v3 - //! 
weights: v0-v4, v5, v6 - asm volatile( - // load weights - "movi v31.4s, #0 \n" - "mov x0, #20 \n" - "add %[wh], %[wh], #4 \n" - "ldr q0, [%[wh]], #20 \n" // 1, 2, 3, 4 - "ldr q1, [%[wh]], #20 \n" // 6, 7, 8, 9 - "ldr q2, [%[wh]], #20 \n" // 11, 12, 13, 14 - "ldr q3, [%[wh]], #20 \n" // 16, 17, 18, 19 - "ldr q4, [%[wh]] \n" // 21, 22, 23, 24 - "sub %[wh], %[wh], #68 \n" - - // load inputs - "ld1 {v8.4s}, [%[din0]] \n" - "ld1 {v9.4s}, [%[din1]] \n" - "ld1 {v10.4s}, [%[din2]] \n" - "ld1 {v11.4s}, [%[din3]] \n" - "ld1 {v12.4s}, [%[din4]] \n" - "ld1 {v13.4s}, [%[din5]] \n" - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - "fmul v19.4s, v0.4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]] \n" - "ld1 {v15.4s}, [%[din7]] \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, v3.4s, v14.4s \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v25 - "faddp v25.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v25.4s, v25.4s, v26.4s \n" - - // load weights col5 - "ld1 {v5.s}[0], [%[wh]], x0 \n" - "ld1 {v5.s}[1], [%[wh]], x0 \n" - "ld1 {v5.s}[2], [%[wh]], x0 \n" - "ld1 {v5.s}[3], [%[wh]], x0 \n" - "ld1 {v6.s}[0], [%[wh]] \n" - - // ext weights - "ext v0.16b, v0.16b, v31.16b, #4 \n" // 2, 3, 4 - "ext v1.16b, v1.16b, v31.16b, #4 \n" // 7, 8, 9 - "ext v2.16b, v2.16b, v31.16b, #4 \n" // 12, 13, 14 - "ext v3.16b, v3.16b, v31.16b, #4 \n" // 17, 18, 19 - "ext v4.16b, v4.16b, v31.16b, #4 \n" // 22, 23, 24 - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - "fmul v19.4s, v0.4s, v11.4s \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, v3.4s, v14.4s \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v27 - "faddp v27.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v27.4s, v27.4s, v26.4s \n" - - // load in col5 - "ld1 {v20.s}[0], [%[din0]] \n" - "ld1 {v20.s}[1], [%[din1]] \n" - "ld1 {v20.s}[2], [%[din2]] \n" - "ld1 {v20.s}[3], [%[din3]] \n" - - // ext weights - "ext v0.16b, v0.16b, v31.16b, #4 \n" // 3, 4 - "ext v1.16b, v1.16b, v31.16b, #4 \n" // 8, 9 - "ext v2.16b, v2.16b, v31.16b, #4 \n" // 13, 14 - "ext v3.16b, v3.16b, v31.16b, #4 \n" // 18, 19 - "ext v4.16b, v4.16b, v31.16b, #4 \n" // 23, 24 - - "ld1 {v21.s}[0], [%[din4]] \n" - "ld1 {v21.s}[1], [%[din5]] \n" - "ld1 {v21.s}[2], [%[din6]] \n" - "ld1 {v21.s}[3], [%[din7]] \n" - - // in row0 - "fmul v16.4s, v0.4s, v8.4s \n" - "fmul v17.4s, v0.4s, v9.4s \n" - "fmul v18.4s, v0.4s, v10.4s \n" - 
"fmul v19.4s, v0.4s, v11.4s \n" - - // in row1 - "fmla v16.4s, v1.4s, v9.4s \n" - "fmla v17.4s, v1.4s, v10.4s \n" - "fmla v18.4s, v1.4s, v11.4s \n" - "fmla v19.4s, v1.4s, v12.4s \n" - - // in row2 - "fmla v16.4s, v2.4s, v10.4s \n" - "fmla v17.4s, v2.4s, v11.4s \n" - "fmla v18.4s, v2.4s, v12.4s \n" - "fmla v19.4s, v2.4s, v13.4s \n" - - // in row3 - "fmla v16.4s, v3.4s, v11.4s \n" - "fmla v17.4s, v3.4s, v12.4s \n" - "fmla v18.4s, v3.4s, v13.4s \n" - "fmla v19.4s, v3.4s, v14.4s \n" - - // in row4 - "fmla v16.4s, v4.4s, v12.4s \n" - "fmla v17.4s, v4.4s, v13.4s \n" - "fmla v18.4s, v4.4s, v14.4s \n" - "fmla v19.4s, v4.4s, v15.4s \n" - - // add to out register v26 - "faddp v26.4s, v16.4s, v17.4s \n" - "faddp v28.4s, v18.4s, v19.4s \n" - "faddp v26.4s, v26.4s, v28.4s \n" - - // ext input col5 - "ext v22.16b, v20.16b, v21.16b, #4 \n" - "ext v23.16b, v20.16b, v21.16b, #8 \n" - "ext v24.16b, v20.16b, v21.16b, #12 \n" - - // in col5 - "fmul v16.4s, v5.4s, v20.4s \n" - "fmul v17.4s, v5.4s, v22.4s \n" - "fmul v18.4s, v5.4s, v23.4s \n" - "fmul v19.4s, v5.4s, v24.4s \n" - - // add to out register v28 - "faddp v28.4s, v16.4s, v17.4s \n" - "faddp v29.4s, v18.4s, v19.4s \n" - "faddp v28.4s, v28.4s, v29.4s \n" - "fmla v28.4s, v21.4s, v6.s[0] \n" - - "ld1 {v8.4s}, [%[bias]] \n" - - // zip - "zip1 v0.4s, v28.4s, v26.4s \n" - "zip2 v2.4s, v28.4s, v26.4s \n" - "zip1 v4.4s, v27.4s, v25.4s \n" - "zip2 v6.4s, v27.4s, v25.4s \n" - - // add bias - "fadd v0.4s, v0.4s, v8.4s \n" - "fadd v2.4s, v2.4s, v8.4s \n" - "fadd v4.4s, v4.4s, v8.4s \n" - "fadd v6.4s, v6.4s, v8.4s \n" - - // relu - "fmax v0.4s, v0.4s, v31.4s \n" - "fmax v2.4s, v2.4s, v31.4s \n" - "fmax v4.4s, v4.4s, v31.4s \n" - "fmax v6.4s, v6.4s, v31.4s \n" - - "ext v1.16b, v0.16b, v31.16b, #8 \n" - "ext v3.16b, v2.16b, v31.16b, #8 \n" - "ext v5.16b, v4.16b, v31.16b, #8 \n" - "ext v7.16b, v6.16b, v31.16b, #8 \n" - - // write output - "str d0, [%[dout0]], #8 \n" - "str d1, [%[dout1]], #8 \n" - "str d2, [%[dout2]], #8 \n" - "str d3, [%[dout3]], #8 \n" - - "str d4, [%[dout0]] \n" - "str d5, [%[dout1]] \n" - "str d6, [%[dout2]] \n" - "str d7, [%[dout3]] \n" - - : [dout0] "+r"(dout0), - [dout1] "+r"(dout1), - [dout2] "+r"(dout2), - [dout3] "+r"(dout3), - [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7), - [wh] "+r"(weights) - : [bias] "r"(bias) - : "memory", - "x0", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "v26", - "v27", - "v28", - "v29", - "v31"); -} - -//! kernel for four out with extracting data post -//! deal with four lines out -void compute_four_out_extract_post(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - float32x4_t w0, - float32x4_t w1, - float32x4_t w2, - float32x4_t w3, - float32x4_t w4, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! 
dout0 - dout3: v0-v3 - const int64_t s_12 = 12; - const float* doutl[4] = {dout0, dout1, dout2, dout3}; - void* doutl_ptr = reinterpret_cast(doutl); - asm volatile( - "movi v31.4s, #0 \n" - "ldp x0, x1, [%[doutl]], #16 \n" - "ldp x2, x3, [%[doutl]] \n" - - // load inputs - "ld1 {v8.4s}, [%[din0]], %[s_12] \n" - "ld1 {v9.4s}, [%[din1]], %[s_12] \n" - "ld1 {v10.4s}, [%[din2]], %[s_12] \n" - "ld1 {v11.4s}, [%[din3]], %[s_12] \n" - "ld1 {v12.4s}, [%[din4]], %[s_12] \n" - "ld1 {v13.4s}, [%[din5]], %[s_12] \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]], %[s_12] \n" - "ld1 {v15.4s}, [%[din7]], %[s_12] \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v25 - "faddp v25.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v25.4s, v25.4s, v26.4s \n" - - // load input col5 - "ld1 {v20.s}[0], [%[din0]] \n" - "ld1 {v20.s}[1], [%[din1]] \n" - "ld1 {v20.s}[2], [%[din2]] \n" - "ld1 {v20.s}[3], [%[din3]] \n" - - // ext input - "ext v8.16b, v8.16b, v31.16b, #4 \n" - "ext v9.16b, v9.16b, v31.16b, #4 \n" - "ext v10.16b, v10.16b, v31.16b, #4 \n" - "ext v11.16b, v11.16b, v31.16b, #4 \n" - "ext v12.16b, v12.16b, v31.16b, #4 \n" - "ext v13.16b, v13.16b, v31.16b, #4 \n" - "ext v14.16b, v14.16b, v31.16b, #4 \n" - "ext v15.16b, v15.16b, v31.16b, #4 \n" - - // load input col5 - "ld1 {v21.s}[0], [%[din4]] \n" - "ld1 {v21.s}[1], [%[din5]] \n" - "ld1 {v21.s}[2], [%[din6]] \n" - "ld1 {v21.s}[3], [%[din7]] \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v27 - "faddp v27.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v27.4s, v27.4s, v26.4s \n" - - // ext input - "ext v8.16b, v8.16b, v31.16b, #4 \n" - "ext v9.16b, v9.16b, v31.16b, #4 \n" - "ext v10.16b, v10.16b, v31.16b, #4 \n" - "ext v11.16b, v11.16b, v31.16b, #4 \n" - "ext v12.16b, v12.16b, v31.16b, #4 \n" - "ext v13.16b, v13.16b, v31.16b, #4 \n" - "ext v14.16b, v14.16b, v31.16b, #4 \n" - "ext v15.16b, v15.16b, v31.16b, #4 \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, 
%[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v26 - "faddp v26.4s, v16.4s, v17.4s \n" - "faddp v28.4s, v18.4s, v19.4s \n" - "faddp v26.4s, v26.4s, v28.4s \n" - - // ext input col5 - "ext v8.16b, v20.16b, v21.16b, #4 \n" - "ext v9.16b, v20.16b, v21.16b, #8 \n" - "ext v10.16b, v20.16b, v21.16b, #12 \n" - - // ext weights col0 - "ins v5.s[0], %[w0].s[0] \n" - "ins v5.s[1], %[w1].s[0] \n" - "ins v5.s[2], %[w2].s[0] \n" - "ins v5.s[3], %[w3].s[0] \n" - - // in col5 - "fmul v16.4s, v5.4s, v20.4s \n" - "fmul v17.4s, v5.4s, v8.4s \n" - "fmul v18.4s, v5.4s, v9.4s \n" - "fmul v19.4s, v5.4s, v10.4s \n" - - // add to out register v28 - "faddp v28.4s, v16.4s, v17.4s \n" - "faddp v29.4s, v18.4s, v19.4s \n" - "faddp v28.4s, v28.4s, v29.4s \n" - "fmla v28.4s, v21.4s, %[w4].s[0] \n" - - "ld1 {v8.4s}, [%[bias]] \n" - - // zip - "zip1 v0.4s, v25.4s, v27.4s \n" - "zip2 v2.4s, v25.4s, v27.4s \n" - "zip1 v4.4s, v26.4s, v28.4s \n" - "zip2 v6.4s, v26.4s, v28.4s \n" - - "fadd v0.4s, v0.4s, v8.4s \n" - "fadd v2.4s, v2.4s, v8.4s \n" - "fadd v4.4s, v4.4s, v8.4s \n" - "fadd v6.4s, v6.4s, v8.4s \n" - - "ext v1.16b, v0.16b, v31.16b, #8 \n" - "ext v3.16b, v2.16b, v31.16b, #8 \n" - "ext v5.16b, v4.16b, v31.16b, #8 \n" - "ext v7.16b, v6.16b, v31.16b, #8 \n" - - // write output - "str d0, [x0], #8 \n" - "str d1, [x1], #8 \n" - "str d2, [x2], #8 \n" - "str d3, [x3], #8 \n" - - "str d4, [x0] \n" - "str d5, [x1] \n" - "str d6, [x2] \n" - "str d7, [x3] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7), - [doutl] "+r"(doutl_ptr) - : [s_12] "r"(s_12), - [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [bias] "r"(bias) - : "memory", - "x0", - "x1", - "x2", - "x3", - "v0", - "v1", - "v2", - "v3", - "v5", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v25", - "v26", - "v27", - "v28", - "v29", - "v31"); -} - -//! kernel for four out with extracting data post -//! deal with four lines out -void compute_four_out_extract_post_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - const float* din6, - const float* din7, - float* dout0, - float* dout1, - float* dout2, - float* dout3, - float32x4_t w0, - float32x4_t w1, - float32x4_t w2, - float32x4_t w3, - float32x4_t w4, - const float* bias) { - //! din0 - din7: 0-4 v8-v15 - //! 
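// ---------------------------------------------------------------------
// The four-out "post" kernels run out of asm operands (five weight
// vectors plus eight din pointers), so the four dout rows travel through
// one array whose address the asm unpacks with ldp into x0-x3. A sketch
// of the calling-side pattern, with the cast spelled the way such code
// is normally written (ours, illustrative):
static inline void* pack_dout_rows(const float* d0, const float* d1,
                                   const float* d2, const float* d3,
                                   const float* rows[4]) {
  rows[0] = d0; rows[1] = d1; rows[2] = d2; rows[3] = d3;
  return reinterpret_cast<void*>(rows);  // handed to the asm as %[doutl]
}
// ---------------------------------------------------------------------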
dout0 - dout3: v0-v3 - const int64_t s_12 = 12; - const float* doutl[4] = {dout0, dout1, dout2, dout3}; - void* doutl_ptr = reinterpret_cast(doutl); - asm volatile( - "movi v31.4s, #0 \n" - "ldp x0, x1, [%[doutl]], #16 \n" - "ldp x2, x3, [%[doutl]] \n" - - // load inputs - "ld1 {v8.4s}, [%[din0]], %[s_12] \n" - "ld1 {v9.4s}, [%[din1]], %[s_12] \n" - "ld1 {v10.4s}, [%[din2]], %[s_12] \n" - "ld1 {v11.4s}, [%[din3]], %[s_12] \n" - "ld1 {v12.4s}, [%[din4]], %[s_12] \n" - "ld1 {v13.4s}, [%[din5]], %[s_12] \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - "ld1 {v14.4s}, [%[din6]], %[s_12] \n" - "ld1 {v15.4s}, [%[din7]], %[s_12] \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v25 - "faddp v25.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v25.4s, v25.4s, v26.4s \n" - - // load input col5 - "ld1 {v20.s}[0], [%[din0]] \n" - "ld1 {v20.s}[1], [%[din1]] \n" - "ld1 {v20.s}[2], [%[din2]] \n" - "ld1 {v20.s}[3], [%[din3]] \n" - - // ext input - "ext v8.16b, v8.16b, v31.16b, #4 \n" - "ext v9.16b, v9.16b, v31.16b, #4 \n" - "ext v10.16b, v10.16b, v31.16b, #4 \n" - "ext v11.16b, v11.16b, v31.16b, #4 \n" - "ext v12.16b, v12.16b, v31.16b, #4 \n" - "ext v13.16b, v13.16b, v31.16b, #4 \n" - "ext v14.16b, v14.16b, v31.16b, #4 \n" - "ext v15.16b, v15.16b, v31.16b, #4 \n" - - // load input col5 - "ld1 {v21.s}[0], [%[din4]] \n" - "ld1 {v21.s}[1], [%[din5]] \n" - "ld1 {v21.s}[2], [%[din6]] \n" - "ld1 {v21.s}[3], [%[din7]] \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, %[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v27 - "faddp v27.4s, v16.4s, v17.4s \n" - "faddp v26.4s, v18.4s, v19.4s \n" - "faddp v27.4s, v27.4s, v26.4s \n" - - // ext input - "ext v8.16b, v8.16b, v31.16b, #4 \n" - "ext v9.16b, v9.16b, v31.16b, #4 \n" - "ext v10.16b, v10.16b, v31.16b, #4 \n" - "ext v11.16b, v11.16b, v31.16b, #4 \n" - "ext v12.16b, v12.16b, v31.16b, #4 \n" - "ext v13.16b, v13.16b, v31.16b, #4 \n" - "ext v14.16b, v14.16b, v31.16b, #4 \n" - "ext v15.16b, v15.16b, v31.16b, #4 \n" - - // in row0 - "fmul v16.4s, %[w0].4s, v8.4s \n" - "fmul v17.4s, 
%[w0].4s, v9.4s \n" - "fmul v18.4s, %[w0].4s, v10.4s \n" - "fmul v19.4s, %[w0].4s, v11.4s \n" - - // in row1 - "fmla v16.4s, %[w1].4s, v9.4s \n" - "fmla v17.4s, %[w1].4s, v10.4s \n" - "fmla v18.4s, %[w1].4s, v11.4s \n" - "fmla v19.4s, %[w1].4s, v12.4s \n" - - // in row2 - "fmla v16.4s, %[w2].4s, v10.4s \n" - "fmla v17.4s, %[w2].4s, v11.4s \n" - "fmla v18.4s, %[w2].4s, v12.4s \n" - "fmla v19.4s, %[w2].4s, v13.4s \n" - - // in row3 - "fmla v16.4s, %[w3].4s, v11.4s \n" - "fmla v17.4s, %[w3].4s, v12.4s \n" - "fmla v18.4s, %[w3].4s, v13.4s \n" - "fmla v19.4s, %[w3].4s, v14.4s \n" - - // in row4 - "fmla v16.4s, %[w4].4s, v12.4s \n" - "fmla v17.4s, %[w4].4s, v13.4s \n" - "fmla v18.4s, %[w4].4s, v14.4s \n" - "fmla v19.4s, %[w4].4s, v15.4s \n" - - // add to out register v26 - "faddp v26.4s, v16.4s, v17.4s \n" - "faddp v28.4s, v18.4s, v19.4s \n" - "faddp v26.4s, v26.4s, v28.4s \n" - - // ext input col5 - "ext v8.16b, v20.16b, v21.16b, #4 \n" - "ext v9.16b, v20.16b, v21.16b, #8 \n" - "ext v10.16b, v20.16b, v21.16b, #12 \n" - - // ext weights col0 - "ins v5.s[0], %[w0].s[0] \n" - "ins v5.s[1], %[w1].s[0] \n" - "ins v5.s[2], %[w2].s[0] \n" - "ins v5.s[3], %[w3].s[0] \n" - - // in col5 - "fmul v16.4s, v5.4s, v20.4s \n" - "fmul v17.4s, v5.4s, v8.4s \n" - "fmul v18.4s, v5.4s, v9.4s \n" - "fmul v19.4s, v5.4s, v10.4s \n" - - // add to out register v28 - "faddp v28.4s, v16.4s, v17.4s \n" - "faddp v29.4s, v18.4s, v19.4s \n" - "faddp v28.4s, v28.4s, v29.4s \n" - "fmla v28.4s, v21.4s, %[w4].s[0] \n" - - "ld1 {v8.4s}, [%[bias]] \n" - - // zip - "zip1 v0.4s, v25.4s, v27.4s \n" - "zip2 v2.4s, v25.4s, v27.4s \n" - "zip1 v4.4s, v26.4s, v28.4s \n" - "zip2 v6.4s, v26.4s, v28.4s \n" - - // add bias - "fadd v0.4s, v0.4s, v8.4s \n" - "fadd v2.4s, v2.4s, v8.4s \n" - "fadd v4.4s, v4.4s, v8.4s \n" - "fadd v6.4s, v6.4s, v8.4s \n" - - // relu - "fmax v0.4s, v0.4s, v31.4s \n" - "fmax v2.4s, v2.4s, v31.4s \n" - "fmax v4.4s, v4.4s, v31.4s \n" - "fmax v6.4s, v6.4s, v31.4s \n" - - "ext v1.16b, v0.16b, v31.16b, #8 \n" - "ext v3.16b, v2.16b, v31.16b, #8 \n" - "ext v5.16b, v4.16b, v31.16b, #8 \n" - "ext v7.16b, v6.16b, v31.16b, #8 \n" - - // write output - "str d0, [x0], #8 \n" - "str d1, [x1], #8 \n" - "str d2, [x2], #8 \n" - "str d3, [x3], #8 \n" - - "str d4, [x0] \n" - "str d5, [x1] \n" - "str d6, [x2] \n" - "str d7, [x3] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [din6] "+r"(din6), - [din7] "+r"(din7), - [doutl] "+r"(doutl_ptr) - : [s_12] "r"(s_12), - [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [bias] "r"(bias) - : "memory", - "x0", - "x1", - "x2", - "x3", - "v0", - "v1", - "v2", - "v3", - "v5", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v25", - "v26", - "v27", - "v28", - "v29", - "v31"); -} - -void conv_depthwise_5x5s1_impl(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - int pad, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - int pad_new = pad > 4 ? 
4 : pad; - int pad_0 = pad - pad_new; - int h_out_new = h_out - 2 * pad_0; - int mid_out = w_out - 2 * pad; - int mid_cnt = mid_out >> 2; - int mid_remain = mid_out - (mid_cnt << 2); - int pad_cnt = pad_0 >> 2; - int pad_remain = pad_0 - (pad_cnt << 2); - int bias_cnt = (w_out * pad_0) >> 2; - int bias_remain = (w_out * pad_0) - (bias_cnt << 2); - int in_spatial_size = w_in * h_in; - int out_spatial_size = w_out * h_out; - int weights_spatial_size = 25; - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * in_spatial_size * ch_in; - float* dout_batch = dout + n * out_spatial_size * ch_out; -#pragma omp parallel for - for (int c = 0; c < ch_in; ++c) { - const float* din_ch = din_batch + c * in_spatial_size; - float* dout_ch = dout_batch + c * out_spatial_size; - float bias_c = flag_bias ? bias[c] : 0.f; - float vbias[4] = {bias_c, bias_c, bias_c, bias_c}; - float32x4_t vbias_c = vdupq_n_f32(bias_c); - if (flag_bias) { - //! deal with h_out pad_0 line with bias - for (int i = 0; i < bias_cnt; ++i) { - vst1q_f32(dout_ch, vbias_c); - dout_ch += 4; - } - for (int i = 0; i < bias_remain; ++i) { - *dout_ch++ = bias_c; - } - } else { - //! deal with h_out pad_0 line without bias - for (int i = 0; i < pad_0; ++i) { - memset(dout_ch, 0x00, w_out * sizeof(float)); - dout_ch += w_out; - } - } - const float* din_list[8]; - const float* dinl[8]; - //! set din ptr with zero buffer - for (int i = 0; i < pad_new; ++i) { - din_list[i] = zero_ptr; - } - //! set din ptr with input data - for (int i = pad_new; i < 8; ++i) { - din_list[i] = din_ch; - din_ch += w_in; - } - - //! every h loop, deal with 4 line output - float* dout0 = dout_ch; - float* dout1 = dout0 + w_out; - float* dout2 = dout1 + w_out; - float* dout3 = dout2 + w_out; - - //! load weights to neon register - const float* weights_c = weights + c * weights_spatial_size; - - float32x4_t w5 = vdupq_n_f32(0.f); - float32x4_t w6 = vdupq_n_f32(0.f); - float32x4_t w0 = vld1q_f32(weights_c); - float32x4_t w1 = vld1q_f32(weights_c + 5); - float32x4_t w2 = vld1q_f32(weights_c + 10); - float32x4_t w3 = vld1q_f32(weights_c + 15); - float32x4_t w4 = vld1q_f32(weights_c + 20); - w5 = vsetq_lane_f32(weights_c[4], w5, 0); - w5 = vsetq_lane_f32(weights_c[9], w5, 1); - w5 = vsetq_lane_f32(weights_c[14], w5, 2); - w5 = vsetq_lane_f32(weights_c[19], w5, 3); - w6 = vsetq_lane_f32(weights_c[24], w6, 0); - - //! h loop - for (int h = 0; h < h_out_new; h += 4) { - //! (h - pad_new) + 7 > h_in - 1 - if (h + 8 - pad_new > h_in) { - switch (h + 8 - pad_new - h_in) { - case 7: - din_list[1] = zero_ptr; - case 6: - din_list[2] = zero_ptr; - case 5: - din_list[3] = zero_ptr; - case 4: - din_list[4] = zero_ptr; - case 3: - din_list[5] = zero_ptr; - case 2: - din_list[6] = zero_ptr; - case 1: - din_list[7] = zero_ptr; - default: - break; - } - } - if (h + 4 > h_out_new) { - switch (h + 4 - h_out_new) { - case 3: - dout1 = write_ptr; - case 2: - dout2 = write_ptr; - case 1: - dout3 = write_ptr; - default: - break; - } - } - - //! every h loop, deal with 8 line input - dinl[0] = din_list[0]; - dinl[1] = din_list[1]; - dinl[2] = din_list[2]; - dinl[3] = din_list[3]; - dinl[4] = din_list[4]; - dinl[5] = din_list[5]; - dinl[6] = din_list[6]; - dinl[7] = din_list[7]; - - const float* weights_ptr = weights_c; - float* dout_ptr0 = dout0; - float* dout_ptr1 = dout1; - float* dout_ptr2 = dout2; - float* dout_ptr3 = dout3; - if (flag_bias) { - //!
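// ---------------------------------------------------------------------
// The driver splits the requested padding in two because a 5x5 window
// can reach at most 4 cells past the input: pad_new is the part the
// edge-extract kernels actually compute, and pad_0 is pure fill (bias,
// or zero without bias). Sketch of the derived sizes, mirroring the
// code above (ours):
struct Pad5x5 {
  int pad_new;    // padding overlapped by real 5x5 windows (<= 4)
  int pad_0;      // fully-padded ring: constant output, no convolution
  int h_out_new;  // output rows that still need any computation
  int mid_out;    // output columns covered by a full 5x5 window
};
static inline Pad5x5 split_pad_5x5(int pad, int h_out, int w_out) {
  Pad5x5 p;
  p.pad_new = pad > 4 ? 4 : pad;
  p.pad_0 = pad - p.pad_new;  // e.g. pad = 5 -> pad_new = 4, pad_0 = 1
  p.h_out_new = h_out - 2 * p.pad_0;
  p.mid_out = w_out - 2 * pad;
  return p;
}
// ---------------------------------------------------------------------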
deal with w_out pad_0 column pre with bias - for (int i = 0; i < pad_cnt; i++) { - vst1q_f32(dout_ptr0, vbias_c); - vst1q_f32(dout_ptr1, vbias_c); - vst1q_f32(dout_ptr2, vbias_c); - vst1q_f32(dout_ptr3, vbias_c); - dout_ptr0 += 4; - dout_ptr1 += 4; - dout_ptr2 += 4; - dout_ptr3 += 4; - } - for (int i = 0; i < pad_remain; ++i) { - *dout_ptr0++ = bias_c; - *dout_ptr1++ = bias_c; - *dout_ptr2++ = bias_c; - *dout_ptr3++ = bias_c; - } - } else { - //! deal with w_out pad_0 column pre without bias - memset(dout_ptr0, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr1, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr2, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr3, 0x00, pad_0 * sizeof(float)); - dout_ptr0 += pad_0; - dout_ptr1 += pad_0; - dout_ptr2 += pad_0; - dout_ptr3 += pad_0; - } - //! deal with w_out pad_new column pre - switch (pad_new) { - case 4: - compute_four_out_extract_pre(dinl[0], - dinl[1], - dinl[2], - dinl[3], - dinl[4], - dinl[5], - dinl[6], - dinl[7], - dout_ptr0, - dout_ptr1, - dout_ptr2, - dout_ptr3, - weights_ptr, - vbias); - dout_ptr0 += 4; - dout_ptr1 += 4; - dout_ptr2 += 4; - dout_ptr3 += 4; - break; - case 3: - compute_three_out_extract_pre(dinl[0], - dinl[1], - dinl[2], - dinl[3], - dinl[4], - dinl[5], - dinl[6], - dinl[7], - dout_ptr0, - dout_ptr1, - dout_ptr2, - dout_ptr3, - weights_ptr, - vbias); - dout_ptr0 += 3; - dout_ptr1 += 3; - dout_ptr2 += 3; - dout_ptr3 += 3; - break; - case 2: - compute_two_out_extract_pre(dinl[0], - dinl[1], - dinl[2], - dinl[3], - dinl[4], - dinl[5], - dinl[6], - dinl[7], - dout_ptr0, - dout_ptr1, - dout_ptr2, - dout_ptr3, - weights_ptr, - vbias); - dout_ptr0 += 2; - dout_ptr1 += 2; - dout_ptr2 += 2; - dout_ptr3 += 2; - break; - case 1: - compute_one_out_extract_pre(dinl[0], - dinl[1], - dinl[2], - dinl[3], - dinl[4], - dinl[5], - dinl[6], - dinl[7], - dout_ptr0, - dout_ptr1, - dout_ptr2, - dout_ptr3, - weights_ptr, - vbias); - dout_ptr0 += 1; - dout_ptr1 += 1; - dout_ptr2 += 1; - dout_ptr3 += 1; - break; - } - //! mid loop - if (mid_cnt > 0) { - void* dinl_ptr = reinterpret_cast(dinl); - int mid_loop = mid_cnt; - asm volatile( - //! din: v7-v14 - //! dout: v15-v18 - "mov x0, #0 \n" - "mov x1, #4 \n" - "ldp x2, x3, [%[dinl]], #16 \n" - "ldp x4, x5, [%[dinl]], #16 \n" - "ldp x6, x7, [%[dinl]], #16 \n" - "ldp x8, x9, [%[dinl]], #16 \n" - - "ld1 {v7.4s} , [x2], x1 \n" - "ld1 {v8.4s} , [x3], x1 \n" - "ld1 {v9.4s} , [x4], x1 \n" - "ld1 {v10.4s}, [x5], x1 \n" - "ld1 {v11.4s}, [x6], x1 \n" - "ld1 {v12.4s}, [x7], x1 \n" - "ld1 {v13.4s}, [x8], x1 \n" - "ld1 {v14.4s}, [x9], x1 \n" - - //! load bias - "ld1 {v19.4s}, [%[bias]] \n" - - "1: \n" - //! add bias to output - "mov v15.16b, v19.16b \n" - "mov v16.16b, v19.16b \n" - "mov v17.16b, v19.16b \n" - "mov v18.16b, v19.16b \n" - - //! 
loop cnt is even, prefetch 64 Byte to l1 cache - "cmp x0, #1 \n" - "bne 2f \n" - "mov x0, #0 \n" - "prfm pldl1keep, [x2] \n" - "prfm pldl1keep, [x3] \n" - "prfm pldl1keep, [x4] \n" - "prfm pldl1keep, [x5] \n" - "prfm pldl1keep, [x6] \n" - "prfm pldl1keep, [x7] \n" - "prfm pldl1keep, [x8] \n" - "prfm pldl1keep, [x9] \n" - - "2: \n" - // weights col 0 - "fmla v15.4s, v7.4s , %[w0].s[0] \n" - "fmla v16.4s, v8.4s , %[w0].s[0] \n" - "fmla v17.4s, v9.4s , %[w0].s[0] \n" - "fmla v18.4s, v10.4s, %[w0].s[0] \n" - - "fmla v15.4s, v8.4s , %[w1].s[0] \n" - "fmla v16.4s, v9.4s , %[w1].s[0] \n" - "fmla v17.4s, v10.4s, %[w1].s[0] \n" - "fmla v18.4s, v11.4s, %[w1].s[0] \n" - - "ld1 {v7.4s}, [x2], x1 \n" - "ld1 {v8.4s}, [x3], x1 \n" - - "fmla v15.4s, v9.4s , %[w2].s[0] \n" - "fmla v16.4s, v10.4s, %[w2].s[0] \n" - "fmla v17.4s, v11.4s, %[w2].s[0] \n" - "fmla v18.4s, v12.4s, %[w2].s[0] \n" - - "fmla v15.4s, v10.4s, %[w3].s[0] \n" - "fmla v16.4s, v11.4s, %[w3].s[0] \n" - "fmla v17.4s, v12.4s, %[w3].s[0] \n" - "fmla v18.4s, v13.4s, %[w3].s[0] \n" - - "ld1 {v9.4s} , [x4], x1 \n" - "ld1 {v10.4s}, [x5], x1 \n" - - "fmla v15.4s, v11.4s, %[w4].s[0] \n" - "fmla v16.4s, v12.4s, %[w4].s[0] \n" - "fmla v17.4s, v13.4s, %[w4].s[0] \n" - "fmla v18.4s, v14.4s, %[w4].s[0] \n" - - "ld1 {v11.4s}, [x6], x1 \n" - "ld1 {v12.4s}, [x7], x1 \n" - - // weights col 1 - "fmla v15.4s, v7.4s , %[w0].s[1] \n" - "fmla v16.4s, v8.4s , %[w0].s[1] \n" - "fmla v17.4s, v9.4s , %[w0].s[1] \n" - "fmla v18.4s, v10.4s, %[w0].s[1] \n" - - "ld1 {v13.4s}, [x8], x1 \n" - "ld1 {v14.4s}, [x9], x1 \n" - - "fmla v15.4s, v8.4s , %[w1].s[1] \n" - "fmla v16.4s, v9.4s , %[w1].s[1] \n" - "fmla v17.4s, v10.4s, %[w1].s[1] \n" - "fmla v18.4s, v11.4s, %[w1].s[1] \n" - - "ld1 {v7.4s}, [x2], x1 \n" - "ld1 {v8.4s}, [x3], x1 \n" - - "fmla v15.4s, v9.4s , %[w2].s[1] \n" - "fmla v16.4s, v10.4s, %[w2].s[1] \n" - "fmla v17.4s, v11.4s, %[w2].s[1] \n" - "fmla v18.4s, v12.4s, %[w2].s[1] \n" - - "fmla v15.4s, v10.4s, %[w3].s[1] \n" - "fmla v16.4s, v11.4s, %[w3].s[1] \n" - "fmla v17.4s, v12.4s, %[w3].s[1] \n" - "fmla v18.4s, v13.4s, %[w3].s[1] \n" - - "ld1 {v9.4s} , [x4], x1 \n" - "ld1 {v10.4s}, [x5], x1 \n" - - "fmla v15.4s, v11.4s, %[w4].s[1] \n" - "fmla v16.4s, v12.4s, %[w4].s[1] \n" - "fmla v17.4s, v13.4s, %[w4].s[1] \n" - "fmla v18.4s, v14.4s, %[w4].s[1] \n" - - "ld1 {v11.4s}, [x6], x1 \n" - "ld1 {v12.4s}, [x7], x1 \n" - - // weights col 2 - "fmla v15.4s, v7.4s , %[w0].s[2] \n" - "fmla v16.4s, v8.4s , %[w0].s[2] \n" - "fmla v17.4s, v9.4s , %[w0].s[2] \n" - "fmla v18.4s, v10.4s, %[w0].s[2] \n" - - "ld1 {v13.4s}, [x8], x1 \n" - "ld1 {v14.4s}, [x9], x1 \n" - - "fmla v15.4s, v8.4s , %[w1].s[2] \n" - "fmla v16.4s, v9.4s , %[w1].s[2] \n" - "fmla v17.4s, v10.4s, %[w1].s[2] \n" - "fmla v18.4s, v11.4s, %[w1].s[2] \n" - - "ld1 {v7.4s}, [x2], x1 \n" - "ld1 {v8.4s}, [x3], x1 \n" - - "fmla v15.4s, v9.4s , %[w2].s[2] \n" - "fmla v16.4s, v10.4s, %[w2].s[2] \n" - "fmla v17.4s, v11.4s, %[w2].s[2] \n" - "fmla v18.4s, v12.4s, %[w2].s[2] \n" - - "fmla v15.4s, v10.4s, %[w3].s[2] \n" - "fmla v16.4s, v11.4s, %[w3].s[2] \n" - "fmla v17.4s, v12.4s, %[w3].s[2] \n" - "fmla v18.4s, v13.4s, %[w3].s[2] \n" - - "ld1 {v9.4s} , [x4], x1 \n" - "ld1 {v10.4s}, [x5], x1 \n" - - "fmla v15.4s, v11.4s, %[w4].s[2] \n" - "fmla v16.4s, v12.4s, %[w4].s[2] \n" - "fmla v17.4s, v13.4s, %[w4].s[2] \n" - "fmla v18.4s, v14.4s, %[w4].s[2] \n" - - "ld1 {v11.4s}, [x6], x1 \n" - "ld1 {v12.4s}, [x7], x1 \n" - - // weights col 3 - "fmla v15.4s, v7.4s , %[w0].s[3] \n" - "fmla v16.4s, v8.4s , %[w0].s[3] \n" - "fmla v17.4s, v9.4s 
, %[w0].s[3] \n" - "fmla v18.4s, v10.4s, %[w0].s[3] \n" - - "ld1 {v13.4s}, [x8], x1 \n" - "ld1 {v14.4s}, [x9], x1 \n" - - "fmla v15.4s, v8.4s , %[w1].s[3] \n" - "fmla v16.4s, v9.4s , %[w1].s[3] \n" - "fmla v17.4s, v10.4s, %[w1].s[3] \n" - "fmla v18.4s, v11.4s, %[w1].s[3] \n" - - "ld1 {v7.4s}, [x2], x1 \n" - "ld1 {v8.4s}, [x3], x1 \n" - - "fmla v15.4s, v9.4s , %[w2].s[3] \n" - "fmla v16.4s, v10.4s, %[w2].s[3] \n" - "fmla v17.4s, v11.4s, %[w2].s[3] \n" - "fmla v18.4s, v12.4s, %[w2].s[3] \n" - - "fmla v15.4s, v10.4s, %[w3].s[3] \n" - "fmla v16.4s, v11.4s, %[w3].s[3] \n" - "fmla v17.4s, v12.4s, %[w3].s[3] \n" - "fmla v18.4s, v13.4s, %[w3].s[3] \n" - - "ld1 {v9.4s} , [x4], x1 \n" - "ld1 {v10.4s}, [x5], x1 \n" - - "fmla v15.4s, v11.4s, %[w4].s[3] \n" - "fmla v16.4s, v12.4s, %[w4].s[3] \n" - "fmla v17.4s, v13.4s, %[w4].s[3] \n" - "fmla v18.4s, v14.4s, %[w4].s[3] \n" - - "ld1 {v11.4s}, [x6], x1 \n" - "ld1 {v12.4s}, [x7], x1 \n" - - // weights col 4 - "fmla v15.4s, v7.4s, %[w5].s[0] \n" - "fmla v16.4s, v8.4s, %[w5].s[0] \n" - "fmla v17.4s, v9.4s, %[w5].s[0] \n" - "fmla v18.4s, v10.4s, %[w5].s[0] \n" - - "ld1 {v13.4s}, [x8], x1 \n" - "ld1 {v14.4s}, [x9], x1 \n" - - "fmla v15.4s, v8.4s, %[w5].s[1] \n" - "fmla v16.4s, v9.4s, %[w5].s[1] \n" - "fmla v17.4s, v10.4s, %[w5].s[1] \n" - "fmla v18.4s, v11.4s, %[w5].s[1] \n" - - "fmla v15.4s, v9.4s , %[w5].s[2] \n" - "fmla v16.4s, v10.4s, %[w5].s[2] \n" - "fmla v17.4s, v11.4s, %[w5].s[2] \n" - "fmla v18.4s, v12.4s, %[w5].s[2] \n" - - "fmla v15.4s, v10.4s, %[w5].s[3] \n" - "fmla v16.4s, v11.4s, %[w5].s[3] \n" - "fmla v17.4s, v12.4s, %[w5].s[3] \n" - "fmla v18.4s, v13.4s, %[w5].s[3] \n" - - "fmla v15.4s, v11.4s, %[w6].s[0] \n" - "fmla v16.4s, v12.4s, %[w6].s[0] \n" - "fmla v17.4s, v13.4s, %[w6].s[0] \n" - "fmla v18.4s, v14.4s, %[w6].s[0] \n" - - "st1 {v15.4s}, [%[dout0]], #16 \n" - "st1 {v16.4s}, [%[dout1]], #16 \n" - "st1 {v17.4s}, [%[dout2]], #16 \n" - "st1 {v18.4s}, [%[dout3]], #16 \n" - - "subs %w[cnt], %w[cnt], #1 \n" - "add x0, x0, #1 \n" - "bne 1b \n" - - : [dout0] "+r"(dout_ptr0), - [dout1] "+r"(dout_ptr1), - [dout2] "+r"(dout_ptr2), - [dout3] "+r"(dout_ptr3), - [cnt] "+r"(mid_loop), - [dinl] "+r"(dinl_ptr) - : [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [w5] "w"(w5), - [w6] "w"(w6), - [bias] "r"(vbias) - : "cc", - "memory", - "x0", - "x1", - "x2", - "x3", - "x4", - "x5", - "x6", - "x7", - "x8", - "x9", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19"); - } - dinl[0] += 4 * mid_cnt; - dinl[1] += 4 * mid_cnt; - dinl[2] += 4 * mid_cnt; - dinl[3] += 4 * mid_cnt; - dinl[4] += 4 * mid_cnt; - dinl[5] += 4 * mid_cnt; - dinl[6] += 4 * mid_cnt; - dinl[7] += 4 * mid_cnt; - //! deal with mid remain - for (int i = 0; i < mid_remain; ++i) { - compute_one_out_without_extract(dinl[0], - dinl[1], - dinl[2], - dinl[3], - dinl[4], - dinl[5], - dinl[6], - dinl[7], - dout_ptr0, - dout_ptr1, - dout_ptr2, - dout_ptr3, - w0, - w1, - w2, - w3, - w4, - w5, - w6, - vbias); - dinl[0]++; - dinl[1]++; - dinl[2]++; - dinl[3]++; - dinl[4]++; - dinl[5]++; - dinl[6]++; - dinl[7]++; - - dout_ptr0++; - dout_ptr1++; - dout_ptr2++; - dout_ptr3++; - } - //! 
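// ---------------------------------------------------------------------
// What the unrolled mid-loop above produces, 4 columns x 4 rows per
// iteration (note the x1 = #4 post-increment: every ld1 advances one
// float, sliding the window across the 5 kernel columns). A scalar
// reference for a single interior output point (ours, for comparison
// and testing only):
static float conv5x5_point(const float* const din[5],  // 5 input rows
                           const float* w,             // 25 row-major taps
                           int x, float bias) {
  float s = bias;
  for (int r = 0; r < 5; ++r)
    for (int k = 0; k < 5; ++k)
      s += din[r][x + k] * w[5 * r + k];
  return s;
}
// ---------------------------------------------------------------------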
deal with w_out pad_new column post
-        switch (pad_new) {
-          case 4:
-            compute_four_out_extract_post(dinl[0], dinl[1], dinl[2], dinl[3],
-                                          dinl[4], dinl[5], dinl[6], dinl[7],
-                                          dout_ptr0, dout_ptr1, dout_ptr2,
-                                          dout_ptr3, w0, w1, w2, w3, w4, vbias);
-            dout_ptr0 += 4;
-            dout_ptr1 += 4;
-            dout_ptr2 += 4;
-            dout_ptr3 += 4;
-            break;
-          case 3:
-            compute_three_out_extract_post(dinl[0], dinl[1], dinl[2], dinl[3],
-                                           dinl[4], dinl[5], dinl[6], dinl[7],
-                                           dout_ptr0, dout_ptr1, dout_ptr2,
-                                           dout_ptr3, w0, w1, w2, w3, w4, vbias);
-            dout_ptr0 += 3;
-            dout_ptr1 += 3;
-            dout_ptr2 += 3;
-            dout_ptr3 += 3;
-            break;
-          case 2:
-            compute_two_out_extract_post(dinl[0], dinl[1], dinl[2], dinl[3],
-                                         dinl[4], dinl[5], dinl[6], dinl[7],
-                                         dout_ptr0, dout_ptr1, dout_ptr2,
-                                         dout_ptr3, w0, w1, w2, w3, w4, vbias);
-            dout_ptr0 += 2;
-            dout_ptr1 += 2;
-            dout_ptr2 += 2;
-            dout_ptr3 += 2;
-            break;
-          case 1:
-            compute_one_out_extract_post(dinl[0], dinl[1], dinl[2], dinl[3],
-                                         dinl[4], dinl[5], dinl[6], dinl[7],
-                                         dout_ptr0, dout_ptr1, dout_ptr2,
-                                         dout_ptr3, w0, w1, w2, w3, w4, vbias);
-            dout_ptr0 += 1;
-            dout_ptr1 += 1;
-            dout_ptr2 += 1;
-            dout_ptr3 += 1;
-            break;
-        }
-
-        if (flag_bias) {
-          //! deal with w_out pad_0 column post with bias
-          memcpy(dout_ptr0, dout0, pad_0 * sizeof(float));
-          memcpy(dout_ptr1, dout1, pad_0 * sizeof(float));
-          memcpy(dout_ptr2, dout2, pad_0 * sizeof(float));
-          memcpy(dout_ptr3, dout3, pad_0 * sizeof(float));
-        } else {
-          //! deal with w_out pad_0 column post without bias
-          memset(dout_ptr0, 0x00, pad_0 * sizeof(float));
-          memset(dout_ptr1, 0x00, pad_0 * sizeof(float));
-          memset(dout_ptr2, 0x00, pad_0 * sizeof(float));
-          memset(dout_ptr3, 0x00, pad_0 * sizeof(float));
-        }
-
-        din_list[0] = din_list[4];
-        din_list[1] = din_list[5];
-        din_list[2] = din_list[6];
-        din_list[3] = din_list[7];
-        din_list[4] = din_list[3] + w_in;
-        din_list[5] = din_list[4] + w_in;
-        din_list[6] = din_list[5] + w_in;
-        din_list[7] = din_list[6] + w_in;
-
-        dout0 = dout3 + w_out;
-        dout1 = dout0 + w_out;
-        dout2 = dout1 + w_out;
-        dout3 = dout2 + w_out;
-      }
-      float* dout_pad_end = dout_ch + h_out_new * w_out;
-      if (flag_bias) {
-        //! deal with h_out pad_0 line with bias
-        memcpy(reinterpret_cast<float*>(dout_pad_end),
-               dout_ch - pad_0 * w_out,
-               pad_0 * w_out * sizeof(float));
-      } else {
-        //! deal with h_out pad_0 line without bias
-        memset(reinterpret_cast<float*>(dout_pad_end),
-               0x00,
-               pad_0 * w_out * sizeof(float));
-      }
-    }
-  }
-}
-
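// A plain scalar rendering of what one channel of the 5x5, stride-1 depthwise
// convolution above computes, useful as a reference when validating the
// assembly kernels. This is a minimal sketch added for illustration:
// `conv_depthwise_5x5s1_ref` and its parameter names are hypothetical, not
// identifiers from this file, and padding is assumed to be handled by the
// caller (the input is already framed so only the "valid" region is read).
static void conv_depthwise_5x5s1_ref(const float* din, float* dout, int h_in,
                                     int w_in, const float* w25, float bias_v,
                                     bool relu) {
  int h_out = h_in - 4;  // 5x5 window, stride 1, no implicit padding
  int w_out = w_in - 4;
  for (int oh = 0; oh < h_out; ++oh) {
    for (int ow = 0; ow < w_out; ++ow) {
      float sum = bias_v;  // bias folded into the accumulator, as in the asm
      for (int kh = 0; kh < 5; ++kh) {
        for (int kw = 0; kw < 5; ++kw) {
          sum += din[(oh + kh) * w_in + (ow + kw)] * w25[kh * 5 + kw];
        }
      }
      dout[oh * w_out + ow] = relu ? (sum > 0.f ? sum : 0.f) : sum;
    }
  }
}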
-void conv_depthwise_5x5s1_relu_impl(const float* din,
-                                    float* dout,
-                                    int num,
-                                    int ch_out,
-                                    int h_out,
-                                    int w_out,
-                                    int ch_in,
-                                    int h_in,
-                                    int w_in,
-                                    const float* weights,
-                                    const float* bias,
-                                    int pad,
-                                    bool flag_bias,
-                                    bool flag_relu,
-                                    ARMContext* ctx) {
-  float* zero_ptr = ctx->workspace_data<float>();
-  memset(zero_ptr, 0, w_in * sizeof(float));
-  float* write_ptr = zero_ptr + w_in;
-  int pad_new = pad > 4 ? 4 : pad;
-  int pad_0 = pad - pad_new;
-  int h_out_new = h_out - 2 * pad_0;
-  int mid_out = w_out - 2 * pad;
-  int mid_cnt = mid_out >> 2;
-  int mid_remain = mid_out - (mid_cnt << 2);
-  int pad_cnt = pad_0 >> 2;
-  int pad_remain = pad_0 - (pad_cnt << 2);
-  int bias_cnt = (w_out * pad_0) >> 2;
-  int bias_remain = (w_out * pad_0) - (bias_cnt << 2);
-  int in_spatial_size = w_in * h_in;
-  int out_spatial_size = w_out * h_out;
-  int weights_spatial_size = 25;
-
-  for (int n = 0; n < num; ++n) {
-    const float* din_batch = din + n * in_spatial_size * ch_in;
-    float* dout_batch = dout + n * out_spatial_size * ch_out;
-#pragma omp parallel for
-    for (int c = 0; c < ch_in; ++c) {
-      const float* din_ch = din_batch + c * in_spatial_size;
-      float* dout_ch = dout_batch + c * out_spatial_size;
-      float bias_c = flag_bias ? bias[c] : 0.f;
-      float bias_relu = bias_c > 0.f ? bias_c : 0.f;
-      float vbias[4] = {bias_c, bias_c, bias_c, bias_c};
-      float32x4_t vbias_c = vdupq_n_f32(bias_relu);
-      if (flag_bias) {
-        //! deal with h_out pad_0 line with bias
-        for (int i = 0; i < bias_cnt; ++i) {
-          vst1q_f32(dout_ch, vbias_c);
-          dout_ch += 4;
-        }
-        for (int i = 0; i < bias_remain; ++i) {
-          *dout_ch++ = bias_relu;
-        }
-      } else {
-        //! deal with h_out pad_0 line without bias
-        for (int i = 0; i < pad_0; ++i) {
-          memset(dout_ch, 0x00, w_out * sizeof(float));
-          dout_ch += w_out;
-        }
-      }
-      const float* din_list[8];
-      const float* dinl[8];
-      //! set din ptr with zero buffer
-      for (int i = 0; i < pad_new; ++i) {
-        din_list[i] = zero_ptr;
-      }
-      //! set din ptr with input data
-      for (int i = pad_new; i < 8; ++i) {
-        din_list[i] = din_ch;
-        din_ch += w_in;
-      }
-
-      //! every h loop, deal with 4 line output
-      float* dout0 = dout_ch;
-      float* dout1 = dout0 + w_out;
-      float* dout2 = dout1 + w_out;
-      float* dout3 = dout2 + w_out;
-
-      //! load weights to neon register
-      const float* weights_c = weights + c * weights_spatial_size;
-
-      float32x4_t w5;
-      float32x4_t w6;
-      float32x4_t w0 = vld1q_f32(weights_c);
-      float32x4_t w1 = vld1q_f32(weights_c + 5);
-      float32x4_t w2 = vld1q_f32(weights_c + 10);
-      float32x4_t w3 = vld1q_f32(weights_c + 15);
-      float32x4_t w4 = vld1q_f32(weights_c + 20);
-      w5 = vsetq_lane_f32(weights_c[4], w5, 0);
-      w5 = vsetq_lane_f32(weights_c[9], w5, 1);
-      w5 = vsetq_lane_f32(weights_c[14], w5, 2);
-      w5 = vsetq_lane_f32(weights_c[19], w5, 3);
-      w6 = vsetq_lane_f32(weights_c[24], w6, 0);
-
-      //! h loop
-      for (int h = 0; h < h_out_new; h += 4) {
-        //! (h - pad_new) + 7 > h_in - 1
-        if (h + 8 - pad_new > h_in) {
-          switch (h + 8 - pad_new - h_in) {
-            case 7:
-              din_list[1] = zero_ptr;
-            case 6:
-              din_list[2] = zero_ptr;
-            case 5:
-              din_list[3] = zero_ptr;
-            case 4:
-              din_list[4] = zero_ptr;
-            case 3:
-              din_list[5] = zero_ptr;
-            case 2:
-              din_list[6] = zero_ptr;
-            case 1:
-              din_list[7] = zero_ptr;
-            default:
-              break;
-          }
-        }
-        if (h + 4 > h_out_new) {
-          switch (h + 4 - h_out_new) {
-            case 3:
-              dout1 = write_ptr;
-            case 2:
-              dout2 = write_ptr;
-            case 1:
-              dout3 = write_ptr;
-            default:
-              break;
-          }
-        }
-
-        //! every h loop, deal with 8 line input
-        dinl[0] = din_list[0];
-        dinl[1] = din_list[1];
-        dinl[2] = din_list[2];
-        dinl[3] = din_list[3];
-        dinl[4] = din_list[4];
-        dinl[5] = din_list[5];
-        dinl[6] = din_list[6];
-        dinl[7] = din_list[7];
-
-        const float* weights_ptr = weights_c;
-        float* dout_ptr0 = dout0;
-        float* dout_ptr1 = dout1;
-        float* dout_ptr2 = dout2;
-        float* dout_ptr3 = dout3;
-        if (flag_bias) {
-          //! deal with w_out pad_0 column pre with bias
-          for (int i = 0; i < pad_cnt; i++) {
-            vst1q_f32(dout_ptr0, vbias_c);
-            vst1q_f32(dout_ptr1, vbias_c);
-            vst1q_f32(dout_ptr2, vbias_c);
-            vst1q_f32(dout_ptr3, vbias_c);
-            dout_ptr0 += 4;
-            dout_ptr1 += 4;
-            dout_ptr2 += 4;
-            dout_ptr3 += 4;
-          }
-          for (int i = 0; i < pad_remain; ++i) {
-            *dout_ptr0++ = bias_relu;
-            *dout_ptr1++ = bias_relu;
-            *dout_ptr2++ = bias_relu;
-            *dout_ptr3++ = bias_relu;
-          }
-        } else {
-          //! deal with w_out pad_0 column pre without bias
-          memset(dout_ptr0, 0x00, pad_0 * sizeof(float));
-          memset(dout_ptr1, 0x00, pad_0 * sizeof(float));
-          memset(dout_ptr2, 0x00, pad_0 * sizeof(float));
-          memset(dout_ptr3, 0x00, pad_0 * sizeof(float));
-          dout_ptr0 += pad_0;
-          dout_ptr1 += pad_0;
-          dout_ptr2 += pad_0;
-          dout_ptr3 += pad_0;
-        }
-        //! deal with w_out pad_new column pre
-        switch (pad_new) {
-          case 4:
-            compute_four_out_extract_pre_relu(dinl[0], dinl[1], dinl[2],
-                                              dinl[3], dinl[4], dinl[5],
-                                              dinl[6], dinl[7], dout_ptr0,
-                                              dout_ptr1, dout_ptr2, dout_ptr3,
-                                              weights_ptr, vbias);
-            dout_ptr0 += 4;
-            dout_ptr1 += 4;
-            dout_ptr2 += 4;
-            dout_ptr3 += 4;
-            break;
-          case 3:
-            compute_three_out_extract_pre_relu(dinl[0], dinl[1], dinl[2],
-                                               dinl[3], dinl[4], dinl[5],
-                                               dinl[6], dinl[7], dout_ptr0,
-                                               dout_ptr1, dout_ptr2, dout_ptr3,
-                                               weights_ptr, vbias);
-            dout_ptr0 += 3;
-            dout_ptr1 += 3;
-            dout_ptr2 += 3;
-            dout_ptr3 += 3;
-            break;
-          case 2:
-            compute_two_out_extract_pre_relu(dinl[0], dinl[1], dinl[2],
-                                             dinl[3], dinl[4], dinl[5],
-                                             dinl[6], dinl[7], dout_ptr0,
-                                             dout_ptr1, dout_ptr2, dout_ptr3,
-                                             weights_ptr, vbias);
-            dout_ptr0 += 2;
-            dout_ptr1 += 2;
-            dout_ptr2 += 2;
-            dout_ptr3 += 2;
-            break;
-          case 1:
-            compute_one_out_extract_pre_relu(dinl[0], dinl[1], dinl[2],
-                                             dinl[3], dinl[4], dinl[5],
-                                             dinl[6], dinl[7], dout_ptr0,
-                                             dout_ptr1, dout_ptr2, dout_ptr3,
-                                             weights_ptr, vbias);
-            dout_ptr0 += 1;
-            dout_ptr1 += 1;
-            dout_ptr2 += 1;
-            dout_ptr3 += 1;
-            break;
-        }
-        //! mid loop
-        if (mid_cnt > 0) {
-          void* dinl_ptr = reinterpret_cast<void*>(dinl);
-          int mid_loop = mid_cnt;
-          asm volatile(
-              //! din: v7-v14
-              //! dout: v15-v18
-              "mov x0, #0                   \n"
-              "mov x1, #4                   \n"
-              "movi v31.4s, #0              \n"
-              "ldp x2, x3, [%[dinl]], #16   \n"
-              "ldp x4, x5, [%[dinl]], #16   \n"
-              "ldp x6, x7, [%[dinl]], #16   \n"
-              "ldp x8, x9, [%[dinl]], #16   \n"
-
-              "ld1 {v7.4s} , [x2], x1       \n"
-              "ld1 {v8.4s} , [x3], x1       \n"
-              "ld1 {v9.4s} , [x4], x1       \n"
-              "ld1 {v10.4s}, [x5], x1       \n"
-              "ld1 {v11.4s}, [x6], x1       \n"
-              "ld1 {v12.4s}, [x7], x1       \n"
-              "ld1 {v13.4s}, [x8], x1       \n"
-              "ld1 {v14.4s}, [x9], x1       \n"
-
-              //! load bias
-              "ld1 {v19.4s}, [%[bias]]      \n"
-
-              "1:                           \n"
-              //! add bias to output
-              "mov v15.16b, v19.16b         \n"
-              "mov v16.16b, v19.16b         \n"
-              "mov v17.16b, v19.16b         \n"
-              "mov v18.16b, v19.16b         \n"
-
-              //!
loop cnt is even, prefetch 64 Byte to l1 cache - "cmp x0, #1 \n" - "bne 2f \n" - "mov x0, #0 \n" - "prfm pldl1keep, [x2] \n" - "prfm pldl1keep, [x3] \n" - "prfm pldl1keep, [x4] \n" - "prfm pldl1keep, [x5] \n" - "prfm pldl1keep, [x6] \n" - "prfm pldl1keep, [x7] \n" - "prfm pldl1keep, [x8] \n" - "prfm pldl1keep, [x9] \n" - - "2: \n" - // weights col 0 - "fmla v15.4s, v7.4s , %[w0].s[0] \n" - "fmla v16.4s, v8.4s , %[w0].s[0] \n" - "fmla v17.4s, v9.4s , %[w0].s[0] \n" - "fmla v18.4s, v10.4s, %[w0].s[0] \n" - - "fmla v15.4s, v8.4s , %[w1].s[0] \n" - "fmla v16.4s, v9.4s , %[w1].s[0] \n" - "fmla v17.4s, v10.4s, %[w1].s[0] \n" - "fmla v18.4s, v11.4s, %[w1].s[0] \n" - - "ld1 {v7.4s}, [x2], x1 \n" - "ld1 {v8.4s}, [x3], x1 \n" - - "fmla v15.4s, v9.4s , %[w2].s[0] \n" - "fmla v16.4s, v10.4s, %[w2].s[0] \n" - "fmla v17.4s, v11.4s, %[w2].s[0] \n" - "fmla v18.4s, v12.4s, %[w2].s[0] \n" - - "fmla v15.4s, v10.4s, %[w3].s[0] \n" - "fmla v16.4s, v11.4s, %[w3].s[0] \n" - "fmla v17.4s, v12.4s, %[w3].s[0] \n" - "fmla v18.4s, v13.4s, %[w3].s[0] \n" - - "ld1 {v9.4s} , [x4], x1 \n" - "ld1 {v10.4s}, [x5], x1 \n" - - "fmla v15.4s, v11.4s, %[w4].s[0] \n" - "fmla v16.4s, v12.4s, %[w4].s[0] \n" - "fmla v17.4s, v13.4s, %[w4].s[0] \n" - "fmla v18.4s, v14.4s, %[w4].s[0] \n" - - "ld1 {v11.4s}, [x6], x1 \n" - "ld1 {v12.4s}, [x7], x1 \n" - - // weights col 1 - "fmla v15.4s, v7.4s , %[w0].s[1] \n" - "fmla v16.4s, v8.4s , %[w0].s[1] \n" - "fmla v17.4s, v9.4s , %[w0].s[1] \n" - "fmla v18.4s, v10.4s, %[w0].s[1] \n" - - "ld1 {v13.4s}, [x8], x1 \n" - "ld1 {v14.4s}, [x9], x1 \n" - - "fmla v15.4s, v8.4s , %[w1].s[1] \n" - "fmla v16.4s, v9.4s , %[w1].s[1] \n" - "fmla v17.4s, v10.4s, %[w1].s[1] \n" - "fmla v18.4s, v11.4s, %[w1].s[1] \n" - - "ld1 {v7.4s}, [x2], x1 \n" - "ld1 {v8.4s}, [x3], x1 \n" - - "fmla v15.4s, v9.4s , %[w2].s[1] \n" - "fmla v16.4s, v10.4s, %[w2].s[1] \n" - "fmla v17.4s, v11.4s, %[w2].s[1] \n" - "fmla v18.4s, v12.4s, %[w2].s[1] \n" - - "fmla v15.4s, v10.4s, %[w3].s[1] \n" - "fmla v16.4s, v11.4s, %[w3].s[1] \n" - "fmla v17.4s, v12.4s, %[w3].s[1] \n" - "fmla v18.4s, v13.4s, %[w3].s[1] \n" - - "ld1 {v9.4s} , [x4], x1 \n" - "ld1 {v10.4s}, [x5], x1 \n" - - "fmla v15.4s, v11.4s, %[w4].s[1] \n" - "fmla v16.4s, v12.4s, %[w4].s[1] \n" - "fmla v17.4s, v13.4s, %[w4].s[1] \n" - "fmla v18.4s, v14.4s, %[w4].s[1] \n" - - "ld1 {v11.4s}, [x6], x1 \n" - "ld1 {v12.4s}, [x7], x1 \n" - - // weights col 2 - "fmla v15.4s, v7.4s , %[w0].s[2] \n" - "fmla v16.4s, v8.4s , %[w0].s[2] \n" - "fmla v17.4s, v9.4s , %[w0].s[2] \n" - "fmla v18.4s, v10.4s, %[w0].s[2] \n" - - "ld1 {v13.4s}, [x8], x1 \n" - "ld1 {v14.4s}, [x9], x1 \n" - - "fmla v15.4s, v8.4s , %[w1].s[2] \n" - "fmla v16.4s, v9.4s , %[w1].s[2] \n" - "fmla v17.4s, v10.4s, %[w1].s[2] \n" - "fmla v18.4s, v11.4s, %[w1].s[2] \n" - - "ld1 {v7.4s}, [x2], x1 \n" - "ld1 {v8.4s}, [x3], x1 \n" - - "fmla v15.4s, v9.4s , %[w2].s[2] \n" - "fmla v16.4s, v10.4s, %[w2].s[2] \n" - "fmla v17.4s, v11.4s, %[w2].s[2] \n" - "fmla v18.4s, v12.4s, %[w2].s[2] \n" - - "fmla v15.4s, v10.4s, %[w3].s[2] \n" - "fmla v16.4s, v11.4s, %[w3].s[2] \n" - "fmla v17.4s, v12.4s, %[w3].s[2] \n" - "fmla v18.4s, v13.4s, %[w3].s[2] \n" - - "ld1 {v9.4s} , [x4], x1 \n" - "ld1 {v10.4s}, [x5], x1 \n" - - "fmla v15.4s, v11.4s, %[w4].s[2] \n" - "fmla v16.4s, v12.4s, %[w4].s[2] \n" - "fmla v17.4s, v13.4s, %[w4].s[2] \n" - "fmla v18.4s, v14.4s, %[w4].s[2] \n" - - "ld1 {v11.4s}, [x6], x1 \n" - "ld1 {v12.4s}, [x7], x1 \n" - - // weights col 3 - "fmla v15.4s, v7.4s , %[w0].s[3] \n" - "fmla v16.4s, v8.4s , %[w0].s[3] \n" - "fmla v17.4s, v9.4s 
, %[w0].s[3] \n" - "fmla v18.4s, v10.4s, %[w0].s[3] \n" - - "ld1 {v13.4s}, [x8], x1 \n" - "ld1 {v14.4s}, [x9], x1 \n" - - "fmla v15.4s, v8.4s , %[w1].s[3] \n" - "fmla v16.4s, v9.4s , %[w1].s[3] \n" - "fmla v17.4s, v10.4s, %[w1].s[3] \n" - "fmla v18.4s, v11.4s, %[w1].s[3] \n" - - "ld1 {v7.4s}, [x2], x1 \n" - "ld1 {v8.4s}, [x3], x1 \n" - - "fmla v15.4s, v9.4s , %[w2].s[3] \n" - "fmla v16.4s, v10.4s, %[w2].s[3] \n" - "fmla v17.4s, v11.4s, %[w2].s[3] \n" - "fmla v18.4s, v12.4s, %[w2].s[3] \n" - - "fmla v15.4s, v10.4s, %[w3].s[3] \n" - "fmla v16.4s, v11.4s, %[w3].s[3] \n" - "fmla v17.4s, v12.4s, %[w3].s[3] \n" - "fmla v18.4s, v13.4s, %[w3].s[3] \n" - - "ld1 {v9.4s} , [x4], x1 \n" - "ld1 {v10.4s}, [x5], x1 \n" - - "fmla v15.4s, v11.4s, %[w4].s[3] \n" - "fmla v16.4s, v12.4s, %[w4].s[3] \n" - "fmla v17.4s, v13.4s, %[w4].s[3] \n" - "fmla v18.4s, v14.4s, %[w4].s[3] \n" - - "ld1 {v11.4s}, [x6], x1 \n" - "ld1 {v12.4s}, [x7], x1 \n" - - // weights col 4 - "fmla v15.4s, v7.4s, %[w5].s[0] \n" - "fmla v16.4s, v8.4s, %[w5].s[0] \n" - "fmla v17.4s, v9.4s, %[w5].s[0] \n" - "fmla v18.4s, v10.4s, %[w5].s[0] \n" - - "ld1 {v13.4s}, [x8], x1 \n" - "ld1 {v14.4s}, [x9], x1 \n" - - "fmla v15.4s, v8.4s, %[w5].s[1] \n" - "fmla v16.4s, v9.4s, %[w5].s[1] \n" - "fmla v17.4s, v10.4s, %[w5].s[1] \n" - "fmla v18.4s, v11.4s, %[w5].s[1] \n" - - "fmla v15.4s, v9.4s , %[w5].s[2] \n" - "fmla v16.4s, v10.4s, %[w5].s[2] \n" - "fmla v17.4s, v11.4s, %[w5].s[2] \n" - "fmla v18.4s, v12.4s, %[w5].s[2] \n" - - "fmla v15.4s, v10.4s, %[w5].s[3] \n" - "fmla v16.4s, v11.4s, %[w5].s[3] \n" - "fmla v17.4s, v12.4s, %[w5].s[3] \n" - "fmla v18.4s, v13.4s, %[w5].s[3] \n" - - "fmla v15.4s, v11.4s, %[w6].s[0] \n" - "fmla v16.4s, v12.4s, %[w6].s[0] \n" - "fmla v17.4s, v13.4s, %[w6].s[0] \n" - "fmla v18.4s, v14.4s, %[w6].s[0] \n" - - "fmax v15.4s, v15.4s, v31.4s \n" - "fmax v16.4s, v16.4s, v31.4s \n" - "fmax v17.4s, v17.4s, v31.4s \n" - "fmax v18.4s, v18.4s, v31.4s \n" - - "st1 {v15.4s}, [%[dout0]], #16 \n" - "st1 {v16.4s}, [%[dout1]], #16 \n" - "st1 {v17.4s}, [%[dout2]], #16 \n" - "st1 {v18.4s}, [%[dout3]], #16 \n" - - "subs %w[cnt], %w[cnt], #1 \n" - "add x0, x0, #1 \n" - "bne 1b \n" - - : [dout0] "+r"(dout_ptr0), - [dout1] "+r"(dout_ptr1), - [dout2] "+r"(dout_ptr2), - [dout3] "+r"(dout_ptr3), - [cnt] "+r"(mid_loop), - [dinl] "+r"(dinl_ptr) - : [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [w5] "w"(w5), - [w6] "w"(w6), - [bias] "r"(vbias) - : "cc", - "memory", - "x0", - "x1", - "x2", - "x3", - "x4", - "x5", - "x6", - "x7", - "x8", - "x9", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v31"); - } - dinl[0] += 4 * mid_cnt; - dinl[1] += 4 * mid_cnt; - dinl[2] += 4 * mid_cnt; - dinl[3] += 4 * mid_cnt; - dinl[4] += 4 * mid_cnt; - dinl[5] += 4 * mid_cnt; - dinl[6] += 4 * mid_cnt; - dinl[7] += 4 * mid_cnt; - //! deal with mid remain - for (int i = 0; i < mid_remain; ++i) { - compute_one_out_without_extract_relu(dinl[0], - dinl[1], - dinl[2], - dinl[3], - dinl[4], - dinl[5], - dinl[6], - dinl[7], - dout_ptr0, - dout_ptr1, - dout_ptr2, - dout_ptr3, - w0, - w1, - w2, - w3, - w4, - w5, - w6, - vbias); - dinl[0]++; - dinl[1]++; - dinl[2]++; - dinl[3]++; - dinl[4]++; - dinl[5]++; - dinl[6]++; - dinl[7]++; - - dout_ptr0++; - dout_ptr1++; - dout_ptr2++; - dout_ptr3++; - } - //! 
deal with w_out pad_new column post
-        switch (pad_new) {
-          case 4:
-            compute_four_out_extract_post_relu(dinl[0], dinl[1], dinl[2],
-                                               dinl[3], dinl[4], dinl[5],
-                                               dinl[6], dinl[7], dout_ptr0,
-                                               dout_ptr1, dout_ptr2, dout_ptr3,
-                                               w0, w1, w2, w3, w4, vbias);
-            dout_ptr0 += 4;
-            dout_ptr1 += 4;
-            dout_ptr2 += 4;
-            dout_ptr3 += 4;
-            break;
-          case 3:
-            compute_three_out_extract_post_relu(dinl[0], dinl[1], dinl[2],
-                                                dinl[3], dinl[4], dinl[5],
-                                                dinl[6], dinl[7], dout_ptr0,
-                                                dout_ptr1, dout_ptr2, dout_ptr3,
-                                                w0, w1, w2, w3, w4, vbias);
-            dout_ptr0 += 3;
-            dout_ptr1 += 3;
-            dout_ptr2 += 3;
-            dout_ptr3 += 3;
-            break;
-          case 2:
-            compute_two_out_extract_post_relu(dinl[0], dinl[1], dinl[2],
-                                              dinl[3], dinl[4], dinl[5],
-                                              dinl[6], dinl[7], dout_ptr0,
-                                              dout_ptr1, dout_ptr2, dout_ptr3,
-                                              w0, w1, w2, w3, w4, vbias);
-            dout_ptr0 += 2;
-            dout_ptr1 += 2;
-            dout_ptr2 += 2;
-            dout_ptr3 += 2;
-            break;
-          case 1:
-            compute_one_out_extract_post_relu(dinl[0], dinl[1], dinl[2],
-                                              dinl[3], dinl[4], dinl[5],
-                                              dinl[6], dinl[7], dout_ptr0,
-                                              dout_ptr1, dout_ptr2, dout_ptr3,
-                                              w0, w1, w2, w3, w4, vbias);
-            dout_ptr0 += 1;
-            dout_ptr1 += 1;
-            dout_ptr2 += 1;
-            dout_ptr3 += 1;
-            break;
-        }
-
-        if (flag_bias) {
-          //! deal with w_out pad_0 column post with bias
-          memcpy(dout_ptr0, dout0, pad_0 * sizeof(float));
-          memcpy(dout_ptr1, dout1, pad_0 * sizeof(float));
-          memcpy(dout_ptr2, dout2, pad_0 * sizeof(float));
-          memcpy(dout_ptr3, dout3, pad_0 * sizeof(float));
-        } else {
-          //! deal with w_out pad_0 column post without bias
-          memset(dout_ptr0, 0x00, pad_0 * sizeof(float));
-          memset(dout_ptr1, 0x00, pad_0 * sizeof(float));
-          memset(dout_ptr2, 0x00, pad_0 * sizeof(float));
-          memset(dout_ptr3, 0x00, pad_0 * sizeof(float));
-        }
-
-        din_list[0] = din_list[4];
-        din_list[1] = din_list[5];
-        din_list[2] = din_list[6];
-        din_list[3] = din_list[7];
-        din_list[4] = din_list[3] + w_in;
-        din_list[5] = din_list[4] + w_in;
-        din_list[6] = din_list[5] + w_in;
-        din_list[7] = din_list[6] + w_in;
-
-        dout0 = dout3 + w_out;
-        dout1 = dout0 + w_out;
-        dout2 = dout1 + w_out;
-        dout3 = dout2 + w_out;
-      }
-      float* dout_pad_end = dout_ch + h_out_new * w_out;
-      if (flag_bias) {
-        //! deal with h_out pad_0 line with bias
-        memcpy(reinterpret_cast<float*>(dout_pad_end),
-               dout_ch - pad_0 * w_out,
-               pad_0 * w_out * sizeof(float));
-      } else {
-        //! deal with h_out pad_0 line without bias
-        memset(reinterpret_cast<float*>(dout_pad_end),
-               0x00,
-               pad_0 * w_out * sizeof(float));
-      }
-    }
-  }
-}
-
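// The _relu variants above differ from the plain kernels only in the epilogue:
// the accumulators, which start out holding the broadcast bias (v19), pass
// through an fmax against a zero vector (v31) before the st1 stores, fusing
// ReLU into the same pass over the data. The same step in intrinsics form — a
// minimal sketch assuming <arm_neon.h>, with `store4_bias_relu` a hypothetical
// name, not an identifier from this file:
static inline void store4_bias_relu(float* dst, float32x4_t acc) {
  // acc already includes the bias, mirroring "mov v15.16b, v19.16b" at loop
  // entry in the assembly block
  float32x4_t vzero = vdupq_n_f32(0.f);   // "movi v31.4s, #0"
  vst1q_f32(dst, vmaxq_f32(acc, vzero));  // "fmax" then "st1"
}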
-void conv_depthwise_5x5s1_small_impl(const float* din,
-                                     float* dout,
-                                     int num,
-                                     int ch_out,
-                                     int h_out,
-                                     int w_out,
-                                     int ch_in,
-                                     int h_in,
-                                     int w_in,
-                                     const float* weights,
-                                     const float* bias,
-                                     int pad,
-                                     bool flag_bias,
-                                     bool flag_relu,
-                                     ARMContext* ctx) {
-  int pad_new = pad > 4 ? 4 : pad;
-  int pad_0 = pad - pad_new;
-  int h_in_new = h_in + 2 * pad_new;
-  int w_in_new = w_in + 2 * pad_new;
-  int h_out_new = h_out - 2 * pad_0;
-  int w_out_new = w_out - 2 * pad_0;
-  float zero_ptr[w_in_new + w_out];
-  memset(zero_ptr, 0, w_in_new * sizeof(float));
-  float* write_ptr = zero_ptr + w_in_new;
-  int pad_cnt = pad_0 >> 2;
-  int pad_remain = pad_0 - (pad_cnt << 2);
-  int bias_cnt = (w_out * pad_0) >> 2;
-  int bias_remain = (w_out * pad_0) - (bias_cnt << 2);
-  int in_spatial_size = w_in_new * h_in_new;
-  int out_spatial_size = w_out * h_out;
-  int weights_spatial_size = 25;
-
-  float* din_new = prepad_input(din, num, ch_in, h_in, w_in, pad_new);
-  for (int n = 0; n < num; ++n) {
-    const float* din_batch = din_new + n * in_spatial_size * ch_in;
-    float* dout_batch = dout + n * out_spatial_size * ch_out;
-#pragma omp parallel for
-    for (int c = 0; c < ch_in; ++c) {
-      const float* din_ch = din_batch + c * in_spatial_size;
-      float* dout_ch = dout_batch + c * out_spatial_size;
-      float bias_c = flag_bias ? bias[c] : 0.f;
-      float vbias[4] = {bias_c, bias_c, bias_c, bias_c};
-      float32x4_t vbias_c = vdupq_n_f32(bias_c);
-      if (flag_bias) {
-        //! deal with h_out pad_0 line with bias
-        for (int i = 0; i < bias_cnt; ++i) {
-          vst1q_f32(dout_ch, vbias_c);
-          dout_ch += 4;
-        }
-        for (int i = 0; i < bias_remain; ++i) {
-          *dout_ch++ = bias_c;
-        }
-      } else {
-        //! deal with h_out pad_0 line without bias
-        for (int i = 0; i < pad_0; ++i) {
-          memset(dout_ch, 0x00, w_out * sizeof(float));
-          dout_ch += w_out;
-        }
-      }
-      //! every h loop, deal with 8 line input
-      const float* din0 = din_ch;
-      const float* din1 = din0 + w_in_new;
-      const float* din2 = din1 + w_in_new;
-      const float* din3 = din2 + w_in_new;
-      const float* din4 = din3 + w_in_new;
-      const float* din5 = din4 + w_in_new;
-      const float* din6 = din5 + w_in_new;
-      const float* din7 = din6 + w_in_new;
-      //! every h loop, deal with 4 line output
-      float* dout0 = dout_ch;
-      float* dout1 = dout0 + w_out;
-      float* dout2 = dout1 + w_out;
-      float* dout3 = dout2 + w_out;
-
-      //! load weights to neon register
-      const float* weights_c = weights + c * weights_spatial_size;
-
-      float32x4_t w5;
-      float32x4_t w6;
-      float32x4_t w0 = vld1q_f32(weights_c);
-      float32x4_t w1 = vld1q_f32(weights_c + 5);
-      float32x4_t w2 = vld1q_f32(weights_c + 10);
-      float32x4_t w3 = vld1q_f32(weights_c + 15);
-      float32x4_t w4 = vld1q_f32(weights_c + 20);
-      w5 = vsetq_lane_f32(weights_c[4], w5, 0);
-      w5 = vsetq_lane_f32(weights_c[9], w5, 1);
-      w5 = vsetq_lane_f32(weights_c[14], w5, 2);
-      w5 = vsetq_lane_f32(weights_c[19], w5, 3);
-      w6 = vsetq_lane_f32(weights_c[24], w6, 0);
-      //! h loop
-      for (int h = 0; h < h_out_new; h += 4) {
-        //! (h - pad_new) + 7 > h_in - 1
-        if (h + 8 > h_in_new) {
-          switch (h + 8 - h_in_new) {
-            case 7:
-              din1 = zero_ptr;
-            case 6:
-              din2 = zero_ptr;
-            case 5:
-              din3 = zero_ptr;
-            case 4:
-              din4 = zero_ptr;
-            case 3:
-              din5 = zero_ptr;
-            case 2:
-              din6 = zero_ptr;
-            case 1:
-              din7 = zero_ptr;
-            default:
-              break;
-          }
-        }
-        if (h + 4 > h_out_new) {
-          switch (h + 4 - h_out_new) {
-            case 3:
-              dout1 = write_ptr;
-            case 2:
-              dout2 = write_ptr;
-            case 1:
-              dout3 = write_ptr;
-            default:
-              break;
-          }
-        }
-        const float* din_ptr0 = din0;
-        const float* din_ptr1 = din1;
-        const float* din_ptr2 = din2;
-        const float* din_ptr3 = din3;
-        const float* din_ptr4 = din4;
-        const float* din_ptr5 = din5;
-        const float* din_ptr6 = din6;
-        const float* din_ptr7 = din7;
-
-        const float* weights_ptr = weights_c;
-        float* dout_ptr0 = dout0;
-        float* dout_ptr1 = dout1;
-        float* dout_ptr2 = dout2;
-        float* dout_ptr3 = dout3;
-
-        if (flag_bias) {
-          //! deal with w_out pad_0 column pre with bias
-          for (int i = 0; i < pad_cnt; i++) {
-            vst1q_f32(dout_ptr0, vbias_c);
-            vst1q_f32(dout_ptr1, vbias_c);
-            vst1q_f32(dout_ptr2, vbias_c);
-            vst1q_f32(dout_ptr3, vbias_c);
-            dout_ptr0 += 4;
-            dout_ptr1 += 4;
-            dout_ptr2 += 4;
-            dout_ptr3 += 4;
-          }
-          for (int i = 0; i < pad_remain; ++i) {
-            *dout_ptr0++ = bias_c;
-            *dout_ptr1++ = bias_c;
-            *dout_ptr2++ = bias_c;
-            *dout_ptr3++ = bias_c;
-          }
-        } else {
-          //! deal with w_out pad_0 column pre without bias
-          memset(dout_ptr0, 0x00, pad_0 * sizeof(float));
-          memset(dout_ptr1, 0x00, pad_0 * sizeof(float));
-          memset(dout_ptr2, 0x00, pad_0 * sizeof(float));
-          memset(dout_ptr3, 0x00, pad_0 * sizeof(float));
-          dout_ptr0 += pad_0;
-          dout_ptr1 += pad_0;
-          dout_ptr2 += pad_0;
-          dout_ptr3 += pad_0;
-        }
-        //! mid loop
-        for (int i = 0; i < w_out_new; ++i) {
-          compute_one_out_without_extract(din_ptr0, din_ptr1, din_ptr2,
-                                          din_ptr3, din_ptr4, din_ptr5,
-                                          din_ptr6, din_ptr7, dout_ptr0,
-                                          dout_ptr1, dout_ptr2, dout_ptr3,
-                                          w0, w1, w2, w3, w4, w5, w6, vbias);
-          din_ptr0++;
-          din_ptr1++;
-          din_ptr2++;
-          din_ptr3++;
-          din_ptr4++;
-          din_ptr5++;
-          din_ptr6++;
-          din_ptr7++;
-
-          dout_ptr0++;
-          dout_ptr1++;
-          dout_ptr2++;
-          dout_ptr3++;
-        }
-        if (flag_bias) {
-          //! deal with w_out pad_0 column post with bias
-          memcpy(dout_ptr0, dout0, pad_0 * sizeof(float));
-          memcpy(dout_ptr1, dout1, pad_0 * sizeof(float));
-          memcpy(dout_ptr2, dout2, pad_0 * sizeof(float));
-          memcpy(dout_ptr3, dout3, pad_0 * sizeof(float));
-        } else {
-          //! deal with w_out pad_0 column post without bias
-          memset(dout_ptr0, 0x00, pad_0 * sizeof(float));
-          memset(dout_ptr1, 0x00, pad_0 * sizeof(float));
-          memset(dout_ptr2, 0x00, pad_0 * sizeof(float));
-          memset(dout_ptr3, 0x00, pad_0 * sizeof(float));
-        }
-
-        din0 = din4;
-        din1 = din5;
-        din2 = din6;
-        din3 = din7;
-        din4 = din3 + w_in_new;
-        din5 = din4 + w_in_new;
-        din6 = din5 + w_in_new;
-        din7 = din6 + w_in_new;
-
-        dout0 = dout3 + w_out;
-        dout1 = dout0 + w_out;
-        dout2 = dout1 + w_out;
-        dout3 = dout2 + w_out;
-      }
-      float* dout_pad_end = dout_ch + h_out_new * w_out;
-      if (flag_bias) {
-        //! deal with h_out pad_0 line with bias
-        memcpy(reinterpret_cast<float*>(dout_pad_end),
-               dout_ch - pad_0 * w_out,
-               pad_0 * w_out * sizeof(float));
-      } else {
-        //! deal with h_out pad_0 line without bias
-        memset(reinterpret_cast<float*>(dout_pad_end),
-               0x00,
-               pad_0 * w_out * sizeof(float));
-      }
-    }
-  }
-  free(din_new);
-}
-
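// The "small" path avoids the left/right extract kernels altogether: instead
// of special-casing the pad_new border columns, it first copies the input into
// a zero-framed buffer via prepad_input() and then runs the plain
// compute_one_out_without_extract kernel for every interior output.
// prepad_input is defined elsewhere in this file; judging only from how it is
// called here (and from the later free(din_new)), it plausibly behaves like
// the following sketch — an assumption, with `prepad_input_sketch` a
// hypothetical name:
static float* prepad_input_sketch(
    const float* din, int num, int ch, int h_in, int w_in, int pad) {
  int h_new = h_in + 2 * pad;
  int w_new = w_in + 2 * pad;
  size_t bytes = sizeof(float) * num * ch * h_new * w_new;
  float* out = static_cast<float*>(malloc(bytes));  // caller calls free()
  memset(out, 0, bytes);                            // zero frame all around
  for (int nc = 0; nc < num * ch; ++nc) {
    const float* src = din + nc * h_in * w_in;
    // skip `pad` framed rows, then `pad` framed columns in each row
    float* dst = out + nc * h_new * w_new + pad * w_new + pad;
    for (int r = 0; r < h_in; ++r) {
      memcpy(dst + r * w_new, src + r * w_in, w_in * sizeof(float));
    }
  }
  return out;
}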
-void conv_depthwise_5x5s1_small_relu_impl(const float* din,
-                                          float* dout,
-                                          int num,
-                                          int ch_out,
-                                          int h_out,
-                                          int w_out,
-                                          int ch_in,
-                                          int h_in,
-                                          int w_in,
-                                          const float* weights,
-                                          const float* bias,
-                                          int pad,
-                                          bool flag_bias,
-                                          bool flag_relu,
-                                          ARMContext* ctx) {
-  int pad_new = pad > 4 ? 4 : pad;
-  int pad_0 = pad - pad_new;
-  int h_in_new = h_in + 2 * pad_new;
-  int w_in_new = w_in + 2 * pad_new;
-  float zero_ptr[w_in_new + w_out];
-  memset(zero_ptr, 0, w_in_new * sizeof(float));
-  float* write_ptr = zero_ptr + w_in_new;
-  int h_out_new = h_out - 2 * pad_0;
-  int w_out_new = w_out - 2 * pad_0;
-  int pad_cnt = pad_0 >> 2;
-  int pad_remain = pad_0 - (pad_cnt << 2);
-  int bias_cnt = (w_out * pad_0) >> 2;
-  int bias_remain = (w_out * pad_0) - (bias_cnt << 2);
-  int in_spatial_size = w_in_new * h_in_new;
-  int out_spatial_size = w_out * h_out;
-  int weights_spatial_size = 25;
-
-  float* din_new = prepad_input(din, num, ch_in, h_in, w_in, pad_new);
-  for (int n = 0; n < num; ++n) {
-    const float* din_batch = din_new + n * in_spatial_size * ch_in;
-    float* dout_batch = dout + n * out_spatial_size * ch_out;
-#pragma omp parallel for
-    for (int c = 0; c < ch_in; ++c) {
-      const float* din_ch = din_batch + c * in_spatial_size;
-      float* dout_ch = dout_batch + c * out_spatial_size;
-      float bias_c = flag_bias ? bias[c] : 0.f;
-      float bias_relu = bias_c > 0.f ? bias_c : 0.f;
-      float vbias[4] = {bias_c, bias_c, bias_c, bias_c};
-      float32x4_t vbias_c = vdupq_n_f32(bias_relu);
-      if (flag_bias) {
-        //! deal with h_out pad_0 line with bias
-        for (int i = 0; i < bias_cnt; ++i) {
-          vst1q_f32(dout_ch, vbias_c);
-          dout_ch += 4;
-        }
-        for (int i = 0; i < bias_remain; ++i) {
-          *dout_ch++ = bias_relu;
-        }
-      } else {
-        //! deal with h_out pad_0 line without bias
-        for (int i = 0; i < pad_0; ++i) {
-          memset(dout_ch, 0x00, w_out * sizeof(float));
-          dout_ch += w_out;
-        }
-      }
-
-      //! every h loop, deal with 8 line input
-      const float* din0 = din_ch;
-      const float* din1 = din0 + w_in_new;
-      const float* din2 = din1 + w_in_new;
-      const float* din3 = din2 + w_in_new;
-      const float* din4 = din3 + w_in_new;
-      const float* din5 = din4 + w_in_new;
-      const float* din6 = din5 + w_in_new;
-      const float* din7 = din6 + w_in_new;
-      //! every h loop, deal with 4 line output
-      float* dout0 = dout_ch;
-      float* dout1 = dout0 + w_out;
-      float* dout2 = dout1 + w_out;
-      float* dout3 = dout2 + w_out;
-
-      //! load weights to neon register
-      const float* weights_c = weights + c * weights_spatial_size;
-
-      float32x4_t w5;
-      float32x4_t w6;
-      float32x4_t w0 = vld1q_f32(weights_c);
-      float32x4_t w1 = vld1q_f32(weights_c + 5);
-      float32x4_t w2 = vld1q_f32(weights_c + 10);
-      float32x4_t w3 = vld1q_f32(weights_c + 15);
-      float32x4_t w4 = vld1q_f32(weights_c + 20);
-      w5 = vsetq_lane_f32(weights_c[4], w5, 0);
-      w5 = vsetq_lane_f32(weights_c[9], w5, 1);
-      w5 = vsetq_lane_f32(weights_c[14], w5, 2);
-      w5 = vsetq_lane_f32(weights_c[19], w5, 3);
-      w6 = vsetq_lane_f32(weights_c[24], w6, 0);
-
-      //! h loop
-      for (int h = 0; h < h_out_new; h += 4) {
-        //! (h - pad_new) + 7 > h_in - 1
-        if (h + 8 > h_in_new) {
-          switch (h + 8 - h_in_new) {
-            case 7:
-              din1 = zero_ptr;
-            case 6:
-              din2 = zero_ptr;
-            case 5:
-              din3 = zero_ptr;
-            case 4:
-              din4 = zero_ptr;
-            case 3:
-              din5 = zero_ptr;
-            case 2:
-              din6 = zero_ptr;
-            case 1:
-              din7 = zero_ptr;
-            default:
-              break;
-          }
-        }
-        if (h + 4 > h_out_new) {
-          switch (h + 4 - h_out_new) {
-            case 3:
-              dout1 = write_ptr;
-            case 2:
-              dout2 = write_ptr;
-            case 1:
-              dout3 = write_ptr;
-            default:
-              break;
-          }
-        }
-        const float* din_ptr0 = din0;
-        const float* din_ptr1 = din1;
-        const float* din_ptr2 = din2;
-        const float* din_ptr3 = din3;
-        const float* din_ptr4 = din4;
-        const float* din_ptr5 = din5;
-        const float* din_ptr6 = din6;
-        const float* din_ptr7 = din7;
-
-        float* dout_ptr0 = dout0;
-        float* dout_ptr1 = dout1;
-        float* dout_ptr2 = dout2;
-        float* dout_ptr3 = dout3;
-
-        if (flag_bias) {
-          //! deal with w_out pad_0 column pre with bias
-          for (int i = 0; i < pad_cnt; i++) {
-            vst1q_f32(dout_ptr0, vbias_c);
-            vst1q_f32(dout_ptr1, vbias_c);
-            vst1q_f32(dout_ptr2, vbias_c);
-            vst1q_f32(dout_ptr3, vbias_c);
-            dout_ptr0 += 4;
-            dout_ptr1 += 4;
-            dout_ptr2 += 4;
-            dout_ptr3 += 4;
-          }
-          for (int i = 0; i < pad_remain; ++i) {
-            *dout_ptr0++ = bias_relu;
-            *dout_ptr1++ = bias_relu;
-            *dout_ptr2++ = bias_relu;
-            *dout_ptr3++ = bias_relu;
-          }
-        } else {
-          //! deal with w_out pad_0 column pre without bias
-          memset(dout_ptr0, 0x00, pad_0 * sizeof(float));
-          memset(dout_ptr1, 0x00, pad_0 * sizeof(float));
-          memset(dout_ptr2, 0x00, pad_0 * sizeof(float));
-          memset(dout_ptr3, 0x00, pad_0 * sizeof(float));
-          dout_ptr0 += pad_0;
-          dout_ptr1 += pad_0;
-          dout_ptr2 += pad_0;
-          dout_ptr3 += pad_0;
-        }
-
-        //! mid loop
-        for (int i = 0; i < w_out_new; ++i) {
-          compute_one_out_without_extract_relu(din_ptr0, din_ptr1, din_ptr2,
-                                               din_ptr3, din_ptr4, din_ptr5,
-                                               din_ptr6, din_ptr7, dout_ptr0,
-                                               dout_ptr1, dout_ptr2, dout_ptr3,
-                                               w0, w1, w2, w3, w4, w5, w6,
-                                               vbias);
-          din_ptr0++;
-          din_ptr1++;
-          din_ptr2++;
-          din_ptr3++;
-          din_ptr4++;
-          din_ptr5++;
-          din_ptr6++;
-          din_ptr7++;
-
-          dout_ptr0++;
-          dout_ptr1++;
-          dout_ptr2++;
-          dout_ptr3++;
-        }
-
-        if (flag_bias) {
-          //! deal with w_out pad_0 column post with bias
-          memcpy(dout_ptr0, dout0, pad_0 * sizeof(float));
-          memcpy(dout_ptr1, dout1, pad_0 * sizeof(float));
-          memcpy(dout_ptr2, dout2, pad_0 * sizeof(float));
-          memcpy(dout_ptr3, dout3, pad_0 * sizeof(float));
-        } else {
-          //! deal with w_out pad_0 column post without bias
-          memset(dout_ptr0, 0x00, pad_0 * sizeof(float));
-          memset(dout_ptr1, 0x00, pad_0 * sizeof(float));
-          memset(dout_ptr2, 0x00, pad_0 * sizeof(float));
-          memset(dout_ptr3, 0x00, pad_0 * sizeof(float));
-        }
-
-        din0 = din4;
-        din1 = din5;
-        din2 = din6;
-        din3 = din7;
-        din4 = din3 + w_in_new;
-        din5 = din4 + w_in_new;
-        din6 = din5 + w_in_new;
-        din7 = din6 + w_in_new;
-
-        dout0 = dout3 + w_out;
-        dout1 = dout0 + w_out;
-        dout2 = dout1 + w_out;
-        dout3 = dout2 + w_out;
-      }
-      float* dout_pad_end = dout_ch + h_out_new * w_out;
-      if (flag_bias) {
-        //! deal with h_out pad_0 line with bias
-        memcpy(reinterpret_cast<float*>(dout_pad_end),
-               dout_ch - pad_0 * w_out,
-               pad_0 * w_out * sizeof(float));
-      } else {
-        //! deal with h_out pad_0 line without bias
-        memset(reinterpret_cast<float*>(dout_pad_end),
-               0x00,
-               pad_0 * w_out * sizeof(float));
-      }
-    }
-  }
-  free(din_new);
-}
-
-#else
-
-//! kernel for one out without extracting data mid
-//!
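// On armv7 (this #else branch) there are only 16 NEON q registers, against 32
// v registers on armv8, so there is no room for a 4x4 output tile: each kernel
// below computes a single output column for two rows at once. q9 and q10
// accumulate the lane-wise weight-by-input products for the two rows, and
// three pairwise adds collapse them into two scalars landing in d18[0] and
// d18[1]. That vpadd reduction in intrinsics form — a minimal sketch assuming
// <arm_neon.h>, with `hsum_two_rows` a hypothetical name:
static inline float32x2_t hsum_two_rows(float32x4_t acc0, float32x4_t acc1) {
  float32x2_t s0 = vpadd_f32(vget_low_f32(acc0), vget_high_f32(acc0));
  float32x2_t s1 = vpadd_f32(vget_low_f32(acc1), vget_high_f32(acc1));
  return vpadd_f32(s0, s1);  // lane 0 = sum(acc0), lane 1 = sum(acc1)
}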
deal with two lines out -void compute_one_out_without_extract(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vld1.32 {d4-d5}, [%[din0]]! \n" - "vld1.32 {d6-d7}, [%[din1]]! \n" - "vld1.32 {d8-d9}, [%[din2]]! \n" - "vld1.32 {d10-d11}, [%[din3]]! \n" - "vld1.32 {d12-d13}, [%[din4]]! \n" - "vld1.32 {d14-d15}, [%[din5]]! \n" - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - "vld1.32 {d6[0]}, [%[din0]] \n" - "vld1.32 {d6[1]}, [%[din1]] \n" - "vld1.32 {d7[0]}, [%[din2]] \n" - "vld1.32 {d7[1]}, [%[din3]] \n" - - // weights r2 - "vmla.f32 q9, q0, q4 \n" - "vmla.f32 q10, q0, q5 \n" - - "vld1.32 {d8[0]}, [%[din4]] \n" - "vld1.32 {d8[1]}, [%[din5]] \n" - - "vld1.32 {d0-d1}, [%[wh]] \n" - - // weights r3 - "vmla.f32 q9, q1, q5 \n" - "vmla.f32 q10, q1, q6 \n" - - // weights col4 - "sub %[wh], #64 \n" - "vld1.32 {d4[0]}, [%[wh]], r0 \n" - "vld1.32 {d4[1]}, [%[wh]], r0 \n" - "vld1.32 {d5[0]}, [%[wh]], r0 \n" - "vld1.32 {d5[1]}, [%[wh]], r0 \n" - - // weights r4 - "vmla.f32 q9, q0, q6 \n" - "vmla.f32 q10, q0, q7 \n" - - "vext.32 q5, q3, q4, #1 \n" - - "vmla.f32 q9, q2, q3 \n" - "vmla.f32 q10, q2, q5 \n" - - "vld1.32 {d4[0]}, [%[wh]] \n" - "vld1.32 {d6}, [%[bias]] \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d18, d18, d19 \n" - - "vmla.f32 d18, d8, d4[0] \n" - - // add bias - "vadd.f32 d18, d18, d6 \n" - - "vst1.32 {d18[0]}, [%[dout0]] \n" - "vst1.32 {d18[1]}, [%[dout1]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [wh] "+r"(weights) - : [dout0] "r"(dout0), [dout1] "r"(dout1), [bias] "r"(bias) - : "memory", - "r0", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11"); -} - -//! kernel for one out without extracting data mid -//! deal with two lines out -void compute_one_out_without_extract_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "vmov.i32 q15, #0x0 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vld1.32 {d4-d5}, [%[din0]]! \n" - "vld1.32 {d6-d7}, [%[din1]]! \n" - "vld1.32 {d8-d9}, [%[din2]]! \n" - "vld1.32 {d10-d11}, [%[din3]]! \n" - "vld1.32 {d12-d13}, [%[din4]]! \n" - "vld1.32 {d14-d15}, [%[din5]]! 
\n" - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - "vld1.32 {d6[0]}, [%[din0]] \n" - "vld1.32 {d6[1]}, [%[din1]] \n" - "vld1.32 {d7[0]}, [%[din2]] \n" - "vld1.32 {d7[1]}, [%[din3]] \n" - - // weights r2 - "vmla.f32 q9, q0, q4 \n" - "vmla.f32 q10, q0, q5 \n" - - "vld1.32 {d8[0]}, [%[din4]] \n" - "vld1.32 {d8[1]}, [%[din5]] \n" - - "vld1.32 {d0-d1}, [%[wh]] \n" - - // weights r3 - "vmla.f32 q9, q1, q5 \n" - "vmla.f32 q10, q1, q6 \n" - - // weights col4 - "sub %[wh], #64 \n" - "vld1.32 {d4[0]}, [%[wh]], r0 \n" - "vld1.32 {d4[1]}, [%[wh]], r0 \n" - "vld1.32 {d5[0]}, [%[wh]], r0 \n" - "vld1.32 {d5[1]}, [%[wh]], r0 \n" - - // weights r4 - "vmla.f32 q9, q0, q6 \n" - "vmla.f32 q10, q0, q7 \n" - - "vext.32 q5, q3, q4, #1 \n" - - "vmla.f32 q9, q2, q3 \n" - "vmla.f32 q10, q2, q5 \n" - - "vld1.32 {d4[0]}, [%[wh]] \n" - "vld1.32 {d6}, [%[bias]] \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d18, d18, d19 \n" - - "vmla.f32 d18, d8, d4[0] \n" - - // add bias - "vadd.f32 d18, d18, d6 \n" - - // relu - "vmax.f32 d18, d18, d30 \n" - - "vst1.32 {d18[0]}, [%[dout0]] \n" - "vst1.32 {d18[1]}, [%[dout1]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [wh] "+r"(weights) - : [dout0] "r"(dout0), [dout1] "r"(dout1), [bias] "r"(bias) - : "memory", - "r0", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q15"); -} - -//! kernel for one out without extracting data pre -//! deal with two lines out -void compute_one_out_extract_pre(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "add %[wh], #4 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vld1.32 {d4-d5}, [%[din0]]! \n" - "vld1.32 {d6-d7}, [%[din1]]! \n" - "vld1.32 {d8-d9}, [%[din2]]! \n" - "vld1.32 {d10-d11}, [%[din3]]! \n" - "vld1.32 {d12-d13}, [%[din4]]! \n" - "vld1.32 {d14-d15}, [%[din5]]! \n" - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - // weights r2 - "vmla.f32 q9, q0, q4 \n" - "vmla.f32 q10, q0, q5 \n" - - "vld1.32 {d0-d1}, [%[wh]] \n" - - // weights r3 - "vmla.f32 q9, q1, q5 \n" - "vmla.f32 q10, q1, q6 \n" - - // weights r4 - "vmla.f32 q9, q0, q6 \n" - "vmla.f32 q10, q0, q7 \n" - - // load bias - "vld1.32 {d0}, [%[bias]] \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d18, d18, d19 \n" - - // add bias - "vadd.f32 d18, d18, d0 \n" - - "vst1.32 {d18[0]}, [%[dout0]] \n" - "vst1.32 {d18[1]}, [%[dout1]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [wh] "+r"(weights) - : [dout0] "r"(dout0), [dout1] "r"(dout1), [bias] "r"(bias) - : "memory", - "r0", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11"); -} - -//! kernel for one out without extracting data pre -//! 
deal with two lines out -void compute_one_out_extract_pre_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "add %[wh], #4 \n" - "vmov.i32 q15, #0x0 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vld1.32 {d4-d5}, [%[din0]]! \n" - "vld1.32 {d6-d7}, [%[din1]]! \n" - "vld1.32 {d8-d9}, [%[din2]]! \n" - "vld1.32 {d10-d11}, [%[din3]]! \n" - "vld1.32 {d12-d13}, [%[din4]]! \n" - "vld1.32 {d14-d15}, [%[din5]]! \n" - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - // weights r2 - "vmla.f32 q9, q0, q4 \n" - "vmla.f32 q10, q0, q5 \n" - - "vld1.32 {d0-d1}, [%[wh]] \n" - - // weights r3 - "vmla.f32 q9, q1, q5 \n" - "vmla.f32 q10, q1, q6 \n" - - // weights r4 - "vmla.f32 q9, q0, q6 \n" - "vmla.f32 q10, q0, q7 \n" - - // load bias - "vld1.32 {d0}, [%[bias]] \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d18, d18, d19 \n" - - // add bias - "vadd.f32 d18, d18, d0 \n" - - // relu - "vmax.f32 d18, d18, d30 \n" - "vst1.32 {d18[0]}, [%[dout0]] \n" - "vst1.32 {d18[1]}, [%[dout1]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [wh] "+r"(weights) - : [dout0] "r"(dout0), [dout1] "r"(dout1), [bias] "r"(bias) - : "memory", - "r0", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q15"); -} - -//! kernel for one out with extracting data post -//! deal with two lines out -void compute_one_out_extract_post(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vld1.32 {d4-d5}, [%[din0]]! \n" - "vld1.32 {d6-d7}, [%[din1]]! \n" - "vld1.32 {d8-d9}, [%[din2]]! \n" - "vld1.32 {d10-d11}, [%[din3]]! \n" - "vld1.32 {d12-d13}, [%[din4]]! \n" - "vld1.32 {d14-d15}, [%[din5]]! \n" - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - // weights r2 - "vmla.f32 q9, q0, q4 \n" - "vmla.f32 q10, q0, q5 \n" - - "vld1.32 {d0-d1}, [%[wh]] \n" - - // weights r3 - "vmla.f32 q9, q1, q5 \n" - "vmla.f32 q10, q1, q6 \n" - - // weights r4 - "vmla.f32 q9, q0, q6 \n" - "vmla.f32 q10, q0, q7 \n" - - "vld1.32 {d0}, [%[bias]] \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d18, d18, d19 \n" - - // add bias - "vadd.f32 d18, d18, d0 \n" - - "vst1.32 {d18[0]}, [%[dout0]] \n" - "vst1.32 {d18[1]}, [%[dout1]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [wh] "+r"(weights) - : [dout0] "r"(dout0), [dout1] "r"(dout1), [bias] "r"(bias) - : "memory", - "r0", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11"); -} - -//! kernel for one out with extracting data post -//! 
deal with two lines out -void compute_one_out_extract_post_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "vmov.i32 q15, #0x0 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vld1.32 {d4-d5}, [%[din0]]! \n" - "vld1.32 {d6-d7}, [%[din1]]! \n" - "vld1.32 {d8-d9}, [%[din2]]! \n" - "vld1.32 {d10-d11}, [%[din3]]! \n" - "vld1.32 {d12-d13}, [%[din4]]! \n" - "vld1.32 {d14-d15}, [%[din5]]! \n" - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - // weights r2 - "vmla.f32 q9, q0, q4 \n" - "vmla.f32 q10, q0, q5 \n" - - "vld1.32 {d0-d1}, [%[wh]] \n" - - // weights r3 - "vmla.f32 q9, q1, q5 \n" - "vmla.f32 q10, q1, q6 \n" - - // weights r4 - "vmla.f32 q9, q0, q6 \n" - "vmla.f32 q10, q0, q7 \n" - - "vld1.32 {d0}, [%[bias]] \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d18, d18, d19 \n" - - // add bias - "vadd.f32 d18, d18, d0 \n" - - // relu - "vmax.f32 d18, d18, d30 \n" - - "vst1.32 {d18[0]}, [%[dout0]] \n" - "vst1.32 {d18[1]}, [%[dout1]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [wh] "+r"(weights) - : [dout0] "r"(dout0), [dout1] "r"(dout1), [bias] "r"(bias) - : "memory", - "r0", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q15"); -} - -//! kernel for two out with extracting data pre -//! deal with two lines out -void compute_two_out_extract_pre(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "mov r1, #0 \n" - "add %[wh], #8 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vmov.32 d1[1], r1 \n" - "vmov.32 d3[1], r1 \n" - - "vld1.32 {d4-d5}, [%[din0]]! \n" - "vld1.32 {d6-d7}, [%[din1]]! \n" - "vld1.32 {d8-d9}, [%[din2]]! \n" - "vld1.32 {d10-d11}, [%[din3]]! \n" - "vld1.32 {d12-d13}, [%[din4]]! \n" - "vld1.32 {d14-d15}, [%[din5]]! 
\n" - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - "vmov.32 d25[1], r1 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - "vmov.32 d27[1], r1 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - "vld1.32 {d28-d29}, [%[wh]]\n" - "vmov.32 d29[1], r1 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - - "sub %[wh], #84 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - - "vpadd.f32 d22, d18, d19 \n" - "vpadd.f32 d23, d20, d21 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - "vld1.32 {d28-d29}, [%[wh]]\n" - - "vpadd.f32 d22, d22, d23 \n" - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - "vld1.32 {d30-d31}, [%[bias]] \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d23, d18, d19 \n" - - // trn out neon register - "vtrn.32 d22, d23 \n" - - // add bias - "vadd.f32 q11, q11, q15 \n" - - // store result - "vst1.32 {d22}, [%[dout0]] \n" - "vst1.32 {d23}, [%[dout1]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [wh] "+r"(weights) - : [dout0] "r"(dout0), [dout1] "r"(dout1), [bias] "r"(bias) - : "memory", - "r0", - "r1", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -} - -//! kernel for two out with extracting data pre -//! deal with two lines out -void compute_two_out_extract_pre_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "mov r1, #0 \n" - "add %[wh], #8 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vmov.32 d1[1], r1 \n" - "vmov.32 d3[1], r1 \n" - - "vld1.32 {d4-d5}, [%[din0]]! \n" - "vld1.32 {d6-d7}, [%[din1]]! \n" - "vld1.32 {d8-d9}, [%[din2]]! \n" - "vld1.32 {d10-d11}, [%[din3]]! \n" - "vld1.32 {d12-d13}, [%[din4]]! \n" - "vld1.32 {d14-d15}, [%[din5]]! 
\n" - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - "vmov.32 d25[1], r1 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - "vmov.32 d27[1], r1 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - "vld1.32 {d28-d29}, [%[wh]]\n" - "vmov.32 d29[1], r1 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - - "sub %[wh], #84 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - - "vpadd.f32 d22, d18, d19 \n" - "vpadd.f32 d23, d20, d21 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - "vld1.32 {d28-d29}, [%[wh]]\n" - - "vpadd.f32 d22, d22, d23 \n" - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - "vld1.32 {d30-d31}, [%[bias]] \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d23, d18, d19 \n" - "vmov.i32 q9, #0x0 \n" - - // trn out neon register - "vtrn.32 d22, d23 \n" - - // add bias - "vadd.f32 q11, q11, q15 \n" - - // relu - "vmax.f32 q11, q11, q9 \n" - // store result - "vst1.32 {d22}, [%[dout0]] \n" - "vst1.32 {d23}, [%[dout1]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [wh] "+r"(weights) - : [dout0] "r"(dout0), [dout1] "r"(dout1), [bias] "r"(bias) - : "memory", - "r0", - "r1", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -} - -//! kernel for two out with extracting data post -//! deal with two lines out -void compute_two_out_extract_post(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vld1.32 {d4-d5}, [%[din0]]! \n" - "vld1.32 {d6-d7}, [%[din1]]! \n" - "vld1.32 {d8-d9}, [%[din2]]! \n" - "vld1.32 {d10-d11}, [%[din3]]! \n" - "vld1.32 {d12-d13}, [%[din4]]! \n" - "vld1.32 {d14-d15}, [%[din5]]! \n" - - //! out zero - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - "vld1.32 {d28-d29}, [%[wh]]\n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - - "vpadd.f32 d22, d18, d19 \n" - "vpadd.f32 d23, d20, d21 \n" - "vpadd.f32 d22, d22, d23 \n" - - "vmov.f32 q15, #0.0 \n" - "vext.32 q2, q2, q15, #1 \n" - "vext.32 q3, q3, q15, #1 \n" - "vext.32 q4, q4, q15, #1 \n" - "vext.32 q5, q5, q15, #1 \n" - "vext.32 q6, q6, q15, #1 \n" - "vext.32 q7, q7, q15, #1 \n" - "vext.32 q8, q8, q15, #1 \n" - - //! 
out one - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - "vld1.32 {d30-d31}, [%[bias]] \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d23, d18, d19 \n" - - // trn out neon register - "vtrn.32 d22, d23 \n" - - // add bias - "vadd.f32 q11, q11, q15 \n" - - // store result - "vst1.32 {d22}, [%[dout0]] \n" - "vst1.32 {d23}, [%[dout1]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [wh] "+r"(weights) - : [dout0] "r"(dout0), [dout1] "r"(dout1), [bias] "r"(bias) - : "memory", - "r0", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -} - -//! kernel for two out with extracting data post -//! deal with two lines out -void compute_two_out_extract_post_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vld1.32 {d4-d5}, [%[din0]]! \n" - "vld1.32 {d6-d7}, [%[din1]]! \n" - "vld1.32 {d8-d9}, [%[din2]]! \n" - "vld1.32 {d10-d11}, [%[din3]]! \n" - "vld1.32 {d12-d13}, [%[din4]]! \n" - "vld1.32 {d14-d15}, [%[din5]]! \n" - - //! out zero - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - "vld1.32 {d28-d29}, [%[wh]]\n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - - "vpadd.f32 d22, d18, d19 \n" - "vpadd.f32 d23, d20, d21 \n" - "vpadd.f32 d22, d22, d23 \n" - - "vmov.f32 q15, #0.0 \n" - "vext.32 q2, q2, q15, #1 \n" - "vext.32 q3, q3, q15, #1 \n" - "vext.32 q4, q4, q15, #1 \n" - "vext.32 q5, q5, q15, #1 \n" - "vext.32 q6, q6, q15, #1 \n" - "vext.32 q7, q7, q15, #1 \n" - "vext.32 q8, q8, q15, #1 \n" - - //! 
out one - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - "vld1.32 {d30-d31}, [%[bias]] \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d23, d18, d19 \n" - "vmov.i32 q9, #0x0 \n" - - // trn out neon register - "vtrn.32 d22, d23 \n" - - // add bias - "vadd.f32 q11, q11, q15 \n" - - // relu - "vmax.f32 q11, q11, q9 \n" - - // store result - "vst1.32 {d22}, [%[dout0]] \n" - "vst1.32 {d23}, [%[dout1]] \n" - - : [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [wh] "+r"(weights) - : [dout0] "r"(dout0), [dout1] "r"(dout1), [bias] "r"(bias) - : "memory", - "r0", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -} - -//! kernel for three out with extracting data pre -//! deal with two lines out -void compute_three_out_extract_pre(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "add %[wh], #12 \n" - "vld1.32 {d0}, [%[wh]], r0 \n" - "vld1.32 {d2}, [%[wh]], r0 \n" - - "vld1.32 {d4-d5}, [%[din0]] \n" - "vld1.32 {d6-d7}, [%[din1]] \n" - "vld1.32 {d8-d9}, [%[din2]] \n" - "vld1.32 {d10-d11}, [%[din3]] \n" - "vld1.32 {d12-d13}, [%[din4]] \n" - "vld1.32 {d14-d15}, [%[din5]] \n" - - //! out zero - // weights r0 - "vmul.f32 d18, d0, d4 \n" - "vmul.f32 d20, d0, d6 \n" - - "vld1.32 {d24}, [%[wh]], r0 \n" - - // weights r1 - "vmla.f32 d18, d2, d6 \n" - "vmla.f32 d20, d2, d8 \n" - - "vld1.32 {d26}, [%[wh]], r0 \n" - - // weights r2 - "vmla.f32 d18, d24, d8 \n" - "vmla.f32 d20, d24, d10 \n" - - "vld1.32 {d28}, [%[wh]] \n" - - // weights r3 - "vmla.f32 d18, d26, d10 \n" - "vmla.f32 d20, d26, d12 \n" - - // load bias - "vld1.32 {d30-d31}, [%[bias]] \n" - - // weights r4 - "vmla.f32 d18, d28, d12 \n" - "vmla.f32 d20, d28, d14 \n" - "vpadd.f32 d22, d18, d20 \n" - - //! 
out one - "mov r1, #0 \n" - "sub %[wh], #84 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vmov.32 d1[1], r1 \n" - "vmov.32 d3[1], r1 \n" - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - "vmov.32 d25[1], r1 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - "vmov.32 d27[1], r1 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - "vld1.32 {d28-d29}, [%[wh]]\n" - "vmov.32 d29[1], r1 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - - "sub %[wh], #84 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - "vld1.32 {d28-d29}, [%[wh]]\n" - - "vpadd.f32 d23, d18, d19 \n" - - // trn out neon register - "vtrn.32 d22, d23 \n" - - // add bias - "vadd.f32 q11, q11, q15 \n" - - // store result - "vst1.32 {d22}, [%[dout0]]! \n" - "vst1.32 {d23}, [%[dout1]]! \n" - - //! out two - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d18, d18, d19 \n" - - // add bias - "vadd.f32 d18, d18, d30 \n" - - // store result - "vst1.32 {d18[0]}, [%[dout0]] \n" - "vst1.32 {d18[1]}, [%[dout1]] \n" - - : [dout0] "+r"(dout0), [dout1] "+r"(dout1), [wh] "+r"(weights) - : [din0] "r"(din0), - [din1] "r"(din1), - [din2] "r"(din2), - [din3] "r"(din3), - [din4] "r"(din4), - [din5] "r"(din5), - [bias] "r"(bias) - : "memory", - "r0", - "r1", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -} - -//! kernel for three out with extracting data pre -//! deal with two lines out -void compute_three_out_extract_pre_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "add %[wh], #12 \n" - "vld1.32 {d0}, [%[wh]], r0 \n" - "vld1.32 {d2}, [%[wh]], r0 \n" - - "vld1.32 {d4-d5}, [%[din0]] \n" - "vld1.32 {d6-d7}, [%[din1]] \n" - "vld1.32 {d8-d9}, [%[din2]] \n" - "vld1.32 {d10-d11}, [%[din3]] \n" - "vld1.32 {d12-d13}, [%[din4]] \n" - "vld1.32 {d14-d15}, [%[din5]] \n" - - //! out zero - // weights r0 - "vmul.f32 d18, d0, d4 \n" - "vmul.f32 d20, d0, d6 \n" - - "vld1.32 {d24}, [%[wh]], r0 \n" - - // weights r1 - "vmla.f32 d18, d2, d6 \n" - "vmla.f32 d20, d2, d8 \n" - - "vld1.32 {d26}, [%[wh]], r0 \n" - - // weights r2 - "vmla.f32 d18, d24, d8 \n" - "vmla.f32 d20, d24, d10 \n" - - "vld1.32 {d28}, [%[wh]] \n" - - // weights r3 - "vmla.f32 d18, d26, d10 \n" - "vmla.f32 d20, d26, d12 \n" - - // load bias - "vld1.32 {d30-d31}, [%[bias]] \n" - - // weights r4 - "vmla.f32 d18, d28, d12 \n" - "vmla.f32 d20, d28, d14 \n" - "vpadd.f32 d22, d18, d20 \n" - - //! 
out one - "mov r1, #0 \n" - "sub %[wh], #84 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vmov.32 d1[1], r1 \n" - "vmov.32 d3[1], r1 \n" - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - "vmov.32 d25[1], r1 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - "vmov.32 d27[1], r1 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - "vld1.32 {d28-d29}, [%[wh]]\n" - "vmov.32 d29[1], r1 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - - "sub %[wh], #84 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - "vld1.32 {d28-d29}, [%[wh]]\n" - - "vpadd.f32 d23, d18, d19 \n" - "vmov.i32 q8, #0x0 \n" - - // trn out neon register - "vtrn.32 d22, d23 \n" - - // add bias - "vadd.f32 q11, q11, q15 \n" - - // relu - "vmax.f32 q11, q11, q8 \n" - - // store result - "vst1.32 {d22}, [%[dout0]]! \n" - "vst1.32 {d23}, [%[dout1]]! \n" - - //! out two - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d18, d18, d19 \n" - - // add bias - "vadd.f32 d18, d18, d30 \n" - - // relu - "vmax.f32 d18, d18, d16 \n" - - // store result - "vst1.32 {d18[0]}, [%[dout0]] \n" - "vst1.32 {d18[1]}, [%[dout1]] \n" - - : [dout0] "+r"(dout0), [dout1] "+r"(dout1), [wh] "+r"(weights) - : [din0] "r"(din0), - [din1] "r"(din1), - [din2] "r"(din2), - [din3] "r"(din3), - [din4] "r"(din4), - [din5] "r"(din5), - [bias] "r"(bias) - : "memory", - "r0", - "r1", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -} - -//! kernel for three out with extracting data post -//! deal with two lines out -void compute_three_out_extract_post(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vld1.32 {d4-d5}, [%[din0]] \n" - "vld1.32 {d6-d7}, [%[din1]] \n" - "vld1.32 {d8-d9}, [%[din2]] \n" - "vld1.32 {d10-d11}, [%[din3]] \n" - "vld1.32 {d12-d13}, [%[din4]] \n" - "vld1.32 {d14-d15}, [%[din5]] \n" - - //! 
out zero && two - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - "vmul.f32 d16, d0, d5 \n" - "vmul.f32 d17, d0, d7 \n" - - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - "vmla.f32 d16, d2, d7 \n" - "vmla.f32 d17, d2, d9 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - "vmla.f32 d16, d24, d9 \n" - "vmla.f32 d17, d24, d11 \n" - - "vld1.32 {d28-d29}, [%[wh]]\n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - "vmla.f32 d16, d26, d11 \n" - "vmla.f32 d17, d26, d13 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - "vmla.f32 d16, d28, d13 \n" - "vmla.f32 d17, d28, d15 \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d16, d16, d17 \n" - "vpadd.f32 d22, d18, d19 \n" - - "vmov.f32 q15, #0.0 \n" - "vext.32 q2, q2, q15, #1 \n" - "vext.32 q3, q3, q15, #1 \n" - "vext.32 q4, q4, q15, #1 \n" - "vext.32 q5, q5, q15, #1 \n" - "vext.32 q6, q6, q15, #1 \n" - "vext.32 q7, q7, q15, #1 \n" - - //! out one - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - - // load bias - "vld1.32 {d30-d31}, [%[bias]] \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d23, d18, d19 \n" - "vmov.i32 q9, #0x0 \n" - - // trn out neon register - "vtrn.32 d22, d23 \n" - - // add bias - "vadd.f32 q11, q11, q15 \n" - "vadd.f32 d16, d16, d30 \n" - - "vst1.32 {d22}, [%[dout0]]! \n" - "vst1.32 {d23}, [%[dout1]]! \n" - "vst1.32 {d16[0]}, [%[dout0]]! \n" - "vst1.32 {d16[1]}, [%[dout1]]! \n" - - : [dout0] "+r"(dout0), [dout1] "+r"(dout1), [wh] "+r"(weights) - : [din0] "r"(din0), - [din1] "r"(din1), - [din2] "r"(din2), - [din3] "r"(din3), - [din4] "r"(din4), - [din5] "r"(din5), - [bias] "r"(bias) - : "memory", - "r0", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -} - -//! kernel for three out with extracting data post -//! deal with two lines out -void compute_three_out_extract_post_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vld1.32 {d4-d5}, [%[din0]] \n" - "vld1.32 {d6-d7}, [%[din1]] \n" - "vld1.32 {d8-d9}, [%[din2]] \n" - "vld1.32 {d10-d11}, [%[din3]] \n" - "vld1.32 {d12-d13}, [%[din4]] \n" - "vld1.32 {d14-d15}, [%[din5]] \n" - - //! 
out zero && two - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - "vmul.f32 d16, d0, d5 \n" - "vmul.f32 d17, d0, d7 \n" - - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - "vmla.f32 d16, d2, d7 \n" - "vmla.f32 d17, d2, d9 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - "vmla.f32 d16, d24, d9 \n" - "vmla.f32 d17, d24, d11 \n" - - "vld1.32 {d28-d29}, [%[wh]]\n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - "vmla.f32 d16, d26, d11 \n" - "vmla.f32 d17, d26, d13 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - "vmla.f32 d16, d28, d13 \n" - "vmla.f32 d17, d28, d15 \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d16, d16, d17 \n" - "vpadd.f32 d22, d18, d19 \n" - - "vmov.f32 q15, #0.0 \n" - "vext.32 q2, q2, q15, #1 \n" - "vext.32 q3, q3, q15, #1 \n" - "vext.32 q4, q4, q15, #1 \n" - "vext.32 q5, q5, q15, #1 \n" - "vext.32 q6, q6, q15, #1 \n" - "vext.32 q7, q7, q15, #1 \n" - - //! out one - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - - // load bias - "vld1.32 {d30-d31}, [%[bias]] \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d23, d18, d19 \n" - "vmov.i32 q9, #0x0 \n" - - // trn out neon register - "vtrn.32 d22, d23 \n" - - // add bias - "vadd.f32 q11, q11, q15 \n" - "vadd.f32 d16, d16, d30 \n" - - // relu - "vmax.f32 q11, q11, q9 \n" - "vmax.f32 d16, d16, d18 \n" - - "vst1.32 {d22}, [%[dout0]]! \n" - "vst1.32 {d23}, [%[dout1]]! \n" - "vst1.32 {d16[0]}, [%[dout0]]! \n" - "vst1.32 {d16[1]}, [%[dout1]]! \n" - - : [dout0] "+r"(dout0), [dout1] "+r"(dout1), [wh] "+r"(weights) - : [din0] "r"(din0), - [din1] "r"(din1), - [din2] "r"(din2), - [din3] "r"(din3), - [din4] "r"(din4), - [din5] "r"(din5), - [bias] "r"(bias) - : "memory", - "r0", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -} - -//! kernel for four out with extracting data pre -//! deal with two lines out -void compute_four_out_extract_pre(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "add %[wh], #16 \n" - - //! 
out zero - // load input - "vld1.32 {d4[0]}, [%[din0]] \n" - "vld1.32 {d4[1]}, [%[din1]] \n" - "vld1.32 {d5[0]}, [%[din2]] \n" - "vld1.32 {d5[1]}, [%[din3]] \n" - "vld1.32 {d6[0]}, [%[din4]] \n" - "vld1.32 {d6[1]}, [%[din5]] \n" - - "vext.32 q4, q2, q3, #1 \n" - - // load weights - "vld1.32 d0[0], [%[wh]], r0 \n" - "vld1.32 d0[1], [%[wh]], r0 \n" - "vld1.32 d1[0], [%[wh]], r0 \n" - "vld1.32 d1[1], [%[wh]], r0 \n" - "vld1.32 d2[0], [%[wh]]\n" - - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q4 \n" - - "vld1.32 {d30-d31}, [%[bias]] \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d22, d18, d19 \n" - - "vmla.f32 d22, d6, d2[0] \n" - - "sub %[wh], #84 \n" - "vld1.32 {d0}, [%[wh]], r0 \n" - "vld1.32 {d2}, [%[wh]], r0 \n" - - "vld1.32 {d4-d5}, [%[din0]] \n" - "vld1.32 {d6-d7}, [%[din1]] \n" - "vld1.32 {d8-d9}, [%[din2]] \n" - "vld1.32 {d10-d11}, [%[din3]] \n" - "vld1.32 {d12-d13}, [%[din4]] \n" - "vld1.32 {d14-d15}, [%[din5]] \n" - - //! out one - // weights r0 - "vmul.f32 d18, d0, d4 \n" - "vmul.f32 d20, d0, d6 \n" - - "vld1.32 {d24}, [%[wh]], r0 \n" - - // weights r1 - "vmla.f32 d18, d2, d6 \n" - "vmla.f32 d20, d2, d8 \n" - - "vld1.32 {d26}, [%[wh]], r0 \n" - - // weights r2 - "vmla.f32 d18, d24, d8 \n" - "vmla.f32 d20, d24, d10 \n" - - "vld1.32 {d28}, [%[wh]] \n" - - // weights r3 - "vmla.f32 d18, d26, d10 \n" - "vmla.f32 d20, d26, d12 \n" - - // weights r4 - "vmla.f32 d18, d28, d12 \n" - "vmla.f32 d20, d28, d14 \n" - - "vpadd.f32 d23, d18, d20 \n" - - // trn out neon register - "vtrn.32 d22, d23 \n" - - // add bias - "vadd.f32 q11, q11, q15 \n" - - // store result - "vst1.32 {d22}, [%[dout0]]! \n" - "vst1.32 {d23}, [%[dout1]]! \n" - - //! out two - "mov r1, #0 \n" - "sub %[wh], #84 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vmov.32 d1[1], r1 \n" - "vmov.32 d3[1], r1 \n" - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - "vmov.32 d25[1], r1 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - "vmov.32 d27[1], r1 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - "vld1.32 {d28-d29}, [%[wh]]\n" - "vmov.32 d29[1], r1 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - - "sub %[wh], #84 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - "vld1.32 {d28-d29}, [%[wh]]\n" - - "vpadd.f32 d22, d18, d19 \n" - - //! 
out three - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d23, d18, d19 \n" - - // trn out neon register - "vtrn.32 d22, d23 \n" - - // add bias - "vadd.f32 q11, q11, q15 \n" - - // store result - "vst1.32 {d22}, [%[dout0]] \n" - "vst1.32 {d23}, [%[dout1]] \n" - - : [dout0] "+r"(dout0), [dout1] "+r"(dout1), [wh] "+r"(weights) - : [din0] "r"(din0), - [din1] "r"(din1), - [din2] "r"(din2), - [din3] "r"(din3), - [din4] "r"(din4), - [din5] "r"(din5), - [bias] "r"(bias) - : "memory", - "r0", - "r1", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -} - -//! kernel for four out with extracting data pre -//! deal with two lines out -void compute_four_out_extract_pre_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "add %[wh], #16 \n" - - //! out zero - // load input - "vld1.32 {d4[0]}, [%[din0]] \n" - "vld1.32 {d4[1]}, [%[din1]] \n" - "vld1.32 {d5[0]}, [%[din2]] \n" - "vld1.32 {d5[1]}, [%[din3]] \n" - "vld1.32 {d6[0]}, [%[din4]] \n" - "vld1.32 {d6[1]}, [%[din5]] \n" - - "vext.32 q4, q2, q3, #1 \n" - - // load weights - "vld1.32 d0[0], [%[wh]], r0 \n" - "vld1.32 d0[1], [%[wh]], r0 \n" - "vld1.32 d1[0], [%[wh]], r0 \n" - "vld1.32 d1[1], [%[wh]], r0 \n" - "vld1.32 d2[0], [%[wh]]\n" - - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q4 \n" - - "vld1.32 {d30-d31}, [%[bias]] \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d22, d18, d19 \n" - - "vmla.f32 d22, d6, d2[0] \n" - - "sub %[wh], #84 \n" - "vld1.32 {d0}, [%[wh]], r0 \n" - "vld1.32 {d2}, [%[wh]], r0 \n" - - "vld1.32 {d4-d5}, [%[din0]] \n" - "vld1.32 {d6-d7}, [%[din1]] \n" - "vld1.32 {d8-d9}, [%[din2]] \n" - "vld1.32 {d10-d11}, [%[din3]] \n" - "vld1.32 {d12-d13}, [%[din4]] \n" - "vld1.32 {d14-d15}, [%[din5]] \n" - - //! out one - // weights r0 - "vmul.f32 d18, d0, d4 \n" - "vmul.f32 d20, d0, d6 \n" - - "vld1.32 {d24}, [%[wh]], r0 \n" - - // weights r1 - "vmla.f32 d18, d2, d6 \n" - "vmla.f32 d20, d2, d8 \n" - - "vld1.32 {d26}, [%[wh]], r0 \n" - - // weights r2 - "vmla.f32 d18, d24, d8 \n" - "vmla.f32 d20, d24, d10 \n" - - "vld1.32 {d28}, [%[wh]] \n" - - // weights r3 - "vmla.f32 d18, d26, d10 \n" - "vmla.f32 d20, d26, d12 \n" - - // weights r4 - "vmla.f32 d18, d28, d12 \n" - "vmla.f32 d20, d28, d14 \n" - - "vpadd.f32 d23, d18, d20 \n" - "vmov.i32 q8, #0x0 \n" - - // trn out neon register - "vtrn.32 d22, d23 \n" - - // add bias - "vadd.f32 q11, q11, q15 \n" - - // relu - "vmax.f32 q11, q11, q8 \n" - - // store result - "vst1.32 {d22}, [%[dout0]]! \n" - "vst1.32 {d23}, [%[dout1]]! \n" - - //! 
out two - "mov r1, #0 \n" - "sub %[wh], #84 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vmov.32 d1[1], r1 \n" - "vmov.32 d3[1], r1 \n" - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - "vmov.32 d25[1], r1 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - "vmov.32 d27[1], r1 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - "vld1.32 {d28-d29}, [%[wh]]\n" - "vmov.32 d29[1], r1 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - - "sub %[wh], #84 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - "vld1.32 {d28-d29}, [%[wh]]\n" - - "vpadd.f32 d22, d18, d19 \n" - - //! out three - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d23, d18, d19 \n" - - // trn out neon register - "vtrn.32 d22, d23 \n" - - // add bias - "vadd.f32 q11, q11, q15 \n" - - // relu - "vmax.f32 q11, q11, q8 \n" - - // store result - "vst1.32 {d22}, [%[dout0]] \n" - "vst1.32 {d23}, [%[dout1]] \n" - - : [dout0] "+r"(dout0), [dout1] "+r"(dout1), [wh] "+r"(weights) - : [din0] "r"(din0), - [din1] "r"(din1), - [din2] "r"(din2), - [din3] "r"(din3), - [din4] "r"(din4), - [din5] "r"(din5), - [bias] "r"(bias) - : "memory", - "r0", - "r1", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -} - -//! kernel for three out with extracting data post -//! deal with two lines out -void compute_four_out_extract_post(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "mov r1, #12 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vld1.32 {d4-d5}, [%[din0]], r1 \n" - "vld1.32 {d6-d7}, [%[din1]], r1 \n" - "vld1.32 {d8-d9}, [%[din2]], r1 \n" - "vld1.32 {d10-d11}, [%[din3]], r1 \n" - "vld1.32 {d12-d13}, [%[din4]], r1 \n" - "vld1.32 {d14-d15}, [%[din5]], r1 \n" - - //! 
out zero && two - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - "vmul.f32 d16, d0, d5 \n" - "vmul.f32 d17, d0, d7 \n" - - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - "vmla.f32 d16, d2, d7 \n" - "vmla.f32 d17, d2, d9 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - "vmla.f32 d16, d24, d9 \n" - "vmla.f32 d17, d24, d11 \n" - - "vld1.32 {d28-d29}, [%[wh]] \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - "vmla.f32 d16, d26, d11 \n" - "vmla.f32 d17, d26, d13 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - "vmla.f32 d16, d28, d13 \n" - "vmla.f32 d17, d28, d15 \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d16, d16, d17 \n" - "vpadd.f32 d22, d18, d19 \n" - - //! out one - "vmov.f32 q15, #0.0 \n" - "vext.32 q2, q2, q15, #1 \n" - "vext.32 q3, q3, q15, #1 \n" - "vext.32 q4, q4, q15, #1 \n" - "vext.32 q5, q5, q15, #1 \n" - "vext.32 q6, q6, q15, #1 \n" - "vext.32 q7, q7, q15, #1 \n" - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - - "vld1.32 {d30-d31}, [%[bias]] \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d23, d18, d19 \n" - - // trn out neon register - "vtrn.32 d22, d23 \n" - - // add bias - "vadd.f32 q11, q11, q15 \n" - - // store result - "vst1.32 {d22}, [%[dout0]]! \n" - "vst1.32 {d23}, [%[dout1]]! \n" - - //! out three - "sub %[wh], #80 \n" - "vld1.32 {d4[0]}, [%[din0]] \n" - "vld1.32 {d4[1]}, [%[din1]] \n" - "vld1.32 {d5[0]}, [%[din2]] \n" - "vld1.32 {d5[1]}, [%[din3]] \n" - "vld1.32 {d6[0]}, [%[din4]] \n" - "vld1.32 {d6[1]}, [%[din5]] \n" - - "vext.32 q4, q2, q3, #1 \n" - - "vld1.32 {d0[0]}, [%[wh]], r0 \n" - "vld1.32 {d0[1]}, [%[wh]], r0 \n" - "vld1.32 {d1[0]}, [%[wh]], r0 \n" - "vld1.32 {d1[1]}, [%[wh]], r0 \n" - "vld1.32 {d2[0]}, [%[wh]] \n" - - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q4 \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d20, d20, d21 \n" - "vpadd.f32 d17, d18, d20 \n" - - "vmla.f32 d17, d6, d2[0] \n" - - // trn out neon register - "vtrn.32 d16, d17 \n" - - // add bias - "vadd.f32 q8, q8, q15 \n" - - // store result - "vst1.32 {d16}, [%[dout0]] \n" - "vst1.32 {d17}, [%[dout1]] \n" - - : [dout0] "+r"(dout0), - [dout1] "+r"(dout1), - [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [wh] "+r"(weights) - : [bias] "r"(bias) - : "memory", - "r0", - "r1", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -} - -//! kernel for three out with extracting data post -//! 
deal with two lines out -void compute_four_out_extract_post_relu(const float* din0, - const float* din1, - const float* din2, - const float* din3, - const float* din4, - const float* din5, - float* dout0, - float* dout1, - const float* weights, - const float* bias) { - asm volatile( - "mov r0, #20 \n" - "mov r1, #12 \n" - "vld1.32 {d0-d1}, [%[wh]], r0 \n" - "vld1.32 {d2-d3}, [%[wh]], r0 \n" - - "vld1.32 {d4-d5}, [%[din0]], r1 \n" - "vld1.32 {d6-d7}, [%[din1]], r1 \n" - "vld1.32 {d8-d9}, [%[din2]], r1 \n" - "vld1.32 {d10-d11}, [%[din3]], r1 \n" - "vld1.32 {d12-d13}, [%[din4]], r1 \n" - "vld1.32 {d14-d15}, [%[din5]], r1 \n" - - //! out zero && two - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - "vmul.f32 d16, d0, d5 \n" - "vmul.f32 d17, d0, d7 \n" - - "vld1.32 {d24-d25}, [%[wh]], r0 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - "vmla.f32 d16, d2, d7 \n" - "vmla.f32 d17, d2, d9 \n" - - "vld1.32 {d26-d27}, [%[wh]], r0 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - "vmla.f32 d16, d24, d9 \n" - "vmla.f32 d17, d24, d11 \n" - - "vld1.32 {d28-d29}, [%[wh]] \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - "vmla.f32 d16, d26, d11 \n" - "vmla.f32 d17, d26, d13 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - "vmla.f32 d16, d28, d13 \n" - "vmla.f32 d17, d28, d15 \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d16, d16, d17 \n" - "vpadd.f32 d22, d18, d19 \n" - - //! out one - "vmov.f32 q15, #0.0 \n" - "vext.32 q2, q2, q15, #1 \n" - "vext.32 q3, q3, q15, #1 \n" - "vext.32 q4, q4, q15, #1 \n" - "vext.32 q5, q5, q15, #1 \n" - "vext.32 q6, q6, q15, #1 \n" - "vext.32 q7, q7, q15, #1 \n" - - // weights r0 - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q3 \n" - - // weights r1 - "vmla.f32 q9, q1, q3 \n" - "vmla.f32 q10, q1, q4 \n" - - // weights r2 - "vmla.f32 q9, q12, q4 \n" - "vmla.f32 q10, q12, q5 \n" - - // weights r3 - "vmla.f32 q9, q13, q5 \n" - "vmla.f32 q10, q13, q6 \n" - - // weights r4 - "vmla.f32 q9, q14, q6 \n" - "vmla.f32 q10, q14, q7 \n" - - "vld1.32 {d30-d31}, [%[bias]] \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d19, d20, d21 \n" - "vpadd.f32 d23, d18, d19 \n" - "vmov.i32 q5, #0x0 \n" - - // trn out neon register - "vtrn.32 d22, d23 \n" - - // add bias - "vadd.f32 q11, q11, q15 \n" - - // relu - "vmax.f32 q11, q11, q5 \n" - - // store result - "vst1.32 {d22}, [%[dout0]]! \n" - "vst1.32 {d23}, [%[dout1]]! \n" - - //! 
out three - "sub %[wh], #80 \n" - "vld1.32 {d4[0]}, [%[din0]] \n" - "vld1.32 {d4[1]}, [%[din1]] \n" - "vld1.32 {d5[0]}, [%[din2]] \n" - "vld1.32 {d5[1]}, [%[din3]] \n" - "vld1.32 {d6[0]}, [%[din4]] \n" - "vld1.32 {d6[1]}, [%[din5]] \n" - - "vext.32 q4, q2, q3, #1 \n" - - "vld1.32 {d0[0]}, [%[wh]], r0 \n" - "vld1.32 {d0[1]}, [%[wh]], r0 \n" - "vld1.32 {d1[0]}, [%[wh]], r0 \n" - "vld1.32 {d1[1]}, [%[wh]], r0 \n" - "vld1.32 {d2[0]}, [%[wh]] \n" - - "vmul.f32 q9, q0, q2 \n" - "vmul.f32 q10, q0, q4 \n" - - "vpadd.f32 d18, d18, d19 \n" - "vpadd.f32 d20, d20, d21 \n" - "vpadd.f32 d17, d18, d20 \n" - - "vmla.f32 d17, d6, d2[0] \n" - - // trn out neon register - "vtrn.32 d16, d17 \n" - - // add bias - "vadd.f32 q8, q8, q15 \n" - - // relu - "vmax.f32 q8, q8, q5 \n" - - // store result - "vst1.32 {d16}, [%[dout0]] \n" - "vst1.32 {d17}, [%[dout1]] \n" - - : [dout0] "+r"(dout0), - [dout1] "+r"(dout1), - [din0] "+r"(din0), - [din1] "+r"(din1), - [din2] "+r"(din2), - [din3] "+r"(din3), - [din4] "+r"(din4), - [din5] "+r"(din5), - [wh] "+r"(weights) - : [bias] "r"(bias) - : "memory", - "r0", - "r1", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -} - -void conv_depthwise_5x5s1_impl(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - int pad, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - float* zero_ptr = ctx->workspace_data<float>(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - int pad_new = pad > 4 ? 4 : pad; - int pad_0 = pad - pad_new; - int h_out_new = h_out - 2 * pad_0; - int mid_out = w_out - 2 * pad; - int mid_cnt = mid_out >> 2; - int mid_remain = mid_out - (mid_cnt << 2); - int pad_cnt = pad_0 >> 2; - int pad_remain = pad_0 - (pad_cnt << 2); - int bias_cnt = (w_out * pad_0) >> 2; - int bias_remain = (w_out * pad_0) - (bias_cnt << 2); - int in_spatial_size = w_in * h_in; - int out_spatial_size = w_out * h_out; - int weights_saptial_size = 25; - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * in_spatial_size * ch_in; - float* dout_batch = dout + n * out_spatial_size * ch_out; -#pragma omp parallel for - for (int c = 0; c < ch_in; ++c) { - const float* din_ch = din_batch + c * in_spatial_size; - float* dout_ch = dout_batch + c * out_spatial_size; - float bias_c = flag_bias ? bias[c] : 0.f; - float vbias[4] = {bias_c, bias_c, bias_c, bias_c}; - float32x4_t vbias_c = vdupq_n_f32(bias_c); - if (flag_bias) { - //! deal with h_out pad_0 line with bias - for (int i = 0; i < bias_cnt; ++i) { - vst1q_f32(dout_ch, vbias_c); - dout_ch += 4; - } - for (int i = 0; i < bias_remain; ++i) { - *dout_ch++ = bias_c; - } - } else { - //! deal with h_out pad_0 line without bias - for (int i = 0; i < pad_0; ++i) { - memset(dout_ch, 0x00, w_out * sizeof(float)); - dout_ch += w_out; - } - } - const float* din_list[6]; - //! set din ptr with zero buffer - for (int i = 0; i < pad_new; ++i) { - din_list[i] = zero_ptr; - } - //! set din ptr with input data - for (int i = pad_new; i < 6; ++i) { - din_list[i] = din_ch; - din_ch += w_in; - } - //! every h loop, deal with 6 line input - const float* din0 = din_list[0]; - const float* din1 = din_list[1]; - const float* din2 = din_list[2]; - const float* din3 = din_list[3]; - const float* din4 = din_list[4]; - const float* din5 = din_list[5]; - - //! 
every h loop, deal with 2 line output - float* dout0 = dout_ch; - float* dout1 = dout0 + w_out; - - //! load weights to neon register - const float* weights_c = weights + c * weights_saptial_size; - - //! h loop - for (int h = 0; h < h_out_new; h += 2) { - //! (h - pad_new) + 7 > h_in - 1 - if (h + 6 - pad_new > h_in) { - switch (h + 6 - pad_new - h_in) { - case 5: - din1 = zero_ptr; - case 4: - din2 = zero_ptr; - case 3: - din3 = zero_ptr; - case 2: - din4 = zero_ptr; - case 1: - din5 = zero_ptr; - default: - break; - } - } - if (h + 2 > h_out_new) { - dout1 = write_ptr; - } - const float* din_ptr0 = din0; - const float* din_ptr1 = din1; - const float* din_ptr2 = din2; - const float* din_ptr3 = din3; - const float* din_ptr4 = din4; - const float* din_ptr5 = din5; - - float* dout_ptr0 = dout0; - float* dout_ptr1 = dout1; - if (flag_bias) { - //! deal with w_out pad_0 column pre with bias - for (int i = 0; i < pad_cnt; i++) { - vst1q_f32(dout_ptr0, vbias_c); - vst1q_f32(dout_ptr1, vbias_c); - dout_ptr0 += 4; - dout_ptr1 += 4; - } - for (int i = 0; i < pad_remain; ++i) { - *dout_ptr0++ = bias_c; - *dout_ptr1++ = bias_c; - } - } else { - //! deal with w_out pad_0 column pre without bias - memset(dout_ptr0, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr1, 0x00, pad_0 * sizeof(float)); - dout_ptr0 += pad_0; - dout_ptr1 += pad_0; - } - - //! deal with w_out pad_new column pre - switch (pad_new) { - case 4: - compute_four_out_extract_pre(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - dout_ptr0 += 4; - dout_ptr1 += 4; - break; - case 3: - compute_three_out_extract_pre(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - dout_ptr0 += 3; - dout_ptr1 += 3; - break; - case 2: - compute_two_out_extract_pre(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - dout_ptr0 += 2; - dout_ptr1 += 2; - break; - case 1: - compute_one_out_extract_pre(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - dout_ptr0 += 1; - dout_ptr1 += 1; - break; - } - - //! mid loop - if (mid_cnt > 0) { - int mid_loop = mid_cnt; - const float* weights_ptr = weights_c; - asm volatile( - //! din: q7-q12 - //! dout: q13, q14 - "mov r1, #20 \n" - //! load weights - "vld1.32 {d0-d1}, [%[wh]], r1 \n" - "vld1.32 {d2-d3}, [%[wh]], r1 \n" - "vld1.32 {d4-d5}, [%[wh]], r1 \n" - "vld1.32 {d6-d7}, [%[wh]], r1 \n" - "vld1.32 {d8-d9}, [%[wh]] \n" - - "sub %[wh], #64 \n" - "vld1.32 {d10[0]}, [%[wh]], r1 \n" - "vld1.32 {d10[1]}, [%[wh]], r1 \n" - "vld1.32 {d11[0]}, [%[wh]], r1 \n" - "vld1.32 {d11[1]}, [%[wh]], r1 \n" - "vld1.32 {d12[0]}, [%[wh]] \n" - - //! load input - "mov r1, #4 \n" - "vld1.32 {d14-d15}, [%[din0]], r1 \n" - "vld1.32 {d16-d17}, [%[din1]], r1 \n" - "vld1.32 {d18-d19}, [%[din2]], r1 \n" - "vld1.32 {d20-d21}, [%[din3]], r1 \n" - "vld1.32 {d22-d23}, [%[din4]], r1 \n" - "vld1.32 {d24-d25}, [%[din5]], r1 \n" - - //! load bias - "vld1.32 {d30-d31}, [%[bias]] \n" - - "1: \n" - //! 
add bias to output - "vmov.32 q13, q15 \n" - "vmov.32 q14, q15 \n" - - "pld [%[din0]] \n" - "pld [%[din1]] \n" - "pld [%[din2]] \n" - "pld [%[din3]] \n" - "pld [%[din4]] \n" - "pld [%[din5]] \n" - - // weights col 0 - "vmla.f32 q13, q7, d0[0] \n" - "vmla.f32 q14, q8, d0[0] \n" - - "vmla.f32 q13, q8, d2[0] \n" - "vmla.f32 q14, q9, d2[0] \n" - - "vld1.32 {d14-d15}, [%[din0]], r1 \n" - "vld1.32 {d16-d17}, [%[din1]], r1 \n" - - "vmla.f32 q13, q9, d4[0] \n" - "vmla.f32 q14, q10, d4[0] \n" - - "vmla.f32 q13, q10, d6[0] \n" - "vmla.f32 q14, q11, d6[0] \n" - - "vld1.32 {d18-d19}, [%[din2]], r1 \n" - "vld1.32 {d20-d21}, [%[din3]], r1 \n" - - "vmla.f32 q13, q11, d8[0] \n" - "vmla.f32 q14, q12, d8[0] \n" - - "vld1.32 {d22-d23}, [%[din4]], r1 \n" - "vld1.32 {d24-d25}, [%[din5]], r1 \n" - - // weights col 1 - "vmla.f32 q13, q7, d0[1] \n" - "vmla.f32 q14, q8, d0[1] \n" - - "vmla.f32 q13, q8, d2[1] \n" - "vmla.f32 q14, q9, d2[1] \n" - - "vld1.32 {d14-d15}, [%[din0]], r1 \n" - "vld1.32 {d16-d17}, [%[din1]], r1 \n" - - "vmla.f32 q13, q9, d4[1] \n" - "vmla.f32 q14, q10, d4[1] \n" - - "vmla.f32 q13, q10, d6[1] \n" - "vmla.f32 q14, q11, d6[1] \n" - - "vld1.32 {d18-d19}, [%[din2]], r1 \n" - "vld1.32 {d20-d21}, [%[din3]], r1 \n" - - "vmla.f32 q13, q11, d8[1] \n" - "vmla.f32 q14, q12, d8[1] \n" - - "vld1.32 {d22-d23}, [%[din4]], r1 \n" - "vld1.32 {d24-d25}, [%[din5]], r1 \n" - - // weights col 2 - "vmla.f32 q13, q7, d1[0] \n" - "vmla.f32 q14, q8, d1[0] \n" - - "vmla.f32 q13, q8, d3[0] \n" - "vmla.f32 q14, q9, d3[0] \n" - - "vld1.32 {d14-d15}, [%[din0]], r1 \n" - "vld1.32 {d16-d17}, [%[din1]], r1 \n" - - "vmla.f32 q13, q9, d5[0] \n" - "vmla.f32 q14, q10, d5[0] \n" - - "vmla.f32 q13, q10, d7[0] \n" - "vmla.f32 q14, q11, d7[0] \n" - - "vld1.32 {d18-d19}, [%[din2]], r1 \n" - "vld1.32 {d20-d21}, [%[din3]], r1 \n" - - "vmla.f32 q13, q11, d9[0] \n" - "vmla.f32 q14, q12, d9[0] \n" - - "vld1.32 {d22-d23}, [%[din4]], r1 \n" - "vld1.32 {d24-d25}, [%[din5]], r1 \n" - - // weights col 3 - "vmla.f32 q13, q7, d1[1] \n" - "vmla.f32 q14, q8, d1[1] \n" - - "vmla.f32 q13, q8, d3[1] \n" - "vmla.f32 q14, q9, d3[1] \n" - - "vld1.32 {d14-d15}, [%[din0]], r1 \n" - "vld1.32 {d16-d17}, [%[din1]], r1 \n" - - "vmla.f32 q13, q9, d5[1] \n" - "vmla.f32 q14, q10, d5[1] \n" - - "vmla.f32 q13, q10, d7[1] \n" - "vmla.f32 q14, q11, d7[1] \n" - - "vld1.32 {d18-d19}, [%[din2]], r1 \n" - "vld1.32 {d20-d21}, [%[din3]], r1 \n" - - "vmla.f32 q13, q11, d9[1] \n" - "vmla.f32 q14, q12, d9[1] \n" - - "vld1.32 {d22-d23}, [%[din4]], r1 \n" - "vld1.32 {d24-d25}, [%[din5]], r1 \n" - - // weights col 4 - "vmla.f32 q13, q7, d10[0] \n" - "vmla.f32 q14, q8, d10[0] \n" - - "vmla.f32 q13, q8, d10[1] \n" - "vmla.f32 q14, q9, d10[1] \n" - - "vmla.f32 q13, q9, d11[0] \n" - "vmla.f32 q14, q10, d11[0] \n" - - "vmla.f32 q13, q10, d11[1] \n" - "vmla.f32 q14, q11, d11[1] \n" - - "vmla.f32 q13, q11, d12[0] \n" - "vmla.f32 q14, q12, d12[0] \n" - - // store reslult - "vst1.32 {d26-d27}, [%[out0]]! \n" - "vst1.32 {d28-d29}, [%[out1]]! 
\n" - - "subs %[cnt], #1 \n" - "bne 1b \n" - - "sub %[din0], r1 \n" - "sub %[din1], r1 \n" - "sub %[din2], r1 \n" - "sub %[din3], r1 \n" - "sub %[din4], r1 \n" - "sub %[din5], r1 \n" - - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3), - [din4] "+r"(din_ptr4), - [din5] "+r"(din_ptr5), - [out0] "+r"(dout_ptr0), - [out1] "+r"(dout_ptr1), - [wh] "+r"(weights_ptr), - [cnt] "+r"(mid_loop) - : [bias] "r"(vbias) - : "cc", - "memory", - "r1", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } - //! deal with mid remain - for (int i = 0; i < mid_remain; ++i) { - compute_one_out_without_extract(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - din_ptr0++; - din_ptr1++; - din_ptr2++; - din_ptr3++; - din_ptr4++; - din_ptr5++; - - dout_ptr0++; - dout_ptr1++; - } - //! deal with w_out pad_new column post - switch (pad_new) { - case 4: - compute_four_out_extract_post(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - dout_ptr0 += 4; - dout_ptr1 += 4; - break; - case 3: - compute_three_out_extract_post(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - dout_ptr0 += 3; - dout_ptr1 += 3; - break; - case 2: - compute_two_out_extract_post(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - dout_ptr0 += 2; - dout_ptr1 += 2; - break; - case 1: - compute_one_out_extract_post(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - dout_ptr0 += 1; - dout_ptr1 += 1; - break; - } - - if (flag_bias) { - //! deal with w_out pad_0 column post with bias - memcpy(dout_ptr0, dout0, pad_0 * sizeof(float)); - memcpy(dout_ptr1, dout1, pad_0 * sizeof(float)); - } else { - //! deal with w_out pad_0 column post without bias - memset(dout_ptr0, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr1, 0x00, pad_0 * sizeof(float)); - } - - din0 = din2; - din1 = din3; - din2 = din4; - din3 = din5; - din4 = din3 + w_in; - din5 = din4 + w_in; - - dout0 = dout1 + w_out; - dout1 = dout0 + w_out; - } - float* dout_pad_end = dout_ch + h_out_new * w_out; - if (flag_bias) { - //! deal with h_out pad_0 line with bias - memcpy(reinterpret_cast<void*>(dout_pad_end), - dout_ch - pad_0 * w_out, - pad_0 * w_out * sizeof(float)); - } else { - //! deal with h_out pad_0 line without bias - memset(reinterpret_cast<void*>(dout_pad_end), - 0x00, - pad_0 * w_out * sizeof(float)); - } - } - } -} - -void conv_depthwise_5x5s1_relu_impl(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - int pad, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - float* zero_ptr = ctx->workspace_data<float>(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - int pad_new = pad > 4 ? 
4 : pad; - int pad_0 = pad - pad_new; - int h_out_new = h_out - 2 * pad_0; - int mid_out = w_out - 2 * pad; - int mid_cnt = mid_out >> 2; - int mid_remain = mid_out - (mid_cnt << 2); - int pad_cnt = pad_0 >> 2; - int pad_remain = pad_0 - (pad_cnt << 2); - int bias_cnt = (w_out * pad_0) >> 2; - int bias_remain = (w_out * pad_0) - (bias_cnt << 2); - int in_spatial_size = w_in * h_in; - int out_spatial_size = w_out * h_out; - int weights_saptial_size = 25; - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * in_spatial_size * ch_in; - float* dout_batch = dout + n * out_spatial_size * ch_out; -#pragma omp parallel for - for (int c = 0; c < ch_in; ++c) { - const float* din_ch = din_batch + c * in_spatial_size; - float* dout_ch = dout_batch + c * out_spatial_size; - float bias_c = flag_bias ? bias[c] : 0.f; - float bias_relu = bias_c > 0.f ? bias_c : 0.f; - float vbias[4] = {bias_c, bias_c, bias_c, bias_c}; - float32x4_t vbias_c = vdupq_n_f32(bias_relu); - if (flag_bias) { - //! deal with h_out pad_0 line with bias - for (int i = 0; i < bias_cnt; ++i) { - vst1q_f32(dout_ch, vbias_c); - dout_ch += 4; - } - for (int i = 0; i < bias_remain; ++i) { - *dout_ch++ = bias_relu; - } - } else { - //! deal with h_out pad_0 line without bias - for (int i = 0; i < pad_0; ++i) { - memset(dout_ch, 0x00, w_out * sizeof(float)); - dout_ch += w_out; - } - } - const float* din_list[6]; - //! set din ptr with zero buffer - for (int i = 0; i < pad_new; ++i) { - din_list[i] = zero_ptr; - } - //! set din ptr with input data - for (int i = pad_new; i < 6; ++i) { - din_list[i] = din_ch; - din_ch += w_in; - } - //! every h loop, deal with 6 line input - const float* din0 = din_list[0]; - const float* din1 = din_list[1]; - const float* din2 = din_list[2]; - const float* din3 = din_list[3]; - const float* din4 = din_list[4]; - const float* din5 = din_list[5]; - - //! every h loop, deal with 2 line output - float* dout0 = dout_ch; - float* dout1 = dout0 + w_out; - - //! load weights to neon register - const float* weights_c = weights + c * weights_saptial_size; - - //! h loop - for (int h = 0; h < h_out_new; h += 2) { - //! (h - pad_new) + 7 > h_in - 1 - if (h + 6 - pad_new > h_in) { - switch (h + 6 - pad_new - h_in) { - case 5: - din1 = zero_ptr; - case 4: - din2 = zero_ptr; - case 3: - din3 = zero_ptr; - case 2: - din4 = zero_ptr; - case 1: - din5 = zero_ptr; - default: - break; - } - } - if (h + 2 > h_out_new) { - dout1 = write_ptr; - } - const float* din_ptr0 = din0; - const float* din_ptr1 = din1; - const float* din_ptr2 = din2; - const float* din_ptr3 = din3; - const float* din_ptr4 = din4; - const float* din_ptr5 = din5; - - float* dout_ptr0 = dout0; - float* dout_ptr1 = dout1; - if (flag_bias) { - //! deal with w_out pad_0 column pre with bias - for (int i = 0; i < pad_cnt; i++) { - vst1q_f32(dout_ptr0, vbias_c); - vst1q_f32(dout_ptr1, vbias_c); - dout_ptr0 += 4; - dout_ptr1 += 4; - } - for (int i = 0; i < pad_remain; ++i) { - *dout_ptr0++ = bias_relu; - *dout_ptr1++ = bias_relu; - } - } else { - //! deal with w_out pad_0 column pre without bias - memset(dout_ptr0, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr1, 0x00, pad_0 * sizeof(float)); - dout_ptr0 += pad_0; - dout_ptr1 += pad_0; - } - - //! 
deal with w_out pad_new column pre - switch (pad_new) { - case 4: - compute_four_out_extract_pre_relu(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - dout_ptr0 += 4; - dout_ptr1 += 4; - break; - case 3: - compute_three_out_extract_pre_relu(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - dout_ptr0 += 3; - dout_ptr1 += 3; - break; - case 2: - compute_two_out_extract_pre_relu(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - dout_ptr0 += 2; - dout_ptr1 += 2; - break; - case 1: - compute_one_out_extract_pre_relu(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - dout_ptr0 += 1; - dout_ptr1 += 1; - break; - } - - //! mid loop - if (mid_cnt > 0) { - int mid_loop = mid_cnt; - const float* weights_ptr = weights_c; - asm volatile( - //! din: q7-q12 - //! dout: q13, q14 - "mov r1, #20 \n" - "vmov.i32 q15, #0x0 \n" - //! load weights - "vld1.32 {d0-d1}, [%[wh]], r1 \n" - "vld1.32 {d2-d3}, [%[wh]], r1 \n" - "vld1.32 {d4-d5}, [%[wh]], r1 \n" - "vld1.32 {d6-d7}, [%[wh]], r1 \n" - "vld1.32 {d8-d9}, [%[wh]] \n" - - "sub %[wh], #64 \n" - "vld1.32 {d10[0]}, [%[wh]], r1 \n" - "vld1.32 {d10[1]}, [%[wh]], r1 \n" - "vld1.32 {d11[0]}, [%[wh]], r1 \n" - "vld1.32 {d11[1]}, [%[wh]], r1 \n" - "vld1.32 {d12[0]}, [%[wh]] \n" - - //! load input - "mov r1, #4 \n" - "vld1.32 {d14-d15}, [%[din0]], r1 \n" - "vld1.32 {d16-d17}, [%[din1]], r1 \n" - "vld1.32 {d18-d19}, [%[din2]], r1 \n" - "vld1.32 {d20-d21}, [%[din3]], r1 \n" - "vld1.32 {d22-d23}, [%[din4]], r1 \n" - "vld1.32 {d24-d25}, [%[din5]], r1 \n" - - "1: \n" - - //! 
load bias to output - "vld1.32 {d26-d27}, [%[bias]] \n" - "vld1.32 {d28-d29}, [%[bias]] \n" - - "pld [%[din0]] \n" - "pld [%[din1]] \n" - "pld [%[din2]] \n" - "pld [%[din3]] \n" - "pld [%[din4]] \n" - "pld [%[din5]] \n" - - // weights col 0 - "vmla.f32 q13, q7, d0[0] \n" - "vmla.f32 q14, q8, d0[0] \n" - - "vmla.f32 q13, q8, d2[0] \n" - "vmla.f32 q14, q9, d2[0] \n" - - "vld1.32 {d14-d15}, [%[din0]], r1 \n" - "vld1.32 {d16-d17}, [%[din1]], r1 \n" - - "vmla.f32 q13, q9, d4[0] \n" - "vmla.f32 q14, q10, d4[0] \n" - - "vmla.f32 q13, q10, d6[0] \n" - "vmla.f32 q14, q11, d6[0] \n" - - "vld1.32 {d18-d19}, [%[din2]], r1 \n" - "vld1.32 {d20-d21}, [%[din3]], r1 \n" - - "vmla.f32 q13, q11, d8[0] \n" - "vmla.f32 q14, q12, d8[0] \n" - - "vld1.32 {d22-d23}, [%[din4]], r1 \n" - "vld1.32 {d24-d25}, [%[din5]], r1 \n" - - // weights col 1 - "vmla.f32 q13, q7, d0[1] \n" - "vmla.f32 q14, q8, d0[1] \n" - - "vmla.f32 q13, q8, d2[1] \n" - "vmla.f32 q14, q9, d2[1] \n" - - "vld1.32 {d14-d15}, [%[din0]], r1 \n" - "vld1.32 {d16-d17}, [%[din1]], r1 \n" - - "vmla.f32 q13, q9, d4[1] \n" - "vmla.f32 q14, q10, d4[1] \n" - - "vmla.f32 q13, q10, d6[1] \n" - "vmla.f32 q14, q11, d6[1] \n" - - "vld1.32 {d18-d19}, [%[din2]], r1 \n" - "vld1.32 {d20-d21}, [%[din3]], r1 \n" - - "vmla.f32 q13, q11, d8[1] \n" - "vmla.f32 q14, q12, d8[1] \n" - - "vld1.32 {d22-d23}, [%[din4]], r1 \n" - "vld1.32 {d24-d25}, [%[din5]], r1 \n" - - // weights col 2 - "vmla.f32 q13, q7, d1[0] \n" - "vmla.f32 q14, q8, d1[0] \n" - - "vmla.f32 q13, q8, d3[0] \n" - "vmla.f32 q14, q9, d3[0] \n" - - "vld1.32 {d14-d15}, [%[din0]], r1 \n" - "vld1.32 {d16-d17}, [%[din1]], r1 \n" - - "vmla.f32 q13, q9, d5[0] \n" - "vmla.f32 q14, q10, d5[0] \n" - - "vmla.f32 q13, q10, d7[0] \n" - "vmla.f32 q14, q11, d7[0] \n" - - "vld1.32 {d18-d19}, [%[din2]], r1 \n" - "vld1.32 {d20-d21}, [%[din3]], r1 \n" - - "vmla.f32 q13, q11, d9[0] \n" - "vmla.f32 q14, q12, d9[0] \n" - - "vld1.32 {d22-d23}, [%[din4]], r1 \n" - "vld1.32 {d24-d25}, [%[din5]], r1 \n" - - // weights col 3 - "vmla.f32 q13, q7, d1[1] \n" - "vmla.f32 q14, q8, d1[1] \n" - - "vmla.f32 q13, q8, d3[1] \n" - "vmla.f32 q14, q9, d3[1] \n" - - "vld1.32 {d14-d15}, [%[din0]], r1 \n" - "vld1.32 {d16-d17}, [%[din1]], r1 \n" - - "vmla.f32 q13, q9, d5[1] \n" - "vmla.f32 q14, q10, d5[1] \n" - - "vmla.f32 q13, q10, d7[1] \n" - "vmla.f32 q14, q11, d7[1] \n" - - "vld1.32 {d18-d19}, [%[din2]], r1 \n" - "vld1.32 {d20-d21}, [%[din3]], r1 \n" - - "vmla.f32 q13, q11, d9[1] \n" - "vmla.f32 q14, q12, d9[1] \n" - - "vld1.32 {d22-d23}, [%[din4]], r1 \n" - "vld1.32 {d24-d25}, [%[din5]], r1 \n" - - // weights col 4 - "vmla.f32 q13, q7, d10[0] \n" - "vmla.f32 q14, q8, d10[0] \n" - - "vmla.f32 q13, q8, d10[1] \n" - "vmla.f32 q14, q9, d10[1] \n" - - "vmla.f32 q13, q9, d11[0] \n" - "vmla.f32 q14, q10, d11[0] \n" - - "vmla.f32 q13, q10, d11[1] \n" - "vmla.f32 q14, q11, d11[1] \n" - - "vmla.f32 q13, q11, d12[0] \n" - "vmla.f32 q14, q12, d12[0] \n" - - // relu - "vmax.f32 q13, q13, q15 \n" - "vmax.f32 q14, q14, q15 \n" - - // store result - "vst1.32 {d26-d27}, [%[out0]]! \n" - "vst1.32 {d28-d29}, [%[out1]]! 
\n" - - "subs %[cnt], #1 \n" - "bne 1b \n" - - "sub %[din0], r1 \n" - "sub %[din1], r1 \n" - "sub %[din2], r1 \n" - "sub %[din3], r1 \n" - "sub %[din4], r1 \n" - "sub %[din5], r1 \n" - - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3), - [din4] "+r"(din_ptr4), - [din5] "+r"(din_ptr5), - [out0] "+r"(dout_ptr0), - [out1] "+r"(dout_ptr1), - [wh] "+r"(weights_ptr), - [cnt] "+r"(mid_loop) - : [bias] "r"(vbias) - : "cc", - "memory", - "r1", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } - //! deal with mid remain - for (int i = 0; i < mid_remain; ++i) { - compute_one_out_without_extract_relu(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - din_ptr0++; - din_ptr1++; - din_ptr2++; - din_ptr3++; - din_ptr4++; - din_ptr5++; - - dout_ptr0++; - dout_ptr1++; - } - //! deal with w_out pad_new column post - switch (pad_new) { - case 4: - compute_four_out_extract_post_relu(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - dout_ptr0 += 4; - dout_ptr1 += 4; - break; - case 3: - compute_three_out_extract_post_relu(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - dout_ptr0 += 3; - dout_ptr1 += 3; - break; - case 2: - compute_two_out_extract_post_relu(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - dout_ptr0 += 2; - dout_ptr1 += 2; - break; - case 1: - compute_one_out_extract_post_relu(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - dout_ptr0 += 1; - dout_ptr1 += 1; - break; - } - - if (flag_bias) { - //! deal with w_out pad_0 column post with bias - memcpy(dout_ptr0, dout0, pad_0 * sizeof(float)); - memcpy(dout_ptr1, dout1, pad_0 * sizeof(float)); - } else { - //! deal with w_out pad_0 column post without bias - memset(dout_ptr0, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr1, 0x00, pad_0 * sizeof(float)); - } - - din0 = din2; - din1 = din3; - din2 = din4; - din3 = din5; - din4 = din3 + w_in; - din5 = din4 + w_in; - - dout0 = dout1 + w_out; - dout1 = dout0 + w_out; - } - float* dout_pad_end = dout_ch + h_out_new * w_out; - if (flag_bias) { - //! deal with h_out pad_0 line with bias - memcpy(reinterpret_cast<void*>(dout_pad_end), - dout_ch - pad_0 * w_out, - pad_0 * w_out * sizeof(float)); - } else { - //! deal with h_out pad_0 line without bias - memset(reinterpret_cast<void*>(dout_pad_end), - 0x00, - pad_0 * w_out * sizeof(float)); - } - } - } -} - -void conv_depthwise_5x5s1_small_impl(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - int pad, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - int pad_new = pad > 4 ? 
4 : pad; - int pad_0 = pad - pad_new; - int h_in_new = h_in + 2 * pad_new; - int w_in_new = w_in + 2 * pad_new; - int h_out_new = h_out - 2 * pad_0; - int w_out_new = w_out - 2 * pad_0; - float zero_ptr[w_in_new + w_out]; - memset(zero_ptr, 0, w_in_new * sizeof(float)); - float* write_ptr = zero_ptr + w_in_new; - int pad_cnt = pad_0 >> 2; - int pad_remain = pad_0 - (pad_cnt << 2); - int bias_cnt = (w_out * pad_0) >> 2; - int bias_remain = (w_out * pad_0) - (bias_cnt << 2); - int in_spatial_size = w_in_new * h_in_new; - int out_spatial_size = w_out * h_out; - int weights_saptial_size = 25; - - float* din_new = prepad_input(din, num, ch_in, h_in, w_in, pad_new); - for (int n = 0; n < num; ++n) { - const float* din_batch = din_new + n * in_spatial_size * ch_in; - float* dout_batch = dout + n * out_spatial_size * ch_out; -#pragma omp parallel for - for (int c = 0; c < ch_in; ++c) { - const float* din_ch = din_batch + c * in_spatial_size; - float* dout_ch = dout_batch + c * out_spatial_size; - float bias_c = flag_bias ? bias[c] : 0.f; - float vbias[4] = {bias_c, bias_c, bias_c, bias_c}; - float32x4_t vbias_c = vdupq_n_f32(bias_c); - if (flag_bias) { - //! deal with h_out pad_0 line with bias - for (int i = 0; i < bias_cnt; ++i) { - vst1q_f32(dout_ch, vbias_c); - dout_ch += 4; - } - for (int i = 0; i < bias_remain; ++i) { - *dout_ch++ = bias_c; - } - } else { - //! deal with h_out pad_0 line without bias - for (int i = 0; i < pad_0; ++i) { - memset(dout_ch, 0x00, w_out * sizeof(float)); - dout_ch += w_out; - } - } - //! every h loop, deal with 6 line input - const float* din0 = din_ch; - const float* din1 = din0 + w_in_new; - const float* din2 = din1 + w_in_new; - const float* din3 = din2 + w_in_new; - const float* din4 = din3 + w_in_new; - const float* din5 = din4 + w_in_new; - //! every h loop, deal with 2 line output - float* dout0 = dout_ch; - float* dout1 = dout0 + w_out; - - const float* weights_c = weights + c * weights_saptial_size; - - //! h loop - for (int h = 0; h < h_out_new; h += 2) { - //! (h - pad_new) + 6 > h_in - 1 - if (h + 6 > h_in_new) { - switch (h + 6 - h_in_new) { - case 5: - din1 = zero_ptr; - case 4: - din2 = zero_ptr; - case 3: - din3 = zero_ptr; - case 2: - din4 = zero_ptr; - case 1: - din5 = zero_ptr; - default: - break; - } - } - if (h + 2 > h_out_new) { - dout1 = write_ptr; - } - const float* din_ptr0 = din0; - const float* din_ptr1 = din1; - const float* din_ptr2 = din2; - const float* din_ptr3 = din3; - const float* din_ptr4 = din4; - const float* din_ptr5 = din5; - - float* dout_ptr0 = dout0; - float* dout_ptr1 = dout1; - - if (flag_bias) { - //! deal with w_out pad_0 column pre with bias - for (int i = 0; i < pad_cnt; i++) { - vst1q_f32(dout_ptr0, vbias_c); - vst1q_f32(dout_ptr1, vbias_c); - dout_ptr0 += 4; - dout_ptr1 += 4; - } - for (int i = 0; i < pad_remain; ++i) { - *dout_ptr0++ = bias_c; - *dout_ptr1++ = bias_c; - } - } else { - //! deal with w_out pad_0 column pre without bias - memset(dout_ptr0, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr1, 0x00, pad_0 * sizeof(float)); - dout_ptr0 += pad_0; - dout_ptr1 += pad_0; - } - //! mid loop - for (int i = 0; i < w_out_new; ++i) { - compute_one_out_without_extract(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - din_ptr0++; - din_ptr1++; - din_ptr2++; - din_ptr3++; - din_ptr4++; - din_ptr5++; - - dout_ptr0++; - dout_ptr1++; - } - if (flag_bias) { - //! 
deal with w_out pad_0 column post with bias - memcpy(dout_ptr0, dout0, pad_0 * sizeof(float)); - memcpy(dout_ptr1, dout1, pad_0 * sizeof(float)); - } else { - //! deal with w_out pad_0 column post without bias - memset(dout_ptr0, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr1, 0x00, pad_0 * sizeof(float)); - } - - din0 = din2; - din1 = din3; - din2 = din4; - din3 = din5; - din4 = din3 + w_in_new; - din5 = din4 + w_in_new; - - dout0 = dout1 + w_out; - dout1 = dout0 + w_out; - } - float* dout_pad_end = dout_ch + h_out_new * w_out; - if (flag_bias) { - //! deal with h_out pad_0 line with bias - memcpy(reinterpret_cast<void*>(dout_pad_end), - dout_ch - pad_0 * w_out, - pad_0 * w_out * sizeof(float)); - } else { - //! deal with h_out pad_0 line without bias - memset(reinterpret_cast<void*>(dout_pad_end), - 0x00, - pad_0 * w_out * sizeof(float)); - } - } - } - free(din_new); -} - -void conv_depthwise_5x5s1_small_relu_impl(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - int pad, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - int pad_new = pad > 4 ? 4 : pad; - int pad_0 = pad - pad_new; - int h_in_new = h_in + 2 * pad_new; - int w_in_new = w_in + 2 * pad_new; - int h_out_new = h_out - 2 * pad_0; - int w_out_new = w_out - 2 * pad_0; - float zero_ptr[w_in_new + w_out]; - memset(zero_ptr, 0, w_in_new * sizeof(float)); - float* write_ptr = zero_ptr + w_in_new; - int pad_cnt = pad_0 >> 2; - int pad_remain = pad_0 - (pad_cnt << 2); - int bias_cnt = (w_out * pad_0) >> 2; - int bias_remain = (w_out * pad_0) - (bias_cnt << 2); - int in_spatial_size = w_in_new * h_in_new; - int out_spatial_size = w_out * h_out; - int weights_saptial_size = 25; - - float* din_new = prepad_input(din, num, ch_in, h_in, w_in, pad_new); - for (int n = 0; n < num; ++n) { - const float* din_batch = din_new + n * in_spatial_size * ch_in; - float* dout_batch = dout + n * out_spatial_size * ch_out; -#pragma omp parallel for - for (int c = 0; c < ch_in; ++c) { - const float* din_ch = din_batch + c * in_spatial_size; - float* dout_ch = dout_batch + c * out_spatial_size; - float bias_c = flag_bias ? bias[c] : 0.f; - float bias_relu = bias_c > 0.f ? bias_c : 0.f; - float vbias[4] = {bias_c, bias_c, bias_c, bias_c}; - float32x4_t vbias_c = vdupq_n_f32(bias_relu); - if (flag_bias) { - //! deal with h_out pad_0 line with bias - for (int i = 0; i < bias_cnt; ++i) { - vst1q_f32(dout_ch, vbias_c); - dout_ch += 4; - } - for (int i = 0; i < bias_remain; ++i) { - *dout_ch++ = bias_relu; - } - } else { - //! deal with h_out pad_0 line without bias - for (int i = 0; i < pad_0; ++i) { - memset(dout_ch, 0x00, w_out * sizeof(float)); - dout_ch += w_out; - } - } - //! every h loop, deal with 6 line input - const float* din0 = din_ch; - const float* din1 = din0 + w_in_new; - const float* din2 = din1 + w_in_new; - const float* din3 = din2 + w_in_new; - const float* din4 = din3 + w_in_new; - const float* din5 = din4 + w_in_new; - //! every h loop, deal with 2 line output - float* dout0 = dout_ch; - float* dout1 = dout0 + w_out; - - const float* weights_c = weights + c * weights_saptial_size; - - //! h loop - for (int h = 0; h < h_out_new; h += 2) { - //! 
(h - pad_new) + 6 > h_in - 1 - if (h + 6 > h_in_new) { - switch (h + 6 - h_in_new) { - case 5: - din1 = zero_ptr; - case 4: - din2 = zero_ptr; - case 3: - din3 = zero_ptr; - case 2: - din4 = zero_ptr; - case 1: - din5 = zero_ptr; - default: - break; - } - } - if (h + 2 > h_out_new) { - dout1 = write_ptr; - } - const float* din_ptr0 = din0; - const float* din_ptr1 = din1; - const float* din_ptr2 = din2; - const float* din_ptr3 = din3; - const float* din_ptr4 = din4; - const float* din_ptr5 = din5; - - const float* weights_ptr = weights_c; - float* dout_ptr0 = dout0; - float* dout_ptr1 = dout1; - - if (flag_bias) { - //! deal with w_out pad_0 column pre with bias - for (int i = 0; i < pad_cnt; i++) { - vst1q_f32(dout_ptr0, vbias_c); - vst1q_f32(dout_ptr1, vbias_c); - dout_ptr0 += 4; - dout_ptr1 += 4; - } - for (int i = 0; i < pad_remain; ++i) { - *dout_ptr0++ = bias_relu; - *dout_ptr1++ = bias_relu; - } - } else { - //! deal with w_out pad_0 column pre without bias - memset(dout_ptr0, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr1, 0x00, pad_0 * sizeof(float)); - dout_ptr0 += pad_0; - dout_ptr1 += pad_0; - } - //! mid loop - for (int i = 0; i < w_out_new; ++i) { - compute_one_out_without_extract_relu(din_ptr0, - din_ptr1, - din_ptr2, - din_ptr3, - din_ptr4, - din_ptr5, - dout_ptr0, - dout_ptr1, - weights_c, - vbias); - din_ptr0++; - din_ptr1++; - din_ptr2++; - din_ptr3++; - din_ptr4++; - din_ptr5++; - - dout_ptr0++; - dout_ptr1++; - } - if (flag_bias) { - //! deal with w_out pad_0 column post with bias - memcpy(dout_ptr0, dout0, pad_0 * sizeof(float)); - memcpy(dout_ptr1, dout1, pad_0 * sizeof(float)); - } else { - //! deal with w_out pad_0 column post without bias - memset(dout_ptr0, 0x00, pad_0 * sizeof(float)); - memset(dout_ptr1, 0x00, pad_0 * sizeof(float)); - } - - din0 = din2; - din1 = din3; - din2 = din4; - din3 = din5; - din4 = din3 + w_in_new; - din5 = din4 + w_in_new; - - dout0 = dout1 + w_out; - dout1 = dout0 + w_out; - } - float* dout_pad_end = dout_ch + h_out_new * w_out; - if (flag_bias) { - //! deal with h_out pad_0 line with bias - memcpy(reinterpret_cast<void*>(dout_pad_end), - dout_ch - pad_0 * w_out, - pad_0 * w_out * sizeof(float)); - } else { - //! 
deal with h_out pad_0 line without bias - memset(reinterpret_cast<void*>(dout_pad_end), - 0x00, - pad_0 * w_out * sizeof(float)); - } - } - } - free(din_new); -} -#endif // __aarch64__ - -void conv_depthwise_5x5s1(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const float* weights, - const float* bias, - int pad, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - if (win < 4) { - if (flag_relu) { - conv_depthwise_5x5s1_small_relu_impl(din, - dout, - num, - chout, - hout, - wout, - chin, - hin, - win, - weights, - bias, - pad, - flag_bias, - flag_relu, - ctx); - } else { - conv_depthwise_5x5s1_small_impl(din, - dout, - num, - chout, - hout, - wout, - chin, - hin, - win, - weights, - bias, - pad, - flag_bias, - flag_relu, - ctx); - } - } else { - if (flag_relu) { - conv_depthwise_5x5s1_relu_impl(din, - dout, - num, - chout, - hout, - wout, - chin, - hin, - win, - weights, - bias, - pad, - flag_bias, - flag_relu, - ctx); - } else { - conv_depthwise_5x5s1_impl(din, - dout, - num, - chout, - hout, - wout, - chin, - hin, - win, - weights, - bias, - pad, - flag_bias, - flag_relu, - ctx); - } - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_depthwise_5x5s1_int8.cc b/lite/backends/arm/math/conv_depthwise_5x5s1_int8.cc deleted file mode 100644 index 0d0034dd85..0000000000 --- a/lite/backends/arm/math/conv_depthwise_5x5s1_int8.cc +++ /dev/null @@ -1,618 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
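// Reference semantics for the kernels removed in these two files: a 5x5,
// stride-1 depthwise convolution over NCHW data with an optional per-channel
// bias and an optionally fused ReLU. The scalar sketch below is illustrative
// only; the helper name, the layout assumptions (ch x 25 row-major weights),
// and the zero-padded border handling are editorial assumptions, not code
// carried by this patch.
#include <algorithm>

static void conv_dw_5x5s1_ref(const float* din, float* dout, int ch,
                              int h_in, int w_in, int pad,
                              const float* weights,  // ch x 25, row-major 5x5
                              const float* bias,     // ch entries, or nullptr
                              bool flag_relu) {
  // stride 1 with symmetric padding: out = in + 2 * pad - 4
  int h_out = h_in + 2 * pad - 4;
  int w_out = w_in + 2 * pad - 4;
  for (int c = 0; c < ch; ++c) {
    const float* din_c = din + c * h_in * w_in;
    const float* w_c = weights + c * 25;  // one 5x5 filter per channel
    float* dout_c = dout + c * h_out * w_out;
    float bias_c = bias ? bias[c] : 0.f;
    for (int oh = 0; oh < h_out; ++oh) {
      for (int ow = 0; ow < w_out; ++ow) {
        float sum = bias_c;
        for (int kh = 0; kh < 5; ++kh) {
          for (int kw = 0; kw < 5; ++kw) {
            int ih = oh - pad + kh;  // input row; reads as zero off-image
            int iw = ow - pad + kw;  // input col; reads as zero off-image
            if (ih >= 0 && ih < h_in && iw >= 0 && iw < w_in) {
              sum += din_c[ih * w_in + iw] * w_c[kh * 5 + kw];
            }
          }
        }
        dout_c[oh * w_out + ow] = flag_relu ? std::max(sum, 0.f) : sum;
      }
    }
  }
}
// The NEON kernels compute the same result per element; they only reorder
// the work (two output rows per pass, four outputs per inner iteration, and
// dedicated border kernels for the padded columns).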
- -#include <arm_neon.h> -#include "lite/backends/arm/math/conv_block_utils.h" -#include "lite/backends/arm/math/conv_impl.h" -#include "lite/core/context.h" -#include "lite/operators/op_params.h" -#ifdef ARM_WITH_OMP -#include <omp.h> -#endif - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void conv_depthwise_5x5s1_int8(int32_t* dout, - const int8_t* din, - const int8_t* weights, - const int* bias, - bool flag_bias, - bool flag_relu, - const int num, - const int chin, - const int hin, - const int win, - const int hout, - const int wout, - ARMContext* ctx, - PrecisionType out_type, - const float* scale); - -void conv_depthwise_5x5_int8(const int8_t* din, - int32_t* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const int8_t* weights, - const int32_t* bias, - const operators::ConvParam& param, - ARMContext* ctx, - PrecisionType out_type, - const float* scale) { - int stride_h = param.strides[0]; - bool flag_relu = param.fuse_relu; - bool flag_bias = param.bias != nullptr; - // if (param.activation_param.has_active){ - // if (param.activation_param.active == Active_relu || - // fabs(param.activation_param.negative_slope) > 1e-6f){ - // flag_relu = true; - // } - // } - if (stride_h == 1) { -#ifdef __aarch64__ - conv_depthwise_5x5s1_int8(dout, - din, - weights, - bias, - flag_bias, - flag_relu, - num, - chin, - hin, - win, - hout, - wout, - ctx, - out_type, - scale); -#else - - LOG(FATAL) << "the 5x5 dw conv int8 kernel has not been implemented for armv7"; -#endif - } -} - -/** - * \brief depthwise convolution, kernel size 5x5, stride 1, pad 2, with bias, - * width > 4 - */ -// 3 output lines per iteration -#ifdef __aarch64__ - -template <typename Dtype> -inline void prefetch(const Dtype* din) { -#ifdef __aarch64__ - asm volatile("PRFM PLDL1KEEP, [%[din]] \n" : : [din] "r"(din) : "memory"); -#else - asm volatile("pld [%[din]] \n" : : [din] "r"(din) : "memory"); -#endif -} - -void conv_depthwise_5x5s1_int8( - int32_t* dout, - const int8_t* din, - const int8_t* weights, - const int32_t* bias, - bool flag_bias, - bool flag_relu, - const int num, - const int chin, - const int hin, - const int win, - const int hout, - const int wout, - ARMContext* ctx, - PrecisionType od_type, - float const* scales) { /// scale_size = channel-out - - // printf("5*5 multiply\n"); - int size_in_channel = win * hin; - int size_out_channel = wout * hout; - int w_stride = 5 * 5; - - static int const stride_w = 1; - int const stride_h = stride_w; - int const chout = chin; - int const pad_w = 2; - int const pad_h = pad_w; - - int const wout_round = ((wout + 7) / 8) * 8; - int const win_round = wout_round * stride_w + 5 - 1; - int const hout_round = ((hout + 2) / 3) * 3; - int const hin_round = hout_round * stride_h + 5 - 1; - int const tile_h = hout_round / 3; - int const tile_w = wout_round / 8; - - int const pre_in_size = hin_round * win_round; - int const pre_out_size = hout_round * wout_round; - int const pre_io_size = pre_in_size + pre_out_size * sizeof(int); - - int const hs = -pad_h; - int const he = hs + hin_round; - int const ws = -pad_w; - int const we = ws + win_round; - - // signed char* tmp_work_space = new signed char [1024*5]; - signed char* tmp_work_space = ctx->workspace_data<signed char>(); - signed char* ptr_zero = tmp_work_space; - int* ptr_write = reinterpret_cast<int*>(ptr_zero + win_round); - signed char* pre_data = - reinterpret_cast<signed char*>(ptr_write + wout_round); - - memset(ptr_zero, 0, win_round * sizeof(signed char)); - - for (int n = 0; n < num; ++n) { - signed char const* din_batch = din + n * chin * size_in_channel; - int* dout_batch
= dout + n * chout * size_out_channel; - - // #pragma omp parallel for - for (int c = 0; c < chout; c++) { -#ifdef ARM_WITH_OMP - int const thno = omp_get_thread_num(); -#else - int const thno = 0; -#endif - signed char const* din_channel = din_batch + c * size_in_channel; - signed char* pre_din = pre_data + thno * pre_io_size; - int* pre_out = reinterpret_cast(pre_din + pre_in_size); - int* dout_ptr = pre_out; - - prepack_input_nxw(din_channel, - pre_din, - c, - c + 1, - hs, - he, - ws, - we, - 1, - win, - hin, - ptr_zero); - - signed char const* wei_ptr = weights + c * w_stride; - int bias_val = flag_bias ? bias[c] : 0.f; - - int8x8_t wr00 = vdup_n_s8(wei_ptr[0 * 5 + 0]); - int8x8_t wr01 = vdup_n_s8(wei_ptr[0 * 5 + 1]); - int8x8_t wr02 = vdup_n_s8(wei_ptr[0 * 5 + 2]); - int8x8_t wr03 = vdup_n_s8(wei_ptr[0 * 5 + 3]); - int8x8_t wr04 = vdup_n_s8(wei_ptr[0 * 5 + 4]); - - int8x8_t wr10 = vdup_n_s8(wei_ptr[1 * 5 + 0]); - int8x8_t wr11 = vdup_n_s8(wei_ptr[1 * 5 + 1]); - int8x8_t wr12 = vdup_n_s8(wei_ptr[1 * 5 + 2]); - int8x8_t wr13 = vdup_n_s8(wei_ptr[1 * 5 + 3]); - int8x8_t wr14 = vdup_n_s8(wei_ptr[1 * 5 + 4]); - - int8x8_t wr20 = vdup_n_s8(wei_ptr[2 * 5 + 0]); - int8x8_t wr21 = vdup_n_s8(wei_ptr[2 * 5 + 1]); - int8x8_t wr22 = vdup_n_s8(wei_ptr[2 * 5 + 2]); - int8x8_t wr23 = vdup_n_s8(wei_ptr[2 * 5 + 3]); - int8x8_t wr24 = vdup_n_s8(wei_ptr[2 * 5 + 4]); - - int8x8_t wr30 = vdup_n_s8(wei_ptr[3 * 5 + 0]); - int8x8_t wr31 = vdup_n_s8(wei_ptr[3 * 5 + 1]); - int8x8_t wr32 = vdup_n_s8(wei_ptr[3 * 5 + 2]); - int8x8_t wr33 = vdup_n_s8(wei_ptr[3 * 5 + 3]); - int8x8_t wr34 = vdup_n_s8(wei_ptr[3 * 5 + 4]); - - int8x8_t wr40 = vdup_n_s8(wei_ptr[4 * 5 + 0]); - int8x8_t wr41 = vdup_n_s8(wei_ptr[4 * 5 + 1]); - int8x8_t wr42 = vdup_n_s8(wei_ptr[4 * 5 + 2]); - int8x8_t wr43 = vdup_n_s8(wei_ptr[4 * 5 + 3]); - int8x8_t wr44 = vdup_n_s8(wei_ptr[4 * 5 + 4]); - - int* doutr0 = nullptr; - int* doutr1 = nullptr; - int* doutr2 = nullptr; - - signed char const* dr0 = pre_din; - signed char const* dr1 = dr0 + win_round; - signed char const* dr2 = dr1 + win_round; - signed char const* dr3 = dr2 + win_round; - signed char const* dr4 = dr3 + win_round; - signed char const* dr5 = dr4 + win_round; - signed char const* dr6 = dr5 + win_round; - - signed char const* din_ptr0 = nullptr; - signed char const* din_ptr1 = nullptr; - signed char const* din_ptr2 = nullptr; - signed char const* din_ptr3 = nullptr; - signed char const* din_ptr4 = nullptr; - signed char const* din_ptr5 = nullptr; - signed char const* din_ptr6 = nullptr; - - for (int h = 0; h < tile_h; h++) { - // printf("c:%d h:%d\n", c, h); - doutr0 = dout_ptr; - doutr1 = doutr0 + wout_round; - doutr2 = doutr1 + wout_round; - - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - din_ptr3 = dr3; - din_ptr4 = dr4; - din_ptr5 = dr5; - din_ptr6 = dr6; - - prefetch(doutr0); - prefetch(doutr1); - prefetch(doutr2); - prefetch(din_ptr0); - prefetch(din_ptr1); - prefetch(din_ptr2); - prefetch(din_ptr3); - prefetch(din_ptr4); - prefetch(din_ptr5); - prefetch(din_ptr6); - - for (int j = 0; j < tile_w; ++j) { - // printf("j:%d\n", j); - int32x4_t voutr00 = vdupq_n_s32(bias_val); - int32x4_t voutr01 = vdupq_n_s32(bias_val); - int32x4_t voutr10 = vdupq_n_s32(bias_val); - int32x4_t voutr11 = vdupq_n_s32(bias_val); - int32x4_t voutr20 = vdupq_n_s32(bias_val); - int32x4_t voutr21 = vdupq_n_s32(bias_val); - - // din data - int8x8_t vinr00 = vld1_s8(din_ptr0 + 0); - int8x8_t vinr01 = vld1_s8(din_ptr0 + 8); - int8x8_t vinr10 = vld1_s8(din_ptr1 + 0); - int8x8_t vinr11 = vld1_s8(din_ptr1 
+ 8); - int8x8_t vinr20 = vld1_s8(din_ptr2 + 0); - int8x8_t vinr21 = vld1_s8(din_ptr2 + 8); - int8x8_t vinr30 = vld1_s8(din_ptr3 + 0); - int8x8_t vinr31 = vld1_s8(din_ptr3 + 8); - int8x8_t vinr40 = vld1_s8(din_ptr4 + 0); - int8x8_t vinr41 = vld1_s8(din_ptr4 + 8); - int8x8_t vinr50 = vld1_s8(din_ptr5 + 0); - int8x8_t vinr51 = vld1_s8(din_ptr5 + 8); - int8x8_t vinr60 = vld1_s8(din_ptr6 + 0); - int8x8_t vinr61 = vld1_s8(din_ptr6 + 8); - - /// the first row - // r0 - int8x8_t vtmp1 = vext_s8(vinr00, vinr01, 1); // 12345678 - int8x8_t vtmp2 = vext_s8(vinr00, vinr01, 2); // 2345678 - int8x8_t vtmp3 = vext_s8(vinr00, vinr01, 3); // 345678 - int8x8_t vtmp4 = vext_s8(vinr00, vinr01, 4); // 45678 - - int16x8_t tvoutr0 = vmull_s8(vinr00, wr00); - tvoutr0 = vmlal_s8(tvoutr0, vtmp1, wr01); - voutr00 = vaddw_s16(voutr00, vget_low_s16(tvoutr0)); - voutr01 = vaddw_s16(voutr01, vget_high_s16(tvoutr0)); - tvoutr0 = vmull_s8(vtmp2, wr02); - tvoutr0 = vmlal_s8(tvoutr0, vtmp3, wr03); - voutr00 = vaddw_s16(voutr00, vget_low_s16(tvoutr0)); - voutr01 = vaddw_s16(voutr01, vget_high_s16(tvoutr0)); - tvoutr0 = vmull_s8(vtmp4, wr04); - voutr00 = vaddw_s16(voutr00, vget_low_s16(tvoutr0)); - voutr01 = vaddw_s16(voutr01, vget_high_s16(tvoutr0)); - - // r1 - vtmp1 = vext_s8(vinr10, vinr11, 1); // 12345678 - vtmp2 = vext_s8(vinr10, vinr11, 2); // 2345678 - vtmp3 = vext_s8(vinr10, vinr11, 3); // 345678 - vtmp4 = vext_s8(vinr10, vinr11, 4); // 45678 - - tvoutr0 = vmull_s8(vinr10, wr10); - tvoutr0 = vmlal_s8(tvoutr0, vtmp1, wr11); - voutr00 = vaddw_s16(voutr00, vget_low_s16(tvoutr0)); - voutr01 = vaddw_s16(voutr01, vget_high_s16(tvoutr0)); - tvoutr0 = vmull_s8(vtmp2, wr12); - tvoutr0 = vmlal_s8(tvoutr0, vtmp3, wr13); - voutr00 = vaddw_s16(voutr00, vget_low_s16(tvoutr0)); - voutr01 = vaddw_s16(voutr01, vget_high_s16(tvoutr0)); - tvoutr0 = vmull_s8(vtmp4, wr14); - voutr00 = vaddw_s16(voutr00, vget_low_s16(tvoutr0)); - voutr01 = vaddw_s16(voutr01, vget_high_s16(tvoutr0)); - - int16x8_t tvoutr1 = vmull_s8(vinr10, wr00); - tvoutr1 = vmlal_s8(tvoutr1, vtmp1, wr01); - voutr10 = vaddw_s16(voutr10, vget_low_s16(tvoutr1)); - voutr11 = vaddw_s16(voutr11, vget_high_s16(tvoutr1)); - tvoutr1 = vmull_s8(vtmp2, wr02); - tvoutr1 = vmlal_s8(tvoutr1, vtmp3, wr03); - voutr10 = vaddw_s16(voutr10, vget_low_s16(tvoutr1)); - voutr11 = vaddw_s16(voutr11, vget_high_s16(tvoutr1)); - tvoutr1 = vmull_s8(vtmp4, wr04); - voutr10 = vaddw_s16(voutr10, vget_low_s16(tvoutr1)); - voutr11 = vaddw_s16(voutr11, vget_high_s16(tvoutr1)); - - // r2 - vtmp1 = vext_s8(vinr20, vinr21, 1); // 12345678 - vtmp2 = vext_s8(vinr20, vinr21, 2); // 2345678 - vtmp3 = vext_s8(vinr20, vinr21, 3); // 345678 - vtmp4 = vext_s8(vinr20, vinr21, 4); // 45678 - - tvoutr0 = vmull_s8(vinr20, wr20); - tvoutr0 = vmlal_s8(tvoutr0, vtmp1, wr21); - voutr00 = vaddw_s16(voutr00, vget_low_s16(tvoutr0)); - voutr01 = vaddw_s16(voutr01, vget_high_s16(tvoutr0)); - tvoutr0 = vmull_s8(vtmp2, wr22); - tvoutr0 = vmlal_s8(tvoutr0, vtmp3, wr23); - voutr00 = vaddw_s16(voutr00, vget_low_s16(tvoutr0)); - voutr01 = vaddw_s16(voutr01, vget_high_s16(tvoutr0)); - tvoutr0 = vmull_s8(vtmp4, wr24); - voutr00 = vaddw_s16(voutr00, vget_low_s16(tvoutr0)); - voutr01 = vaddw_s16(voutr01, vget_high_s16(tvoutr0)); - - tvoutr1 = vmull_s8(vinr20, wr10); - tvoutr1 = vmlal_s8(tvoutr1, vtmp1, wr11); - voutr10 = vaddw_s16(voutr10, vget_low_s16(tvoutr1)); - voutr11 = vaddw_s16(voutr11, vget_high_s16(tvoutr1)); - tvoutr1 = vmull_s8(vtmp2, wr12); - tvoutr1 = vmlal_s8(tvoutr1, vtmp3, wr13); - voutr10 = vaddw_s16(voutr10, 
vget_low_s16(tvoutr1)); - voutr11 = vaddw_s16(voutr11, vget_high_s16(tvoutr1)); - tvoutr1 = vmull_s8(vtmp4, wr14); - voutr10 = vaddw_s16(voutr10, vget_low_s16(tvoutr1)); - voutr11 = vaddw_s16(voutr11, vget_high_s16(tvoutr1)); - - int16x8_t tvoutr2 = vmull_s8(vinr20, wr00); - tvoutr2 = vmlal_s8(tvoutr2, vtmp1, wr01); - voutr20 = vaddw_s16(voutr20, vget_low_s16(tvoutr2)); - voutr21 = vaddw_s16(voutr21, vget_high_s16(tvoutr2)); - tvoutr2 = vmull_s8(vtmp2, wr02); - tvoutr2 = vmlal_s8(tvoutr2, vtmp3, wr03); - voutr20 = vaddw_s16(voutr20, vget_low_s16(tvoutr2)); - voutr21 = vaddw_s16(voutr21, vget_high_s16(tvoutr2)); - tvoutr2 = vmull_s8(vtmp4, wr04); - voutr20 = vaddw_s16(voutr20, vget_low_s16(tvoutr2)); - voutr21 = vaddw_s16(voutr21, vget_high_s16(tvoutr2)); - - // r3 - vtmp1 = vext_s8(vinr30, vinr31, 1); // 12345678 - vtmp2 = vext_s8(vinr30, vinr31, 2); // 2345678 - vtmp3 = vext_s8(vinr30, vinr31, 3); // 345678 - vtmp4 = vext_s8(vinr30, vinr31, 4); // 45678 - - tvoutr0 = vmull_s8(vinr30, wr30); - tvoutr0 = vmlal_s8(tvoutr0, vtmp1, wr31); - voutr00 = vaddw_s16(voutr00, vget_low_s16(tvoutr0)); - voutr01 = vaddw_s16(voutr01, vget_high_s16(tvoutr0)); - tvoutr0 = vmull_s8(vtmp2, wr32); - tvoutr0 = vmlal_s8(tvoutr0, vtmp3, wr33); - voutr00 = vaddw_s16(voutr00, vget_low_s16(tvoutr0)); - voutr01 = vaddw_s16(voutr01, vget_high_s16(tvoutr0)); - tvoutr0 = vmull_s8(vtmp4, wr34); - voutr00 = vaddw_s16(voutr00, vget_low_s16(tvoutr0)); - voutr01 = vaddw_s16(voutr01, vget_high_s16(tvoutr0)); - - tvoutr1 = vmull_s8(vinr30, wr20); - tvoutr1 = vmlal_s8(tvoutr1, vtmp1, wr21); - voutr10 = vaddw_s16(voutr10, vget_low_s16(tvoutr1)); - voutr11 = vaddw_s16(voutr11, vget_high_s16(tvoutr1)); - tvoutr1 = vmull_s8(vtmp2, wr22); - tvoutr1 = vmlal_s8(tvoutr1, vtmp3, wr23); - voutr10 = vaddw_s16(voutr10, vget_low_s16(tvoutr1)); - voutr11 = vaddw_s16(voutr11, vget_high_s16(tvoutr1)); - tvoutr1 = vmull_s8(vtmp4, wr24); - voutr10 = vaddw_s16(voutr10, vget_low_s16(tvoutr1)); - voutr11 = vaddw_s16(voutr11, vget_high_s16(tvoutr1)); - - tvoutr2 = vmull_s8(vinr30, wr10); - tvoutr2 = vmlal_s8(tvoutr2, vtmp1, wr11); - voutr20 = vaddw_s16(voutr20, vget_low_s16(tvoutr2)); - voutr21 = vaddw_s16(voutr21, vget_high_s16(tvoutr2)); - tvoutr2 = vmull_s8(vtmp2, wr12); - tvoutr2 = vmlal_s8(tvoutr2, vtmp3, wr13); - voutr20 = vaddw_s16(voutr20, vget_low_s16(tvoutr2)); - voutr21 = vaddw_s16(voutr21, vget_high_s16(tvoutr2)); - tvoutr2 = vmull_s8(vtmp4, wr14); - voutr20 = vaddw_s16(voutr20, vget_low_s16(tvoutr2)); - voutr21 = vaddw_s16(voutr21, vget_high_s16(tvoutr2)); - - // r4 - vtmp1 = vext_s8(vinr40, vinr41, 1); // 12345678 - vtmp2 = vext_s8(vinr40, vinr41, 2); // 2345678 - vtmp3 = vext_s8(vinr40, vinr41, 3); // 345678 - vtmp4 = vext_s8(vinr40, vinr41, 4); // 45678 - - tvoutr0 = vmull_s8(vinr40, wr40); - tvoutr0 = vmlal_s8(tvoutr0, vtmp1, wr41); - voutr00 = vaddw_s16(voutr00, vget_low_s16(tvoutr0)); - voutr01 = vaddw_s16(voutr01, vget_high_s16(tvoutr0)); - tvoutr0 = vmull_s8(vtmp2, wr42); - tvoutr0 = vmlal_s8(tvoutr0, vtmp3, wr43); - voutr00 = vaddw_s16(voutr00, vget_low_s16(tvoutr0)); - voutr01 = vaddw_s16(voutr01, vget_high_s16(tvoutr0)); - tvoutr0 = vmull_s8(vtmp4, wr44); - voutr00 = vaddw_s16(voutr00, vget_low_s16(tvoutr0)); - voutr01 = vaddw_s16(voutr01, vget_high_s16(tvoutr0)); - - tvoutr1 = vmull_s8(vinr40, wr30); - tvoutr1 = vmlal_s8(tvoutr1, vtmp1, wr31); - voutr10 = vaddw_s16(voutr10, vget_low_s16(tvoutr1)); - voutr11 = vaddw_s16(voutr11, vget_high_s16(tvoutr1)); - tvoutr1 = vmull_s8(vtmp2, wr32); - tvoutr1 = vmlal_s8(tvoutr1, vtmp3, 
wr33); - voutr10 = vaddw_s16(voutr10, vget_low_s16(tvoutr1)); - voutr11 = vaddw_s16(voutr11, vget_high_s16(tvoutr1)); - tvoutr1 = vmull_s8(vtmp4, wr34); - voutr10 = vaddw_s16(voutr10, vget_low_s16(tvoutr1)); - voutr11 = vaddw_s16(voutr11, vget_high_s16(tvoutr1)); - - tvoutr2 = vmull_s8(vinr40, wr20); - tvoutr2 = vmlal_s8(tvoutr2, vtmp1, wr21); - voutr20 = vaddw_s16(voutr20, vget_low_s16(tvoutr2)); - voutr21 = vaddw_s16(voutr21, vget_high_s16(tvoutr2)); - tvoutr2 = vmull_s8(vtmp2, wr22); - tvoutr2 = vmlal_s8(tvoutr2, vtmp3, wr23); - voutr20 = vaddw_s16(voutr20, vget_low_s16(tvoutr2)); - voutr21 = vaddw_s16(voutr21, vget_high_s16(tvoutr2)); - tvoutr2 = vmull_s8(vtmp4, wr24); - voutr20 = vaddw_s16(voutr20, vget_low_s16(tvoutr2)); - voutr21 = vaddw_s16(voutr21, vget_high_s16(tvoutr2)); - - // r5 - vtmp1 = vext_s8(vinr50, vinr51, 1); // 12345678 - vtmp2 = vext_s8(vinr50, vinr51, 2); // 2345678 - vtmp3 = vext_s8(vinr50, vinr51, 3); // 345678 - vtmp4 = vext_s8(vinr50, vinr51, 4); // 45678 - - tvoutr1 = vmull_s8(vinr50, wr40); - tvoutr1 = vmlal_s8(tvoutr1, vtmp1, wr41); - voutr10 = vaddw_s16(voutr10, vget_low_s16(tvoutr1)); - voutr11 = vaddw_s16(voutr11, vget_high_s16(tvoutr1)); - tvoutr1 = vmull_s8(vtmp2, wr42); - tvoutr1 = vmlal_s8(tvoutr1, vtmp3, wr43); - voutr10 = vaddw_s16(voutr10, vget_low_s16(tvoutr1)); - voutr11 = vaddw_s16(voutr11, vget_high_s16(tvoutr1)); - tvoutr1 = vmull_s8(vtmp4, wr44); - voutr10 = vaddw_s16(voutr10, vget_low_s16(tvoutr1)); - voutr11 = vaddw_s16(voutr11, vget_high_s16(tvoutr1)); - - tvoutr2 = vmull_s8(vinr50, wr30); - tvoutr2 = vmlal_s8(tvoutr2, vtmp1, wr31); - voutr20 = vaddw_s16(voutr20, vget_low_s16(tvoutr2)); - voutr21 = vaddw_s16(voutr21, vget_high_s16(tvoutr2)); - tvoutr2 = vmull_s8(vtmp2, wr32); - tvoutr2 = vmlal_s8(tvoutr2, vtmp3, wr33); - voutr20 = vaddw_s16(voutr20, vget_low_s16(tvoutr2)); - voutr21 = vaddw_s16(voutr21, vget_high_s16(tvoutr2)); - tvoutr2 = vmull_s8(vtmp4, wr34); - voutr20 = vaddw_s16(voutr20, vget_low_s16(tvoutr2)); - voutr21 = vaddw_s16(voutr21, vget_high_s16(tvoutr2)); - - // r6 - vtmp1 = vext_s8(vinr60, vinr61, 1); // 12345678 - vtmp2 = vext_s8(vinr60, vinr61, 2); // 2345678 - vtmp3 = vext_s8(vinr60, vinr61, 3); // 345678 - vtmp4 = vext_s8(vinr60, vinr61, 4); // 45678 - - tvoutr2 = vmull_s8(vinr60, wr40); - tvoutr2 = vmlal_s8(tvoutr2, vtmp1, wr41); - voutr20 = vaddw_s16(voutr20, vget_low_s16(tvoutr2)); - voutr21 = vaddw_s16(voutr21, vget_high_s16(tvoutr2)); - tvoutr2 = vmull_s8(vtmp2, wr42); - tvoutr2 = vmlal_s8(tvoutr2, vtmp3, wr43); - voutr20 = vaddw_s16(voutr20, vget_low_s16(tvoutr2)); - voutr21 = vaddw_s16(voutr21, vget_high_s16(tvoutr2)); - tvoutr2 = vmull_s8(vtmp4, wr44); - voutr20 = vaddw_s16(voutr20, vget_low_s16(tvoutr2)); - voutr21 = vaddw_s16(voutr21, vget_high_s16(tvoutr2)); - - /// data shift 8 bytes - din_ptr0 += 8; - din_ptr1 += 8; - din_ptr2 += 8; - din_ptr3 += 8; - din_ptr4 += 8; - din_ptr5 += 8; - din_ptr6 += 8; - - /// store - vst1q_s32(doutr0, voutr00); - vst1q_s32(doutr1, voutr10); - vst1q_s32(doutr2, voutr20); - doutr0 += 4; - doutr1 += 4; - doutr2 += 4; - vst1q_s32(doutr0, voutr01); - vst1q_s32(doutr1, voutr11); - vst1q_s32(doutr2, voutr21); - doutr0 += 4; - doutr1 += 4; - doutr2 += 4; - } /// end of tile_w - - dr0 = dr3; - dr1 = dr4; - dr2 = dr5; - dr3 = dr6; - dr4 = dr3 + win_round; - dr5 = dr4 + win_round; - dr6 = dr5 + win_round; - - dout_ptr = dout_ptr + 3 * wout_round; - } /// end of tile_h - - if (scales == 0) { - write_to_output_numc(pre_out, - dout_batch, - 1, - hout_round, - c, - c + 1, - 0, - hout, - 0, - 
wout_round, - chout, - hout, - wout, - flag_relu, - ptr_write); - } else if (od_type == PRECISION(kFloat)) { - write2_to_output_numc(pre_out, - reinterpret_cast<float*>(dout_batch), - 1, - hout_round, - c, - c + 1, - 0, - hout, - 0, - wout_round, - chout, - hout, - wout, - flag_relu, - reinterpret_cast<float*>(ptr_write), - scales); - } else if (od_type == PRECISION(kInt8)) { - write2_to_output_numc(pre_out, - reinterpret_cast<signed char*>(dout_batch), - 1, - hout_round, - c, - c + 1, - 0, - hout, - 0, - wout_round, - chout, - hout, - wout, - flag_relu, - reinterpret_cast<signed char*>(ptr_write), - scales); - } - // else if (od_type == AK_INT32) { - // write2_to_output_numc(pre_out, (int*)dout_batch, 1, hout_round, c, - // c+1, - // 0, hout, 0, wout_round, chout, hout, wout, flag_relu, - // (int*)ptr_write, scales); - // } - } /// end of chout - } /// end of batch num -} - -#endif // __aarch64__ - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_depthwise_5x5s2.cc b/lite/backends/arm/math/conv_depthwise_5x5s2.cc deleted file mode 100644 index dd715fd534..0000000000 --- a/lite/backends/arm/math/conv_depthwise_5x5s2.cc +++ /dev/null @@ -1,3746 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License.
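[Note] The int8 kernel that ends above repeats a single pattern for each of the five taps of a kernel row: load 16 input bytes, build shifted views with vext_s8, widen the s8 x s8 products to s16 with vmull_s8/vmlal_s8, then fold them into two s32 accumulators with vaddw_s16. A condensed sketch of one row's contribution to 8 outputs (hypothetical helper, not from this patch; assumes an AArch64 NEON target and quantized weights in [-127, 127]):

#include <arm_neon.h>

// One 5-tap row contribution for 8 outputs, in the style of the kernel
// above: acc_lo/acc_hi accumulate outputs 0..3 and 4..7 in int32 and are
// seeded with the bias by the caller.
static inline void mac_5tap_row(const int8_t* din,
                                const int8_t* w5,
                                int32x4_t* acc_lo,
                                int32x4_t* acc_hi) {
  int8x8_t v0 = vld1_s8(din);        // input columns 0..7
  int8x8_t v1 = vld1_s8(din + 8);    // input columns 8..15
  int8x8_t s1 = vext_s8(v0, v1, 1);  // columns 1..8
  int8x8_t s2 = vext_s8(v0, v1, 2);  // columns 2..9
  int8x8_t s3 = vext_s8(v0, v1, 3);  // columns 3..10
  int8x8_t s4 = vext_s8(v0, v1, 4);  // columns 4..11
  // Widen at most two s8*s8 products in the s16 temporary, then fold into
  // the s32 accumulators before continuing with the next tap pair.
  int16x8_t t = vmull_s8(v0, vdup_n_s8(w5[0]));
  t = vmlal_s8(t, s1, vdup_n_s8(w5[1]));
  *acc_lo = vaddw_s16(*acc_lo, vget_low_s16(t));
  *acc_hi = vaddw_s16(*acc_hi, vget_high_s16(t));
  t = vmull_s8(s2, vdup_n_s8(w5[2]));
  t = vmlal_s8(t, s3, vdup_n_s8(w5[3]));
  *acc_lo = vaddw_s16(*acc_lo, vget_low_s16(t));
  *acc_hi = vaddw_s16(*acc_hi, vget_high_s16(t));
  t = vmull_s8(s4, vdup_n_s8(w5[4]));
  *acc_lo = vaddw_s16(*acc_lo, vget_low_s16(t));
  *acc_hi = vaddw_s16(*acc_hi, vget_high_s16(t));
}

Folding at most two products into the s16 temporary before widening keeps the partial sum within int16 range (with weights bounded by 127, 2 * 127 * 128 = 32512), which is presumably why the unrolled kernel above also alternates vmull/vmlal with vaddw at every second tap.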
- -#include "lite/backends/arm/math/conv_depthwise.h" -#include <arm_neon.h> - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void conv_depthwise_5x5s2p2(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - ARMContext* ctx); - -void conv_depthwise_5x5s2p2_relu(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - ARMContext* ctx); - -void conv_depthwise_5x5s2p2_s(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - ARMContext* ctx); - -void conv_depthwise_5x5s2p2_relu_s(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - ARMContext* ctx); - -void conv_depthwise_5x5s2(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const float* weights, - const float* bias, - int pad, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - if (pad == 2) { - if (win >= 9) { - if (flag_relu) { - conv_depthwise_5x5s2p2_relu(din, - dout, - num, - chout, - hout, - wout, - chin, - hin, - win, - weights, - bias, - flag_bias, - flag_relu, - ctx); - } else { - conv_depthwise_5x5s2p2(din, - dout, - num, - chout, - hout, - wout, - chin, - hin, - win, - weights, - bias, - flag_bias, - flag_relu, - ctx); - } - } else { - if (flag_relu) { - conv_depthwise_5x5s2p2_relu_s(din, - dout, - num, - chout, - hout, - wout, - chin, - hin, - win, - weights, - bias, - flag_bias, - flag_relu, - ctx); - } else { - conv_depthwise_5x5s2p2_s(din, - dout, - num, - chout, - hout, - wout, - chin, - hin, - win, - weights, - bias, - flag_bias, - flag_relu, - ctx); - } - } - } -} - -#ifdef __aarch64__ - -//!
larger depthwise, win >= 9; -void conv_depthwise_5x5s2p2(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - CHECK_GE(w_in, 9) << "only support win >= 9"; - int w_out_round = (w_out + 3) / 4 * 4; - int cnt = (w_out_round - 4) / 4; - int mid_cnt = cnt - 1; - int right_start = cnt * 2 * 4 - 2; - int mask_cnt = 12 - (w_in - right_start); - int mask[12]; - memset(mask, 0xff, 12 * sizeof(int)); - for (int i = 0; i < mask_cnt; ++i) { - mask[11 - i] = 0; - } - float* zero_ptr = ctx->workspace_data<float>(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - int in_spatial_size = w_in * h_in; - int out_spatial_size = w_out * h_out; - int weights_spatial_size = 25; - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * in_spatial_size * ch_in; - float* dout_batch = dout + n * out_spatial_size * ch_out; -#pragma omp parallel for - for (int c = 0; c < ch_in; ++c) { - const float* din_ch = din_batch + c * in_spatial_size; - float* dout_ch = dout_batch + c * out_spatial_size; - const float* din0 = zero_ptr; - const float* din1 = zero_ptr; - const float* din2 = din_ch; - const float* din3 = din2 + w_in; - const float* din4 = din3 + w_in; - const float* din5 = din4 + w_in; - const float* din6 = din5 + w_in; - - float out_buf0[4]; - float out_buf1[4]; - float* dout0 = dout_ch; - float* dout1 = dout0 + w_out; - - const float* weights_c = weights + c * weights_spatial_size; - for (int h = 0; h < h_out; h += 2) { - //! (h * 2 - 2) + 6 > h_in - 1 - if (h * 2 + 5 > h_in) { - switch (h * 2 + 5 - h_in) { - case 6: - din1 = zero_ptr; - case 5: - din2 = zero_ptr; - case 4: - din3 = zero_ptr; - case 3: - din4 = zero_ptr; - case 2: - din5 = zero_ptr; - case 1: - din6 = zero_ptr; - default: - break; - } - } - if (h + 2 > h_out) { - switch (h + 2 - h_out) { - case 1: - dout1 = write_ptr; - default: - break; - } - } - const float* din_ptr0 = din0; - const float* din_ptr1 = din1; - const float* din_ptr2 = din2; - const float* din_ptr3 = din3; - const float* din_ptr4 = din4; - const float* din_ptr5 = din5; - const float* din_ptr6 = din6; - - const float* weights_ptr = weights_c; - float* dout_ptr0 = dout0; - float* dout_ptr1 = dout1; - - float bias_c = 0.f; - if (flag_bias) { - bias_c = bias[c]; - } - float vbias[4] = {bias_c, bias_c, bias_c, bias_c}; - int* mask_ptr = mask; - int loop = mid_cnt; - const int s_8 = 8; - const int s_16 = 16; - - //! in r0, r1/r4, r2/r5, r3/r6: x 0 2 4 -- v8 v13 v18 v23 - //! in r0, r1/r4, r2/r5, r3/r6: x 1 3 5 -- v9 v14 v19 v24 - //! in r0, r1/r4, r2/r5, r3/r6: 0 2 4 6 -- v6 v11 v16 v21 - //! in r0, r1/r4, r2/r5, r3/r6: 1 3 5 7 -- v7 v12 v17 v22 - //! in r0, r1/r4, r2/r5, r3/r6: 2 4 6 8 -- v10 v15 v20 v25 - //!
out r0, r1 -- v26, v27 - asm volatile( - "movi v31.4s, #0x0\n" - "prfm pldl1keep, [%[din_ptr0]] \n" - "prfm pldl1keep, [%[din_ptr1]] \n" - "prfm pldl1keep, [%[din_ptr2]] \n" - "prfm pldl1keep, [%[din_ptr3]] \n" - "prfm pldl1keep, [%[din_ptr4]] \n" - "prfm pldl1keep, [%[din_ptr5]] \n" - "prfm pldl1keep, [%[din_ptr6]] \n" - "prfm pldl1keep, [%[weights]] \n" - "prfm pldl1keep, [%[mask]] \n" - // left - "ld2 {v6.4s, v7.4s}, [%[din_ptr0]], #32 \n" // r0 v6: 0 - // 2 4 6, - // v7: 1 3 - // 5 7 - "ext v8.16b, v31.16b, v6.16b, #12 \n" // r0 v8: x - // 0 2 4 - "ld2 {v11.4s, v12.4s}, [%[din_ptr1]], #32 \n" // r1 v11: - // 0 2 4 6, - // v12: 1 3 - // 5 7 - "ext v9.16b, v31.16b, v7.16b, #12 \n" // r0 v9: x - // 1 3 5 - "ld1 {v0.4s, v1.4s}, [%[weights]], #32 \n" // load - // weights - // 0-7 - "ext v10.16b, v6.16b, v31.16b, #4 \n" - "ld1 {v10.s}[3], [%[din_ptr0]] \n" // r0 v10: - // 2 4 6 8 - "sub %[din_ptr0], %[din_ptr0], #8 \n" - "ext v13.16b, v31.16b, v11.16b, #12 \n" // r1 v13: - // x 0 2 4 - "ld2 {v16.4s, v17.4s}, [%[din_ptr2]], #32 \n" // r2 v16: - // 0 2 4 6, - // v17: 1 3 - // 5 7 - "ext v14.16b, v31.16b, v12.16b, #12 \n" // r1 v14: - // x 1 3 5 - "ld1 {v2.4s, v3.4s}, [%[weights]], #32 \n" // load - // weights - // 8-15 - "ext v15.16b, v11.16b, v31.16b, #4 \n" - "ld1 {v15.s}[3], [%[din_ptr1]] \n" // r1 v15: - // 2 4 6 - "sub %[din_ptr1], %[din_ptr1], #8 \n" - "ext v18.16b, v31.16b, v16.16b, #12 \n" // r2 v18: - // x 0 2 4 - "ld1 {v4.4s, v5.4s}, [%[weights]], #32 \n" // load - // weights - // 16-23 - "ext v19.16b, v31.16b, v17.16b, #12 \n" // r2 v19: - // x 1 3 5 - "ld2 {v21.4s, v22.4s}, [%[din_ptr3]], #32 \n" // r3 v21: - // 0 2 4 6, - // v22: 1 3 - // 5 7 - "ext v20.16b, v16.16b, v31.16b, #4 \n" - "ld1 {v20.s}[3], [%[din_ptr2]] \n" // r2 v20: - // 2 4 6 8 - "sub %[din_ptr2], %[din_ptr2], #8 \n" - "ext v23.16b, v31.16b, v21.16b, #12 \n" // r3 v23: - // x 0 2 4 - "ld1 {v30.4s}, [%[weights]] \n" // load - // weights - // 24 - "ext v24.16b, v31.16b, v22.16b, #12 \n" // r3 v24: - // x 1 3 5 - "ld1 {v26.4s}, [%[vbias]] \n" // load - // bias to - // out_r0 - "ext v25.16b, v21.16b, v31.16b, #4 \n" - "ld1 {v25.s}[3], [%[din_ptr3]] \n" // r2 v25: - // 2 4 6 8 - "sub %[din_ptr3], %[din_ptr3], #8 \n" - "mov v27.16b, v26.16b \n" // load - // bias to - // out_r1 - "mov v28.16b, v31.16b \n" // load - // zero to - // out_r0 - "mov v29.16b, v31.16b \n" // load - // zero to - // out_r1 - - "fmla v26.4s, v8.4s, v0.s[0] \n" // out r0: - // w0 - "fmla v28.4s, v9.4s, v0.s[1] \n" // out r0: - // w1 - "fmla v26.4s, v6.4s, v0.s[2] \n" // out r0: - // w2 - "fmla v28.4s, v7.4s, v0.s[3] \n" // out r0: - // w3 - - "ld2 {v8.4s, v9.4s}, [%[din_ptr0]], %[s_8] \n" // next r0 - // v8: 0 2 - // 4 6, v9: - // 1 3 5 7 - - "fmla v26.4s, v10.4s, v1.s[0] \n" // out r0: - // w4 - "fmla v28.4s, v13.4s, v1.s[1] \n" // out r0: - // w5 - "fmla v26.4s, v14.4s, v1.s[2] \n" // out r0: - // w6 - "fmla v28.4s, v11.4s, v1.s[3] \n" // out r0: - // w7 - - "ld2 {v6.4s, v7.4s}, [%[din_ptr0]], %[s_8] \n" // next r0 - // v6: 2 4 - // 6 8, v7: - // 3 5 7 9 - - "fmla v26.4s, v12.4s, v2.s[0] \n" // out r0: - // w8 - "fmla v28.4s, v15.4s, v2.s[1] \n" // out r0: - // w9 - "fmla v26.4s, v18.4s, v2.s[2] \n" // out r0: - // w10 - "fmla v28.4s, v19.4s, v2.s[3] \n" // out r0: - // w11 - - "ld2 {v10.4s, v11.4s}, [%[din_ptr0]], %[s_16] \n" // next r0 - // v10: 4 6 - // 8 10, - // v11: - // trash - // register - - "fmla v26.4s, v16.4s, v3.s[0] \n" // out r0: - // w12 - "fmla v28.4s, v17.4s, v3.s[1] \n" // out r0: - // w13 - "fmla v26.4s, v20.4s, v3.s[2] \n" // 
out r0: - // w14 - "fmla v28.4s, v23.4s, v3.s[3] \n" // out r0: - // w15 - "prfm pldl1keep, [%[din_ptr0]] \n" - - "ld2 {v11.4s, v12.4s}, [%[din_ptr4]], #32 \n" // r4 v11: - // 0 2 4 6, - // v12: 1 3 - // 5 7 - - "fmla v26.4s, v24.4s, v4.s[0] \n" // out r0: - // w16 - "fmla v28.4s, v21.4s, v4.s[1] \n" // out r0: - // w17 - - "ext v13.16b, v31.16b, v11.16b, #12 \n" // r4 v13: - // x 0 2 4 - "ext v14.16b, v31.16b, v12.16b, #12 \n" // r4 v14: - // x 1 3 5 - "ext v15.16b, v11.16b, v31.16b, #4 \n" - - "fmla v26.4s, v22.4s, v4.s[2] \n" // out r0: - // w18 - "fmla v28.4s, v25.4s, v4.s[3] \n" // out r0: - // w19 - - "ld1 {v15.s}[3], [%[din_ptr4]] \n" // r4 v15: - // 2 4 6 - - "fmla v27.4s, v18.4s, v0.s[0] \n" // out r1: - // w0 - "fmla v29.4s, v19.4s, v0.s[1] \n" // out r1: - // w1 - - "sub %[din_ptr4], %[din_ptr4], #8 \n" - - "fmla v27.4s, v16.4s, v0.s[2] \n" // out r1: - // w2 - "fmla v29.4s, v17.4s, v0.s[3] \n" // out r1: - // w3 - "fmla v27.4s, v20.4s, v1.s[0] \n" // out r1: - // w4 - "fmla v29.4s, v23.4s, v1.s[1] \n" // out r1: - // w5 - - "ld2 {v16.4s, v17.4s}, [%[din_ptr5]], #32 \n" // r5 v16: - // 0 2 4 6, - // v17: 1 3 - // 5 7 - - "fmla v27.4s, v24.4s, v1.s[2] \n" // out r1: - // w6 - "fmla v29.4s, v21.4s, v1.s[3] \n" // out r1: - // w7 - - "ext v18.16b, v31.16b, v16.16b, #12 \n" // r5 v18: - // x 0 2 4 - "ext v19.16b, v31.16b, v17.16b, #12 \n" // r5 v19: - // x 1 3 5 - "ext v20.16b, v16.16b, v31.16b, #4 \n" - - "fmla v27.4s, v22.4s, v2.s[0] \n" // out r1: - // w8 - "fmla v29.4s, v25.4s, v2.s[1] \n" // out r1: - // w9 - - "ld1 {v20.s}[3], [%[din_ptr5]] \n" // r5 v20: - // 2 4 6 - "ld2 {v21.4s, v22.4s}, [%[din_ptr6]], #32 \n" // r6 v21: - // 0 2 4 6, - // v22: 1 3 - // 5 7 - - "ext v23.16b, v31.16b, v21.16b, #12 \n" // r6 v23: - // x 0 2 4 - "ext v24.16b, v31.16b, v22.16b, #12 \n" // r6 v24: - // x 1 3 5 - "ext v25.16b, v21.16b, v31.16b, #4 \n" - "sub %[din_ptr5], %[din_ptr5], #8 \n" - - "fmla v26.4s, v11.4s, v5.s[2] \n" // out r0: - // w22 - "fmla v28.4s, v12.4s, v5.s[3] \n" // out r0: - // w23 - - "ld1 {v25.s}[3], [%[din_ptr6]] \n" // r6 v25: - // 2 4 6 - - "fmla v26.4s, v13.4s, v5.s[0] \n" // out r0: - // w20 - "fmla v28.4s, v14.4s, v5.s[1] \n" // out r0: - // w21 - - "sub %[din_ptr6], %[din_ptr6], #8 \n" - - "fmla v26.4s, v15.4s, v30.s[0] \n" // out r0: - // w24 - "fmla v27.4s, v13.4s, v2.s[2] \n" // out r1: - // w10 - - "fadd v26.4s, v26.4s, v28.4s \n" - "fmla v29.4s, v14.4s, v2.s[3] \n" // out r1: - // w11 - - "ld2 {v13.4s, v14.4s}, [%[din_ptr1]], %[s_8] \n" // next r1 - // v13: 0 2 - // 4 6, - // v14: 1 3 - // 5 7 - "fmla v27.4s, v11.4s, v3.s[0] \n" // out r1: - // w12 - "fmla v29.4s, v12.4s, v3.s[1] \n" // out r1: - // w13 - - "st1 {v26.4s}, [%[dout_ptr0]], %[s_16] \n" // store - // output - // r0 - "ld2 {v11.4s, v12.4s}, [%[din_ptr1]], %[s_8] \n" // next r1 - // v11: 2 4 - // 6 8, - // v12: 3 5 - // 7 9 - - "fmla v27.4s, v15.4s, v3.s[2] \n" // out r1: - // w14 - "fmla v29.4s, v16.4s, v4.s[1] \n" // out r1: - // w17 - "fmla v27.4s, v18.4s, v3.s[3] \n" // out r1: - // w15 - "fmla v29.4s, v19.4s, v4.s[0] \n" // out r1: - // w16 - - "ld2 {v15.4s, v16.4s}, [%[din_ptr1]], %[s_16] \n" // next r1 - // v15: 4 6 - // 8 10, - // v16: - // trash - // register - - "fmla v27.4s, v17.4s, v4.s[2] \n" // out r1: - // w18 - "fmla v29.4s, v20.4s, v4.s[3] \n" // out r1: - // w19 - - "ld2 {v18.4s, v19.4s}, [%[din_ptr2]], %[s_8] \n" // next r2 - // v18: 0 2 - // 4 6, - // v19: 1 3 - // 5 7 - "ld2 {v16.4s, v17.4s}, [%[din_ptr2]], %[s_8] \n" // next r2 - // v16: 2 4 - // 6 8, - // v11: 3 5 - // 7 9 - - 
"fmla v27.4s, v23.4s, v5.s[0] \n" // out r1: - // w20 - "fmla v29.4s, v21.4s, v5.s[2] \n" // out r1: - // w22 - "fmla v27.4s, v24.4s, v5.s[1] \n" // out r1: - // w21 - "fmla v29.4s, v22.4s, v5.s[3] \n" // out r1: - // w23 - - "ld2 {v20.4s, v21.4s}, [%[din_ptr2]], %[s_16] \n" // next r2 - // v20: 4 6 - // 8 10, - // v21: - // trash - // register - "ld2 {v23.4s, v24.4s}, [%[din_ptr3]], %[s_8] \n" // next r3 - // v23: 0 2 - // 4 6, - // v24: 1 3 - // 5 7 - - "fmla v27.4s, v25.4s, v30.s[0] \n" // out r1: - // w24 - - "ld2 {v21.4s, v22.4s}, [%[din_ptr3]], %[s_8] \n" // next r3 - // v21: 2 4 - // 6 8, - // v22: 3 5 - // 7 9 - "ld2 {v25.4s, v26.4s}, [%[din_ptr3]], %[s_16] \n" // next r3 - // v25: 4 6 - // 8 10, - // v26: - // trash - // register - - "fadd v27.4s, v27.4s, v29.4s \n" - "cmp %w[mid_cnt], #1 \n" - - "prfm pldl1keep, [%[din_ptr1]] \n" - "prfm pldl1keep, [%[din_ptr2]] \n" - "prfm pldl1keep, [%[din_ptr3]] \n" - - "st1 {v27.4s}, [%[dout_ptr1]], #16 \n" - "blt 2f \n" - - // mid loop - "1: \n" - "ld1 {v26.4s}, [%[vbias]] \n" - "mov v27.16b, v26.16b \n" - "mov v28.16b, v31.16b \n" - "mov v29.16b, v31.16b \n" - - // out_r0 r0-r3 - "fmla v26.4s, v8.4s, v0.s[0] \n" - "fmla v28.4s, v9.4s, v0.s[1] \n" - "fmla v26.4s, v6.4s, v0.s[2] \n" - "fmla v28.4s, v7.4s, v0.s[3] \n" - - "ld2 {v8.4s, v9.4s}, [%[din_ptr0]], %[s_8] \n" - - "fmla v26.4s, v10.4s, v1.s[0] \n" - "fmla v28.4s, v11.4s, v1.s[3] \n" - - "ld2 {v6.4s, v7.4s}, [%[din_ptr0]], %[s_8] \n" - - "fmla v26.4s, v14.4s, v1.s[2] \n" - "fmla v28.4s, v13.4s, v1.s[1] \n" - - "ld2 {v10.4s, v11.4s}, [%[din_ptr0]], %[s_16] \n" - "prfm pldl1keep, [%[din_ptr0]] \n" - - "fmla v26.4s, v12.4s, v2.s[0] \n" - "fmla v28.4s, v15.4s, v2.s[1] \n" - - "ld2 {v13.4s, v14.4s}, [%[din_ptr4]], %[s_8] \n" - - "fmla v26.4s, v16.4s, v3.s[0] \n" - "fmla v27.4s, v16.4s, v0.s[2] \n" - - "ld2 {v11.4s, v12.4s}, [%[din_ptr4]], %[s_8] \n" - - "fmla v28.4s, v19.4s, v2.s[3] \n" - "fmla v29.4s, v19.4s, v0.s[1] \n" - - "ld2 {v15.4s, v16.4s}, [%[din_ptr4]], %[s_16] \n" - "prfm pldl1keep, [%[din_ptr4]] \n" - - "fmla v26.4s, v18.4s, v2.s[2] \n" - "fmla v27.4s, v18.4s, v0.s[0] \n" - - "fmla v28.4s, v17.4s, v3.s[1] \n" - "fmla v29.4s, v17.4s, v0.s[3] \n" - - "ld2 {v18.4s, v19.4s}, [%[din_ptr5]], %[s_8] \n" - - "fmla v26.4s, v20.4s, v3.s[2] \n" - "fmla v27.4s, v20.4s, v1.s[0] \n" - - "ld2 {v16.4s, v17.4s}, [%[din_ptr5]], %[s_8] \n" - - "fmla v29.4s, v21.4s, v1.s[3] \n" - "fmla v28.4s, v21.4s, v4.s[1] \n" - "fmla v28.4s, v23.4s, v3.s[3] \n" - "fmla v29.4s, v23.4s, v1.s[1] \n" - - "ld2 {v20.4s, v21.4s}, [%[din_ptr5]], %[s_16] \n" - "prfm pldl1keep, [%[din_ptr5]] \n" - - "fmla v26.4s, v24.4s, v4.s[0] \n" - "fmla v27.4s, v24.4s, v1.s[2] \n" - - "ld2 {v23.4s, v24.4s}, [%[din_ptr6]], %[s_8] \n" - - "fmla v27.4s, v22.4s, v2.s[0] \n" - "fmla v26.4s, v22.4s, v4.s[2] \n" - - "fmla v28.4s, v25.4s, v4.s[3] \n" - "fmla v29.4s, v25.4s, v2.s[1] \n" - - "ld2 {v21.4s, v22.4s}, [%[din_ptr6]], %[s_8] \n" - "fadd v28.4s, v26.4s, v28.4s \n" - - "ld2 {v25.4s, v26.4s}, [%[din_ptr6]], %[s_16] \n" - "mov v26.16b, v31.16b \n" - "prfm pldl1keep, [%[din_ptr6]] \n" - - "fmla v26.4s, v13.4s, v5.s[0] \n" - "fmla v28.4s, v14.4s, v5.s[1] \n" - "fmla v27.4s, v13.4s, v2.s[2] \n" - "fmla v29.4s, v14.4s, v2.s[3] \n" - - "ld2 {v13.4s, v14.4s}, [%[din_ptr1]], %[s_8] \n" - - "fmla v26.4s, v11.4s, v5.s[2] \n" - "fmla v28.4s, v12.4s, v5.s[3] \n" - "fmla v27.4s, v11.4s, v3.s[0] \n" - "fmla v29.4s, v12.4s, v3.s[1] \n" - - "ld2 {v11.4s, v12.4s}, [%[din_ptr1]], %[s_8] \n" - - "fmla v26.4s, v15.4s, v30.s[0] \n" - "fmla v27.4s, v15.4s, 
v3.s[2] \n" - "fmla v29.4s, v16.4s, v4.s[1] \n" - "fmla v27.4s, v17.4s, v4.s[2] \n" - - "ld2 {v15.4s, v16.4s}, [%[din_ptr1]], %[s_16] \n" - "prfm pldl1keep, [%[din_ptr1]] \n" - - "fmla v29.4s, v18.4s, v3.s[3] \n" - "fmla v27.4s, v19.4s, v4.s[0] \n" - - "ld2 {v18.4s, v19.4s}, [%[din_ptr2]], %[s_8] \n" - - "fmla v29.4s, v20.4s, v4.s[3] \n" - - "ld2 {v16.4s, v17.4s}, [%[din_ptr2]], %[s_8] \n" - - "fmla v27.4s, v23.4s, v5.s[0] \n" - "fmla v27.4s, v21.4s, v5.s[2] \n" - - "ld2 {v20.4s, v21.4s}, [%[din_ptr2]], %[s_16] \n" - - "fmla v29.4s, v24.4s, v5.s[1] \n" - - "ld2 {v23.4s, v24.4s}, [%[din_ptr3]], %[s_8] \n" - "prfm pldl1keep, [%[din_ptr2]] \n" - - "fmla v29.4s, v22.4s, v5.s[3] \n" - - "ld2 {v21.4s, v22.4s}, [%[din_ptr3]], %[s_8] \n" - - "fmla v27.4s, v25.4s, v30.s[0] \n" - - "fadd v26.4s, v26.4s, v28.4s \n" - - "prfm pldl1keep, [%[din_ptr3]] \n" - - "fadd v27.4s, v27.4s, v29.4s \n" - - "st1 {v26.4s}, [%[dout_ptr0]], #16 \n" - "st1 {v27.4s}, [%[dout_ptr1]], #16 \n" - - "ld2 {v25.4s, v26.4s}, [%[din_ptr3]], %[s_16] \n" - "subs %w[mid_cnt], %w[mid_cnt], #1 \n" - "bne 1b \n" - - "2: \n" - "ld2 {v26.4s, v27.4s}, [%[mask]], %[s_8] \n" - "ld2 {v28.4s, v29.4s}, [%[mask]], %[s_8] \n" - "bif v8.16b, v31.16b, v26.16b \n" - "bif v9.16b, v31.16b, v27.16b \n" - "bif v6.16b, v31.16b, v28.16b \n" - "bif v7.16b, v31.16b, v29.16b \n" - - "bif v13.16b, v31.16b, v26.16b \n" - "bif v14.16b, v31.16b, v27.16b \n" - "bif v11.16b, v31.16b, v28.16b \n" - "bif v12.16b, v31.16b, v29.16b \n" - - "bif v18.16b, v31.16b, v26.16b \n" - "bif v19.16b, v31.16b, v27.16b \n" - "bif v16.16b, v31.16b, v28.16b \n" - "bif v17.16b, v31.16b, v29.16b \n" - - "bif v23.16b, v31.16b, v26.16b \n" - "bif v24.16b, v31.16b, v27.16b \n" - "bif v21.16b, v31.16b, v28.16b \n" - "bif v22.16b, v31.16b, v29.16b \n" - - "ld2 {v28.4s, v29.4s}, [%[mask]] \n" - "ld1 {v26.4s}, [%[vbias]] \n" - "mov v29.16b, v31.16b \n" - - "bif v10.16b, v31.16b, v28.16b \n" - "bif v15.16b, v31.16b, v28.16b \n" - - "mov v27.16b, v26.16b \n" - - "bif v20.16b, v31.16b, v28.16b \n" - "bif v25.16b, v31.16b, v28.16b \n" - "mov v28.16b, v31.16b \n" - - "fmla v26.4s, v8.4s, v0.s[0] \n" - "fmla v28.4s, v9.4s, v0.s[1] \n" - "fmla v26.4s, v6.4s, v0.s[2] \n" - "fmla v28.4s, v7.4s, v0.s[3] \n" - - "fmla v26.4s, v10.4s, v1.s[0] \n" - "fmla v28.4s, v13.4s, v1.s[1] \n" - "fmla v26.4s, v14.4s, v1.s[2] \n" - "fmla v28.4s, v11.4s, v1.s[3] \n" - - "sub %[mask], %[mask], #16 \n" - "ld2 {v6.4s, v7.4s}, [%[mask]], %[s_8] \n" - "ld2 {v8.4s, v9.4s}, [%[mask]], %[s_8] \n" - "ld2 {v10.4s, v11.4s}, [%[mask]] \n" - - "fmla v26.4s, v12.4s, v2.s[0] \n" - "fmla v28.4s, v15.4s, v2.s[1] \n" - - "ld2 {v13.4s, v14.4s}, [%[din_ptr4]], %[s_8] \n" - - "fmla v26.4s, v16.4s, v3.s[0] \n" - "fmla v28.4s, v17.4s, v3.s[1] \n" - - "ld2 {v11.4s, v12.4s}, [%[din_ptr4]], %[s_8] \n" - - "fmla v27.4s, v16.4s, v0.s[2] \n" - "fmla v29.4s, v17.4s, v0.s[3] \n" - - "ld2 {v15.4s, v16.4s}, [%[din_ptr4]] \n" - - "fmla v26.4s, v18.4s, v2.s[2] \n" - "fmla v28.4s, v19.4s, v2.s[3] \n" - "fmla v27.4s, v18.4s, v0.s[0] \n" - "fmla v29.4s, v19.4s, v0.s[1] \n" - - "bif v13.16b, v31.16b, v6.16b \n" - "bif v14.16b, v31.16b, v7.16b \n" - "bif v11.16b, v31.16b, v8.16b \n" - "bif v12.16b, v31.16b, v9.16b \n" - "bif v15.16b, v31.16b, v10.16b \n" - - "ld2 {v18.4s, v19.4s}, [%[din_ptr5]], %[s_8] \n" - - "fmla v26.4s, v20.4s, v3.s[2] \n" - "fmla v27.4s, v20.4s, v1.s[0] \n" - - "ld2 {v16.4s, v17.4s}, [%[din_ptr5]], %[s_8] \n" - - "fmla v29.4s, v21.4s, v1.s[3] \n" - "fmla v28.4s, v21.4s, v4.s[1] \n" - - "ld2 {v20.4s, v21.4s}, [%[din_ptr5]] \n" - - 
"fmla v28.4s, v23.4s, v3.s[3] \n" - "fmla v29.4s, v23.4s, v1.s[1] \n" - "fmla v27.4s, v24.4s, v1.s[2] \n" - "fmla v26.4s, v24.4s, v4.s[0] \n" - - "bif v18.16b, v31.16b, v6.16b \n" - "bif v19.16b, v31.16b, v7.16b \n" - "bif v16.16b, v31.16b, v8.16b \n" - "bif v17.16b, v31.16b, v9.16b \n" - "bif v20.16b, v31.16b, v10.16b \n" - - "ld2 {v23.4s, v24.4s}, [%[din_ptr6]], %[s_8] \n" - - "fmla v27.4s, v22.4s, v2.s[0] \n" - "fmla v26.4s, v22.4s, v4.s[2] \n" - - "ld2 {v21.4s, v22.4s}, [%[din_ptr6]], %[s_8] \n" - - "fmla v28.4s, v25.4s, v4.s[3] \n" - "fmla v29.4s, v25.4s, v2.s[1] \n" - "fadd v28.4s, v28.4s, v26.4s \n" - - "ld2 {v25.4s, v26.4s}, [%[din_ptr6]] \n" - "mov v26.16b, v31.16b \n" - - "bif v23.16b, v31.16b, v6.16b \n" - "bif v24.16b, v31.16b, v7.16b \n" - "bif v21.16b, v31.16b, v8.16b \n" - "bif v22.16b, v31.16b, v9.16b \n" - "bif v25.16b, v31.16b, v10.16b \n" - - "fmla v26.4s, v13.4s, v5.s[0] \n" - "fmla v28.4s, v14.4s, v5.s[1] \n" - "fmla v26.4s, v11.4s, v5.s[2] \n" - "fmla v28.4s, v12.4s, v5.s[3] \n" - "fmla v26.4s, v15.4s, v30.s[0] \n" - - "fmla v27.4s, v13.4s, v2.s[2] \n" - "fmla v29.4s, v14.4s, v2.s[3] \n" - "fmla v27.4s, v11.4s, v3.s[0] \n" - "fmla v29.4s, v12.4s, v3.s[1] \n" - - "fadd v26.4s, v26.4s, v28.4s \n" - "fmla v27.4s, v15.4s, v3.s[2] \n" - "fmla v29.4s, v18.4s, v3.s[3] \n" - "fmla v27.4s, v19.4s, v4.s[0] \n" - "fmla v29.4s, v16.4s, v4.s[1] \n" - - "st1 {v26.4s}, [%[out_buf0]] \n" - "fmla v27.4s, v17.4s, v4.s[2] \n" - "fmla v29.4s, v20.4s, v4.s[3] \n" - "fmla v27.4s, v23.4s, v5.s[0] \n" - "fmla v29.4s, v24.4s, v5.s[1] \n" - - "fmla v27.4s, v21.4s, v5.s[2] \n" - "fmla v29.4s, v22.4s, v5.s[3] \n" - "fmla v27.4s, v25.4s, v30.s[0] \n" - "fadd v27.4s, v27.4s, v29.4s \n" - - "st1 {v27.4s}, [%[out_buf1]] \n" - - : [dout_ptr0] "+r"(dout_ptr0), - [dout_ptr1] "+r"(dout_ptr1), - [mid_cnt] "+r"(loop), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [din_ptr6] "+r"(din_ptr6), - [mask] "+r"(mask_ptr), - [weights] "+r"(weights_ptr) - : [vbias] "r"(vbias), - [out_buf0] "r"(out_buf0), - [out_buf1] "r"(out_buf1), - [s_8] "r"(s_8), - [s_16] "r"(s_16) - : "memory", - "cc", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "v26", - "v27", - "v28", - "v29", - "v30", - "v31"); - - int remain_cnt = w_out - (mid_cnt + 1) * 4; - for (int i = 0; i < remain_cnt; ++i) { - dout_ptr0[i] = out_buf0[i]; - dout_ptr1[i] = out_buf1[i]; - } - din0 = din4; - din1 = din5; - din2 = din6; - din3 = din6 + w_in; - din4 = din3 + w_in; - din5 = din4 + w_in; - din6 = din5 + w_in; - dout0 = dout1 + w_out; - dout1 = dout0 + w_out; - } - } - } -} - -//! 
larger depthwise, win >= 9; -void conv_depthwise_5x5s2p2_relu(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - CHECK_GE(w_in, 9) << "only support win >= 9"; - int w_out_round = (w_out + 3) / 4 * 4; - int cnt = (w_out_round - 4) / 4; - int mid_cnt = cnt - 1; - int right_start = cnt * 2 * 4 - 2; - int mask_cnt = 12 - (w_in - right_start); - int mask[12]; - memset(mask, 0xff, 12 * sizeof(int)); - for (int i = 0; i < mask_cnt; ++i) { - mask[11 - i] = 0; - } - float* zero_ptr = ctx->workspace_data<float>(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - int in_spatial_size = w_in * h_in; - int out_spatial_size = w_out * h_out; - int weights_spatial_size = 25; - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * in_spatial_size * ch_in; - float* dout_batch = dout + n * out_spatial_size * ch_out; - -#pragma omp parallel for - for (int c = 0; c < ch_in; ++c) { - const float* din_ch = din_batch + c * in_spatial_size; - float* dout_ch = dout_batch + c * out_spatial_size; - const float* din0 = zero_ptr; - const float* din1 = zero_ptr; - const float* din2 = din_ch; - const float* din3 = din2 + w_in; - const float* din4 = din3 + w_in; - const float* din5 = din4 + w_in; - const float* din6 = din5 + w_in; - - float out_buf0[4]; - float out_buf1[4]; - float* dout0 = dout_ch; - float* dout1 = dout0 + w_out; - - const float* weights_c = weights + c * weights_spatial_size; - for (int h = 0; h < h_out; h += 2) { - //! (h * 2 - 2) + 6 > h_in - 1 - if (h * 2 + 5 > h_in) { - switch (h * 2 + 5 - h_in) { - case 6: - din1 = zero_ptr; - case 5: - din2 = zero_ptr; - case 4: - din3 = zero_ptr; - case 3: - din4 = zero_ptr; - case 2: - din5 = zero_ptr; - case 1: - din6 = zero_ptr; - default: - break; - } - } - if (h + 2 > h_out) { - switch (h + 2 - h_out) { - case 1: - dout1 = write_ptr; - default: - break; - } - } - const float* din_ptr0 = din0; - const float* din_ptr1 = din1; - const float* din_ptr2 = din2; - const float* din_ptr3 = din3; - const float* din_ptr4 = din4; - const float* din_ptr5 = din5; - const float* din_ptr6 = din6; - - const float* weights_ptr = weights_c; - float* dout_ptr0 = dout0; - float* dout_ptr1 = dout1; - - float bias_c = 0.f; - if (flag_bias) { - bias_c = bias[c]; - } - float vbias[4] = {bias_c, bias_c, bias_c, bias_c}; - int* mask_ptr = mask; - int loop = mid_cnt; - const int s_8 = 8; - const int s_16 = 16; - - //! in r0, r1/r4, r2/r5, r3/r6: x 0 2 4 -- v8 v13 v18 v23 - //! in r0, r1/r4, r2/r5, r3/r6: x 1 3 5 -- v9 v14 v19 v24 - //! in r0, r1/r4, r2/r5, r3/r6: 0 2 4 6 -- v6 v11 v16 v21 - //! in r0, r1/r4, r2/r5, r3/r6: 1 3 5 7 -- v7 v12 v17 v22 - //! in r0, r1/r4, r2/r5, r3/r6: 2 4 6 8 -- v10 v15 v20 v25 - //!
out r0, r1 -- v26, v27 - asm volatile( - "movi v31.4s, #0x0\n" - "prfm pldl1keep, [%[din_ptr0]] \n" - "prfm pldl1keep, [%[din_ptr1]] \n" - "prfm pldl1keep, [%[din_ptr2]] \n" - "prfm pldl1keep, [%[din_ptr3]] \n" - "prfm pldl1keep, [%[din_ptr4]] \n" - "prfm pldl1keep, [%[din_ptr5]] \n" - "prfm pldl1keep, [%[din_ptr6]] \n" - "prfm pldl1keep, [%[weights]] \n" - "prfm pldl1keep, [%[mask]] \n" - // left - "ld2 {v6.4s, v7.4s}, [%[din_ptr0]], #32 \n" // r0 v6: 0 - // 2 4 6, - // v7: 1 3 - // 5 7 - "ext v8.16b, v31.16b, v6.16b, #12 \n" // r0 v8: x - // 0 2 4 - "ld2 {v11.4s, v12.4s}, [%[din_ptr1]], #32 \n" // r1 v11: - // 0 2 4 6, - // v12: 1 3 - // 5 7 - "ext v9.16b, v31.16b, v7.16b, #12 \n" // r0 v9: x - // 1 3 5 - "ld1 {v0.4s, v1.4s}, [%[weights]], #32 \n" // load - // weights - // 0-7 - "ext v10.16b, v6.16b, v31.16b, #4 \n" - "ld1 {v10.s}[3], [%[din_ptr0]] \n" // r0 v10: - // 2 4 6 8 - "sub %[din_ptr0], %[din_ptr0], #8 \n" - "ext v13.16b, v31.16b, v11.16b, #12 \n" // r1 v13: - // x 0 2 4 - "ld2 {v16.4s, v17.4s}, [%[din_ptr2]], #32 \n" // r2 v16: - // 0 2 4 6, - // v17: 1 3 - // 5 7 - "ext v14.16b, v31.16b, v12.16b, #12 \n" // r1 v14: - // x 1 3 5 - "ld1 {v2.4s, v3.4s}, [%[weights]], #32 \n" // load - // weights - // 8-15 - "ext v15.16b, v11.16b, v31.16b, #4 \n" - "ld1 {v15.s}[3], [%[din_ptr1]] \n" // r1 v15: - // 2 4 6 - "sub %[din_ptr1], %[din_ptr1], #8 \n" - "ext v18.16b, v31.16b, v16.16b, #12 \n" // r2 v18: - // x 0 2 4 - "ld1 {v4.4s, v5.4s}, [%[weights]], #32 \n" // load - // weights - // 16-23 - "ext v19.16b, v31.16b, v17.16b, #12 \n" // r2 v19: - // x 1 3 5 - "ld2 {v21.4s, v22.4s}, [%[din_ptr3]], #32 \n" // r3 v21: - // 0 2 4 6, - // v22: 1 3 - // 5 7 - "ext v20.16b, v16.16b, v31.16b, #4 \n" - "ld1 {v20.s}[3], [%[din_ptr2]] \n" // r2 v20: - // 2 4 6 8 - "sub %[din_ptr2], %[din_ptr2], #8 \n" - "ext v23.16b, v31.16b, v21.16b, #12 \n" // r3 v23: - // x 0 2 4 - "ld1 {v30.4s}, [%[weights]] \n" // load - // weights - // 24 - "ext v24.16b, v31.16b, v22.16b, #12 \n" // r3 v24: - // x 1 3 5 - "ld1 {v26.4s}, [%[vbias]] \n" // load - // bias to - // out_r0 - "ext v25.16b, v21.16b, v31.16b, #4 \n" - "ld1 {v25.s}[3], [%[din_ptr3]] \n" // r2 v25: - // 2 4 6 8 - "sub %[din_ptr3], %[din_ptr3], #8 \n" - "mov v27.16b, v26.16b \n" // load - // bias to - // out_r1 - "mov v28.16b, v31.16b \n" // load - // zero to - // out_r0 - "mov v29.16b, v31.16b \n" // load - // zero to - // out_r1 - - "fmla v26.4s, v8.4s, v0.s[0] \n" // out r0: - // w0 - "fmla v28.4s, v9.4s, v0.s[1] \n" // out r0: - // w1 - "fmla v26.4s, v6.4s, v0.s[2] \n" // out r0: - // w2 - "fmla v28.4s, v7.4s, v0.s[3] \n" // out r0: - // w3 - - "ld2 {v8.4s, v9.4s}, [%[din_ptr0]], %[s_8] \n" // next r0 - // v8: 0 2 - // 4 6, v9: - // 1 3 5 7 - - "fmla v26.4s, v10.4s, v1.s[0] \n" // out r0: - // w4 - "fmla v28.4s, v13.4s, v1.s[1] \n" // out r0: - // w5 - "fmla v26.4s, v14.4s, v1.s[2] \n" // out r0: - // w6 - "fmla v28.4s, v11.4s, v1.s[3] \n" // out r0: - // w7 - - "ld2 {v6.4s, v7.4s}, [%[din_ptr0]], %[s_8] \n" // next r0 - // v6: 2 4 - // 6 8, v7: - // 3 5 7 9 - - "fmla v26.4s, v12.4s, v2.s[0] \n" // out r0: - // w8 - "fmla v28.4s, v15.4s, v2.s[1] \n" // out r0: - // w9 - "fmla v26.4s, v18.4s, v2.s[2] \n" // out r0: - // w10 - "fmla v28.4s, v19.4s, v2.s[3] \n" // out r0: - // w11 - - "ld2 {v10.4s, v11.4s}, [%[din_ptr0]], %[s_16] \n" // next r0 - // v10: 4 6 - // 8 10, - // v11: - // trash - // register - - "fmla v26.4s, v16.4s, v3.s[0] \n" // out r0: - // w12 - "fmla v28.4s, v17.4s, v3.s[1] \n" // out r0: - // w13 - "fmla v26.4s, v20.4s, v3.s[2] \n" // 
out r0: - // w14 - "fmla v28.4s, v23.4s, v3.s[3] \n" // out r0: - // w15 - "prfm pldl1keep, [%[din_ptr0]] \n" - - "ld2 {v11.4s, v12.4s}, [%[din_ptr4]], #32 \n" // r4 v11: - // 0 2 4 6, - // v12: 1 3 - // 5 7 - - "fmla v26.4s, v24.4s, v4.s[0] \n" // out r0: - // w16 - "fmla v28.4s, v21.4s, v4.s[1] \n" // out r0: - // w17 - - "ext v13.16b, v31.16b, v11.16b, #12 \n" // r4 v13: - // x 0 2 4 - "ext v14.16b, v31.16b, v12.16b, #12 \n" // r4 v14: - // x 1 3 5 - "ext v15.16b, v11.16b, v31.16b, #4 \n" - - "fmla v26.4s, v22.4s, v4.s[2] \n" // out r0: - // w18 - "fmla v28.4s, v25.4s, v4.s[3] \n" // out r0: - // w19 - - "ld1 {v15.s}[3], [%[din_ptr4]] \n" // r4 v15: - // 2 4 6 - - "fmla v27.4s, v18.4s, v0.s[0] \n" // out r1: - // w0 - "fmla v29.4s, v19.4s, v0.s[1] \n" // out r1: - // w1 - - "sub %[din_ptr4], %[din_ptr4], #8 \n" - - "fmla v27.4s, v16.4s, v0.s[2] \n" // out r1: - // w2 - "fmla v29.4s, v17.4s, v0.s[3] \n" // out r1: - // w3 - "fmla v27.4s, v20.4s, v1.s[0] \n" // out r1: - // w4 - "fmla v29.4s, v23.4s, v1.s[1] \n" // out r1: - // w5 - - "ld2 {v16.4s, v17.4s}, [%[din_ptr5]], #32 \n" // r5 v16: - // 0 2 4 6, - // v17: 1 3 - // 5 7 - - "fmla v27.4s, v24.4s, v1.s[2] \n" // out r1: - // w6 - "fmla v29.4s, v21.4s, v1.s[3] \n" // out r1: - // w7 - - "ext v18.16b, v31.16b, v16.16b, #12 \n" // r5 v18: - // x 0 2 4 - "ext v19.16b, v31.16b, v17.16b, #12 \n" // r5 v19: - // x 1 3 5 - "ext v20.16b, v16.16b, v31.16b, #4 \n" - - "fmla v27.4s, v22.4s, v2.s[0] \n" // out r1: - // w8 - "fmla v29.4s, v25.4s, v2.s[1] \n" // out r1: - // w9 - - "ld1 {v20.s}[3], [%[din_ptr5]] \n" // r5 v20: - // 2 4 6 - "ld2 {v21.4s, v22.4s}, [%[din_ptr6]], #32 \n" // r6 v21: - // 0 2 4 6, - // v22: 1 3 - // 5 7 - - "ext v23.16b, v31.16b, v21.16b, #12 \n" // r6 v23: - // x 0 2 4 - "ext v24.16b, v31.16b, v22.16b, #12 \n" // r6 v24: - // x 1 3 5 - "ext v25.16b, v21.16b, v31.16b, #4 \n" - "sub %[din_ptr5], %[din_ptr5], #8 \n" - - "fmla v26.4s, v11.4s, v5.s[2] \n" // out r0: - // w22 - "fmla v28.4s, v12.4s, v5.s[3] \n" // out r0: - // w23 - - "ld1 {v25.s}[3], [%[din_ptr6]] \n" // r6 v25: - // 2 4 6 - - "fmla v26.4s, v13.4s, v5.s[0] \n" // out r0: - // w20 - "fmla v28.4s, v14.4s, v5.s[1] \n" // out r0: - // w21 - - "sub %[din_ptr6], %[din_ptr6], #8 \n" - - "fmla v26.4s, v15.4s, v30.s[0] \n" // out r0: - // w24 - "fmla v27.4s, v13.4s, v2.s[2] \n" // out r1: - // w10 - - "fadd v26.4s, v26.4s, v28.4s \n" - "fmla v29.4s, v14.4s, v2.s[3] \n" // out r1: - // w11 - "fmax v26.4s, v26.4s, v31.4s \n" - - "ld2 {v13.4s, v14.4s}, [%[din_ptr1]], %[s_8] \n" // next r1 - // v13: 0 2 - // 4 6, - // v14: 1 3 - // 5 7 - "fmla v27.4s, v11.4s, v3.s[0] \n" // out r1: - // w12 - "fmla v29.4s, v12.4s, v3.s[1] \n" // out r1: - // w13 - - "st1 {v26.4s}, [%[dout_ptr0]], %[s_16] \n" // store - // output - // r0 - "ld2 {v11.4s, v12.4s}, [%[din_ptr1]], %[s_8] \n" // next r1 - // v11: 2 4 - // 6 8, - // v12: 3 5 - // 7 9 - - "fmla v27.4s, v15.4s, v3.s[2] \n" // out r1: - // w14 - "fmla v29.4s, v16.4s, v4.s[1] \n" // out r1: - // w17 - "fmla v27.4s, v18.4s, v3.s[3] \n" // out r1: - // w15 - "fmla v29.4s, v19.4s, v4.s[0] \n" // out r1: - // w16 - - "ld2 {v15.4s, v16.4s}, [%[din_ptr1]], %[s_16] \n" // next r1 - // v15: 4 6 - // 8 10, - // v16: - // trash - // register - - "fmla v27.4s, v17.4s, v4.s[2] \n" // out r1: - // w18 - "fmla v29.4s, v20.4s, v4.s[3] \n" // out r1: - // w19 - - "ld2 {v18.4s, v19.4s}, [%[din_ptr2]], %[s_8] \n" // next r2 - // v18: 0 2 - // 4 6, - // v19: 1 3 - // 5 7 - "ld2 {v16.4s, v17.4s}, [%[din_ptr2]], %[s_8] \n" // next r2 - // v16: 2 4 - 
// 6 8, - // v11: 3 5 - // 7 9 - - "fmla v27.4s, v23.4s, v5.s[0] \n" // out r1: - // w20 - "fmla v29.4s, v21.4s, v5.s[2] \n" // out r1: - // w22 - "fmla v27.4s, v24.4s, v5.s[1] \n" // out r1: - // w21 - "fmla v29.4s, v22.4s, v5.s[3] \n" // out r1: - // w23 - - "ld2 {v20.4s, v21.4s}, [%[din_ptr2]], %[s_16] \n" // next r2 - // v20: 4 6 - // 8 10, - // v21: - // trash - // register - "ld2 {v23.4s, v24.4s}, [%[din_ptr3]], %[s_8] \n" // next r3 - // v23: 0 2 - // 4 6, - // v24: 1 3 - // 5 7 - - "fmla v27.4s, v25.4s, v30.s[0] \n" // out r1: - // w24 - - "ld2 {v21.4s, v22.4s}, [%[din_ptr3]], %[s_8] \n" // next r3 - // v21: 2 4 - // 6 8, - // v22: 3 5 - // 7 9 - "ld2 {v25.4s, v26.4s}, [%[din_ptr3]], %[s_16] \n" // next r3 - // v25: 4 6 - // 8 10, - // v26: - // trash - // register - - "fadd v27.4s, v27.4s, v29.4s \n" - "fmax v27.4s, v27.4s, v31.4s \n" - "cmp %w[mid_cnt], #1 \n" - "prfm pldl1keep, [%[din_ptr1]] \n" - "prfm pldl1keep, [%[din_ptr2]] \n" - "prfm pldl1keep, [%[din_ptr3]] \n" - "st1 {v27.4s}, [%[dout_ptr1]], #16 \n" - "blt 2f \n" - - // mid loop - "1: \n" - "ld1 {v26.4s}, [%[vbias]] \n" - "mov v27.16b, v26.16b \n" - "mov v28.16b, v31.16b \n" - "mov v29.16b, v31.16b \n" - - // out_r0 r0-r3 - "fmla v26.4s, v8.4s, v0.s[0] \n" - "fmla v28.4s, v9.4s, v0.s[1] \n" - "fmla v26.4s, v6.4s, v0.s[2] \n" - "fmla v28.4s, v7.4s, v0.s[3] \n" - - "ld2 {v8.4s, v9.4s}, [%[din_ptr0]], %[s_8] \n" - - "fmla v26.4s, v10.4s, v1.s[0] \n" - "fmla v28.4s, v11.4s, v1.s[3] \n" - - "ld2 {v6.4s, v7.4s}, [%[din_ptr0]], %[s_8] \n" - - "fmla v26.4s, v14.4s, v1.s[2] \n" - "fmla v28.4s, v13.4s, v1.s[1] \n" - - "ld2 {v10.4s, v11.4s}, [%[din_ptr0]], %[s_16] \n" - "prfm pldl1keep, [%[din_ptr0]] \n" - - "fmla v26.4s, v12.4s, v2.s[0] \n" - "fmla v28.4s, v15.4s, v2.s[1] \n" - - "ld2 {v13.4s, v14.4s}, [%[din_ptr4]], %[s_8] \n" - - "fmla v26.4s, v16.4s, v3.s[0] \n" - "fmla v27.4s, v16.4s, v0.s[2] \n" - - "ld2 {v11.4s, v12.4s}, [%[din_ptr4]], %[s_8] \n" - - "fmla v28.4s, v19.4s, v2.s[3] \n" - "fmla v29.4s, v19.4s, v0.s[1] \n" - - "ld2 {v15.4s, v16.4s}, [%[din_ptr4]], %[s_16] \n" - "prfm pldl1keep, [%[din_ptr4]] \n" - - "fmla v26.4s, v18.4s, v2.s[2] \n" - "fmla v27.4s, v18.4s, v0.s[0] \n" - - "fmla v28.4s, v17.4s, v3.s[1] \n" - "fmla v29.4s, v17.4s, v0.s[3] \n" - - "ld2 {v18.4s, v19.4s}, [%[din_ptr5]], %[s_8] \n" - - "fmla v26.4s, v20.4s, v3.s[2] \n" - "fmla v27.4s, v20.4s, v1.s[0] \n" - - "ld2 {v16.4s, v17.4s}, [%[din_ptr5]], %[s_8] \n" - - "fmla v29.4s, v21.4s, v1.s[3] \n" - "fmla v28.4s, v21.4s, v4.s[1] \n" - "fmla v28.4s, v23.4s, v3.s[3] \n" - "fmla v29.4s, v23.4s, v1.s[1] \n" - - "ld2 {v20.4s, v21.4s}, [%[din_ptr5]], %[s_16] \n" - "prfm pldl1keep, [%[din_ptr5]] \n" - - "fmla v26.4s, v24.4s, v4.s[0] \n" - "fmla v27.4s, v24.4s, v1.s[2] \n" - - "ld2 {v23.4s, v24.4s}, [%[din_ptr6]], %[s_8] \n" - - "fmla v27.4s, v22.4s, v2.s[0] \n" - "fmla v26.4s, v22.4s, v4.s[2] \n" - - "fmla v28.4s, v25.4s, v4.s[3] \n" - "fmla v29.4s, v25.4s, v2.s[1] \n" - - "ld2 {v21.4s, v22.4s}, [%[din_ptr6]], %[s_8] \n" - "fadd v28.4s, v26.4s, v28.4s \n" - - "ld2 {v25.4s, v26.4s}, [%[din_ptr6]], %[s_16] \n" - "mov v26.16b, v31.16b \n" - "prfm pldl1keep, [%[din_ptr6]] \n" - - "fmla v26.4s, v13.4s, v5.s[0] \n" - "fmla v28.4s, v14.4s, v5.s[1] \n" - "fmla v27.4s, v13.4s, v2.s[2] \n" - "fmla v29.4s, v14.4s, v2.s[3] \n" - - "ld2 {v13.4s, v14.4s}, [%[din_ptr1]], %[s_8] \n" - - "fmla v26.4s, v11.4s, v5.s[2] \n" - "fmla v28.4s, v12.4s, v5.s[3] \n" - "fmla v27.4s, v11.4s, v3.s[0] \n" - "fmla v29.4s, v12.4s, v3.s[1] \n" - - "ld2 {v11.4s, v12.4s}, [%[din_ptr1]], %[s_8] 
\n" - - "fmla v26.4s, v15.4s, v30.s[0] \n" - "fmla v27.4s, v15.4s, v3.s[2] \n" - "fmla v29.4s, v16.4s, v4.s[1] \n" - "fmla v27.4s, v17.4s, v4.s[2] \n" - - "ld2 {v15.4s, v16.4s}, [%[din_ptr1]], %[s_16] \n" - "prfm pldl1keep, [%[din_ptr1]] \n" - - "fmla v29.4s, v18.4s, v3.s[3] \n" - "fmla v27.4s, v19.4s, v4.s[0] \n" - - "ld2 {v18.4s, v19.4s}, [%[din_ptr2]], %[s_8] \n" - - "fmla v29.4s, v20.4s, v4.s[3] \n" - - "ld2 {v16.4s, v17.4s}, [%[din_ptr2]], %[s_8] \n" - - "fmla v27.4s, v23.4s, v5.s[0] \n" - "fmla v27.4s, v21.4s, v5.s[2] \n" - - "ld2 {v20.4s, v21.4s}, [%[din_ptr2]], %[s_16] \n" - - "fmla v29.4s, v24.4s, v5.s[1] \n" - - "ld2 {v23.4s, v24.4s}, [%[din_ptr3]], %[s_8] \n" - "prfm pldl1keep, [%[din_ptr2]] \n" - - "fmla v29.4s, v22.4s, v5.s[3] \n" - - "ld2 {v21.4s, v22.4s}, [%[din_ptr3]], %[s_8] \n" - - "fmla v27.4s, v25.4s, v30.s[0] \n" - - "fadd v26.4s, v26.4s, v28.4s \n" - "fadd v27.4s, v27.4s, v29.4s \n" - "fmax v26.4s, v26.4s, v31.4s \n" - "fmax v27.4s, v27.4s, v31.4s \n" - - "prfm pldl1keep, [%[din_ptr3]] \n" - "st1 {v26.4s}, [%[dout_ptr0]], #16 \n" - "st1 {v27.4s}, [%[dout_ptr1]], #16 \n" - - "ld2 {v25.4s, v26.4s}, [%[din_ptr3]], %[s_16] \n" - "subs %w[mid_cnt], %w[mid_cnt], #1 \n" - "bne 1b \n" - - "2: \n" - "ld2 {v26.4s, v27.4s}, [%[mask]], %[s_8] \n" - "ld2 {v28.4s, v29.4s}, [%[mask]], %[s_8] \n" - "bif v8.16b, v31.16b, v26.16b \n" - "bif v9.16b, v31.16b, v27.16b \n" - "bif v6.16b, v31.16b, v28.16b \n" - "bif v7.16b, v31.16b, v29.16b \n" - - "bif v13.16b, v31.16b, v26.16b \n" - "bif v14.16b, v31.16b, v27.16b \n" - "bif v11.16b, v31.16b, v28.16b \n" - "bif v12.16b, v31.16b, v29.16b \n" - - "bif v18.16b, v31.16b, v26.16b \n" - "bif v19.16b, v31.16b, v27.16b \n" - "bif v16.16b, v31.16b, v28.16b \n" - "bif v17.16b, v31.16b, v29.16b \n" - - "bif v23.16b, v31.16b, v26.16b \n" - "bif v24.16b, v31.16b, v27.16b \n" - "bif v21.16b, v31.16b, v28.16b \n" - "bif v22.16b, v31.16b, v29.16b \n" - - "ld2 {v28.4s, v29.4s}, [%[mask]] \n" - "ld1 {v26.4s}, [%[vbias]] \n" - "mov v29.16b, v31.16b \n" - - "bif v10.16b, v31.16b, v28.16b \n" - "bif v15.16b, v31.16b, v28.16b \n" - - "mov v27.16b, v26.16b \n" - - "bif v20.16b, v31.16b, v28.16b \n" - "bif v25.16b, v31.16b, v28.16b \n" - "mov v28.16b, v31.16b \n" - - "fmla v26.4s, v8.4s, v0.s[0] \n" - "fmla v28.4s, v9.4s, v0.s[1] \n" - "fmla v26.4s, v6.4s, v0.s[2] \n" - "fmla v28.4s, v7.4s, v0.s[3] \n" - - "fmla v26.4s, v10.4s, v1.s[0] \n" - "fmla v28.4s, v13.4s, v1.s[1] \n" - "fmla v26.4s, v14.4s, v1.s[2] \n" - "fmla v28.4s, v11.4s, v1.s[3] \n" - - "sub %[mask], %[mask], #16 \n" - "ld2 {v6.4s, v7.4s}, [%[mask]], %[s_8] \n" - "ld2 {v8.4s, v9.4s}, [%[mask]], %[s_8] \n" - "ld2 {v10.4s, v11.4s}, [%[mask]] \n" - - "fmla v26.4s, v12.4s, v2.s[0] \n" - "fmla v28.4s, v15.4s, v2.s[1] \n" - - "ld2 {v13.4s, v14.4s}, [%[din_ptr4]], %[s_8] \n" - - "fmla v26.4s, v16.4s, v3.s[0] \n" - "fmla v28.4s, v17.4s, v3.s[1] \n" - - "ld2 {v11.4s, v12.4s}, [%[din_ptr4]], %[s_8] \n" - - "fmla v27.4s, v16.4s, v0.s[2] \n" - "fmla v29.4s, v17.4s, v0.s[3] \n" - - "ld2 {v15.4s, v16.4s}, [%[din_ptr4]] \n" - - "fmla v26.4s, v18.4s, v2.s[2] \n" - "fmla v28.4s, v19.4s, v2.s[3] \n" - "fmla v27.4s, v18.4s, v0.s[0] \n" - "fmla v29.4s, v19.4s, v0.s[1] \n" - - "bif v13.16b, v31.16b, v6.16b \n" - "bif v14.16b, v31.16b, v7.16b \n" - "bif v11.16b, v31.16b, v8.16b \n" - "bif v12.16b, v31.16b, v9.16b \n" - "bif v15.16b, v31.16b, v10.16b \n" - - "ld2 {v18.4s, v19.4s}, [%[din_ptr5]], %[s_8] \n" - - "fmla v26.4s, v20.4s, v3.s[2] \n" - "fmla v27.4s, v20.4s, v1.s[0] \n" - - "ld2 {v16.4s, v17.4s}, [%[din_ptr5]], 
%[s_8] \n" - - "fmla v29.4s, v21.4s, v1.s[3] \n" - "fmla v28.4s, v21.4s, v4.s[1] \n" - - "ld2 {v20.4s, v21.4s}, [%[din_ptr5]] \n" - - "fmla v28.4s, v23.4s, v3.s[3] \n" - "fmla v29.4s, v23.4s, v1.s[1] \n" - "fmla v27.4s, v24.4s, v1.s[2] \n" - "fmla v26.4s, v24.4s, v4.s[0] \n" - - "bif v18.16b, v31.16b, v6.16b \n" - "bif v19.16b, v31.16b, v7.16b \n" - "bif v16.16b, v31.16b, v8.16b \n" - "bif v17.16b, v31.16b, v9.16b \n" - "bif v20.16b, v31.16b, v10.16b \n" - - "ld2 {v23.4s, v24.4s}, [%[din_ptr6]], %[s_8] \n" - - "fmla v27.4s, v22.4s, v2.s[0] \n" - "fmla v26.4s, v22.4s, v4.s[2] \n" - - "ld2 {v21.4s, v22.4s}, [%[din_ptr6]], %[s_8] \n" - - "fmla v28.4s, v25.4s, v4.s[3] \n" - "fmla v29.4s, v25.4s, v2.s[1] \n" - "fadd v28.4s, v28.4s, v26.4s \n" - - "ld2 {v25.4s, v26.4s}, [%[din_ptr6]] \n" - "mov v26.16b, v31.16b \n" - - "bif v23.16b, v31.16b, v6.16b \n" - "bif v24.16b, v31.16b, v7.16b \n" - "bif v21.16b, v31.16b, v8.16b \n" - "bif v22.16b, v31.16b, v9.16b \n" - "bif v25.16b, v31.16b, v10.16b \n" - - "fmla v26.4s, v13.4s, v5.s[0] \n" - "fmla v28.4s, v14.4s, v5.s[1] \n" - "fmla v26.4s, v11.4s, v5.s[2] \n" - "fmla v28.4s, v12.4s, v5.s[3] \n" - "fmla v26.4s, v15.4s, v30.s[0] \n" - - "fmla v27.4s, v13.4s, v2.s[2] \n" - "fmla v29.4s, v14.4s, v2.s[3] \n" - "fmla v27.4s, v11.4s, v3.s[0] \n" - "fmla v29.4s, v12.4s, v3.s[1] \n" - - "fadd v26.4s, v26.4s, v28.4s \n" - "fmla v27.4s, v15.4s, v3.s[2] \n" - "fmla v29.4s, v18.4s, v3.s[3] \n" - "fmla v27.4s, v19.4s, v4.s[0] \n" - "fmla v29.4s, v16.4s, v4.s[1] \n" - - "fmax v26.4s, v26.4s, v31.4s \n" - "fmla v27.4s, v17.4s, v4.s[2] \n" - "fmla v29.4s, v20.4s, v4.s[3] \n" - "fmla v27.4s, v23.4s, v5.s[0] \n" - "fmla v29.4s, v24.4s, v5.s[1] \n" - - "st1 {v26.4s}, [%[out_buf0]] \n" - "fmla v27.4s, v21.4s, v5.s[2] \n" - "fmla v29.4s, v22.4s, v5.s[3] \n" - "fmla v27.4s, v25.4s, v30.s[0] \n" - "fadd v27.4s, v27.4s, v29.4s \n" - - "fmax v27.4s, v27.4s, v31.4s \n" - "st1 {v27.4s}, [%[out_buf1]] \n" - - : [dout_ptr0] "+r"(dout_ptr0), - [dout_ptr1] "+r"(dout_ptr1), - [mid_cnt] "+r"(loop), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [din_ptr6] "+r"(din_ptr6), - [mask] "+r"(mask_ptr), - [weights] "+r"(weights_ptr) - : [vbias] "r"(vbias), - [out_buf0] "r"(out_buf0), - [out_buf1] "r"(out_buf1), - [s_8] "r"(s_8), - [s_16] "r"(s_16) - : "memory", - "cc", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "v26", - "v27", - "v28", - "v29", - "v30", - "v31"); - - int remain_cnt = w_out - (mid_cnt + 1) * 4; - for (int i = 0; i < remain_cnt; ++i) { - dout_ptr0[i] = out_buf0[i]; - dout_ptr1[i] = out_buf1[i]; - } - din0 = din4; - din1 = din5; - din2 = din6; - din3 = din6 + w_in; - din4 = din3 + w_in; - din5 = din4 + w_in; - din6 = din5 + w_in; - dout0 = dout1 + w_out; - dout1 = dout0 + w_out; - } - } - } -} - -//! 
small depthwise, win < 9; -void conv_depthwise_5x5s2p2_s(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - CHECK_LT(w_in, 9) << "only support win < 9"; - int w_out_round = (w_out + 3) / 4 * 4; - int mask_cnt = 12 - w_in - 2; - int mask[12]; - memset(mask, 0xff, 12 * sizeof(int)); - for (int i = 0; i < mask_cnt; ++i) { - mask[11 - i] = 0; - } - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - int in_spatial_size = w_in * h_in; - int out_spatial_size = w_out * h_out; - int weights_saptial_size = 25; - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * in_spatial_size * ch_in; - float* dout_batch = dout + n * out_spatial_size * ch_out; -#pragma omp parallel for - for (int c = 0; c < ch_in; ++c) { - const float* din_ch = din_batch + c * in_spatial_size; - float* dout_ch = dout_batch + c * out_spatial_size; - const float* din0 = zero_ptr; - const float* din1 = zero_ptr; - const float* din2 = din_ch; - const float* din3 = din2 + w_in; - const float* din4 = din3 + w_in; - - float out_buf0[4]; - float out_buf1[4]; - float* dout0 = dout_ch; - float* dout1 = dout0 + w_out; - - const float* weights_c = weights + c * weights_saptial_size; - for (int h = 0; h < h_out; h += 1) { - //! (h * 2 - 2) + 4 > h_in - 1 - if (h * 2 + 3 > h_in) { - switch (h * 2 + 3 - h_in) { - case 4: - din1 = zero_ptr; - case 3: - din2 = zero_ptr; - case 2: - din3 = zero_ptr; - case 1: - din4 = zero_ptr; - default: - break; - } - } - - const float* din_ptr0 = din0; - const float* din_ptr1 = din1; - const float* din_ptr2 = din2; - const float* din_ptr3 = din3; - const float* din_ptr4 = din4; - - const float* weights_ptr = weights_c; - float* dout_ptr0 = dout0; - - float bias_c = 0.f; - if (flag_bias) { - bias_c = bias[c]; - } - float vbias[4] = {bias_c, bias_c, bias_c, bias_c}; - int* mask_ptr = mask; - const int s_8 = 8; - //! in r0/r4, r1, r2, r3: x 0 2 4 -- v8 v13 v18 v23 v28 - //! in r0/r4, r1, r2, r3: x 1 3 5 -- v9 v14 v19 v24 v29 - //! in r0/r4, r1, r2, r3: 0 2 4 6 -- v6 v11 v16 v21 v26 - //! in r0/r4, r1, r2, r3: 1 3 5 7 -- v7 v12 v17 v22 v27 - //! in r0/r4, r1, r2, r3: 2 4 6 8 -- v10 v15 v20 v25 v30 - //! out r0 -- v4 - asm volatile( - "movi v31.4s, #0x0\n" - "prfm pldl1keep, [%[din_ptr0]] \n" - "prfm pldl1keep, [%[din_ptr1]] \n" - "prfm pldl1keep, [%[din_ptr2]] \n" - "prfm pldl1keep, [%[din_ptr3]] \n" - "prfm pldl1keep, [%[din_ptr4]] \n" - "prfm pldl1keep, [%[weights]] \n" - "prfm pldl1keep, [%[mask]] \n" - - //! load mask - "ld2 {v0.4s, v1.4s}, [%[mask]], %[s_8] \n" - "ld2 {v2.4s, v3.4s}, [%[mask]], %[s_8] \n" - "ld2 {v4.4s, v5.4s}, [%[mask]] \n" - - //! 
load and extract input - "ld2 {v6.4s, v7.4s}, [%[din_ptr0]], #32 \n" - "ld2 {v11.4s, v12.4s}, [%[din_ptr1]], #32 \n" - "ld2 {v16.4s, v17.4s}, [%[din_ptr2]], #32 \n" - "ld2 {v21.4s, v22.4s}, [%[din_ptr3]], #32 \n" - "ld2 {v26.4s, v27.4s}, [%[din_ptr4]], #32 \n" - - "ext v8.16b, v31.16b, v6.16b, #12 \n" - "ext v9.16b, v31.16b, v7.16b, #12 \n" - "ext v13.16b, v31.16b, v11.16b, #12 \n" - "ext v14.16b, v31.16b, v12.16b, #12 \n" - - "ext v18.16b, v31.16b, v16.16b, #12 \n" - "ext v19.16b, v31.16b, v17.16b, #12 \n" - "ext v23.16b, v31.16b, v21.16b, #12 \n" - "ext v24.16b, v31.16b, v22.16b, #12 \n" - "ext v28.16b, v31.16b, v26.16b, #12 \n" - "ext v29.16b, v31.16b, v27.16b, #12 \n" - - "ext v10.16b, v6.16b, v31.16b, #4 \n" - "ext v15.16b, v11.16b, v31.16b, #4 \n" - "ext v20.16b, v16.16b, v31.16b, #4 \n" - "ext v25.16b, v21.16b, v31.16b, #4 \n" - "ext v30.16b, v26.16b, v31.16b, #4 \n" - - "bif v8.16b, v31.16b, v0.16b \n" - "bif v9.16b, v31.16b, v1.16b \n" - "bif v6.16b, v31.16b, v2.16b \n" - "bif v7.16b, v31.16b, v3.16b \n" - - "bif v13.16b, v31.16b, v0.16b \n" - "bif v14.16b, v31.16b, v1.16b \n" - "bif v11.16b, v31.16b, v2.16b \n" - "bif v12.16b, v31.16b, v3.16b \n" - - "bif v18.16b, v31.16b, v0.16b \n" - "bif v19.16b, v31.16b, v1.16b \n" - "bif v16.16b, v31.16b, v2.16b \n" - "bif v17.16b, v31.16b, v3.16b \n" - - "ld1 {v10.s}[3], [%[din_ptr0]] \n" - "ld1 {v15.s}[3], [%[din_ptr1]] \n" - "ld1 {v20.s}[3], [%[din_ptr2]] \n" - "ld1 {v25.s}[3], [%[din_ptr3]] \n" - "ld1 {v30.s}[3], [%[din_ptr4]] \n" - - "bif v23.16b, v31.16b, v0.16b \n" - "bif v24.16b, v31.16b, v1.16b \n" - "bif v21.16b, v31.16b, v2.16b \n" - "bif v22.16b, v31.16b, v3.16b \n" - - "bif v28.16b, v31.16b, v0.16b \n" - "bif v29.16b, v31.16b, v1.16b \n" - "bif v26.16b, v31.16b, v2.16b \n" - "bif v27.16b, v31.16b, v3.16b \n" - - "bif v10.16b, v31.16b, v4.16b \n" - "bif v15.16b, v31.16b, v4.16b \n" - "bif v20.16b, v31.16b, v4.16b \n" - "bif v25.16b, v31.16b, v4.16b \n" - "bif v30.16b, v31.16b, v4.16b \n" - - "ld1 {v4.4s}, [%[vbias]] \n" - "mov v5.16b, v31.16b \n" - - "ld1 {v0.4s, v1.4s}, [%[weights]], #32 \n" // load weights 0-7 - "ld1 {v2.4s, v3.4s}, [%[weights]], #32 \n" // load weights 8-15 - - //! 
compute - "fmla v4.4s, v8.4s, v0.s[0] \n" // out r0: w0 - "fmla v5.4s, v9.4s, v0.s[1] \n" // out r0: w1 - "fmla v4.4s, v6.4s, v0.s[2] \n" // out r0: w2 - "fmla v5.4s, v7.4s, v0.s[3] \n" // out r0: w3 - - "fmla v4.4s, v10.4s, v1.s[0] \n" // out r0: w4 - "fmla v5.4s, v13.4s, v1.s[1] \n" // out r0: w5 - "fmla v4.4s, v14.4s, v1.s[2] \n" // out r0: w6 - "fmla v5.4s, v11.4s, v1.s[3] \n" // out r0: w7 - - "ld1 {v6.4s, v7.4s}, [%[weights]], #32 \n" // load weights 16-23 - "ld1 {v8.s}[0], [%[weights]] \n" // load weights 24 - - "fmla v4.4s, v12.4s, v2.s[0] \n" // out r0: w8 - "fmla v5.4s, v15.4s, v2.s[1] \n" // out r0: w9 - "fmla v4.4s, v18.4s, v2.s[2] \n" // out r0: w10 - "fmla v5.4s, v19.4s, v2.s[3] \n" // out r0: w11 - - "fmla v4.4s, v16.4s, v3.s[0] \n" // out r0: w12 - "fmla v5.4s, v17.4s, v3.s[1] \n" // out r0: w13 - "fmla v4.4s, v20.4s, v3.s[2] \n" // out r0: w14 - "fmla v5.4s, v23.4s, v3.s[3] \n" // out r0: w15 - - "fmla v4.4s, v24.4s, v6.s[0] \n" // out r0: w16 - "fmla v5.4s, v21.4s, v6.s[1] \n" // out r0: w17 - "fmla v4.4s, v22.4s, v6.s[2] \n" // out r0: w18 - "fmla v5.4s, v25.4s, v6.s[3] \n" // out r0: w19 - - "fmla v4.4s, v28.4s, v7.s[0] \n" // out r0: w20 - "fmla v5.4s, v29.4s, v7.s[1] \n" // out r0: w21 - "fmla v4.4s, v26.4s, v7.s[2] \n" // out r0: w22 - "fmla v5.4s, v27.4s, v7.s[3] \n" // out r0: w23 - "fmla v4.4s, v30.4s, v8.s[0] \n" // out r0: w24 - - "fadd v4.4s, v4.4s, v5.4s \n" // add out to v4 - "st1 {v4.4s}, [%[out_buf0]] \n" - - : [dout_ptr0] "+r"(dout_ptr0), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [mask] "+r"(mask_ptr), - [weights] "+r"(weights_ptr) - : [vbias] "r"(vbias), - [out_buf0] "r"(out_buf0), - [out_buf1] "r"(out_buf1), - [s_8] "r"(s_8) - : "memory", - "cc", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "v26", - "v27", - "v28", - "v29", - "v30", - "v31"); - for (int i = 0; i < w_out; ++i) { - dout_ptr0[i] = out_buf0[i]; - } - din0 = din2; - din1 = din3; - din2 = din4; - din3 = din2 + w_in; - din4 = din3 + w_in; - dout0 += w_out; - } - } - } -} - -//! 
small depthwise, win < 9; -void conv_depthwise_5x5s2p2_relu_s(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - CHECK_LT(w_in, 9) << "only support win < 9"; - int w_out_round = (w_out + 3) / 4 * 4; - int mask_cnt = 12 - w_in - 2; - int mask[12]; - memset(mask, 0xff, 12 * sizeof(int)); - for (int i = 0; i < mask_cnt; ++i) { - mask[11 - i] = 0; - } - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - int in_spatial_size = w_in * h_in; - int out_spatial_size = w_out * h_out; - int weights_saptial_size = 25; - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * in_spatial_size * ch_in; - float* dout_batch = dout + n * out_spatial_size * ch_out; -#pragma omp parallel for - for (int c = 0; c < ch_in; ++c) { - const float* din_ch = din_batch + c * in_spatial_size; - float* dout_ch = dout_batch + c * out_spatial_size; - const float* din0 = zero_ptr; - const float* din1 = zero_ptr; - const float* din2 = din_ch; - const float* din3 = din2 + w_in; - const float* din4 = din3 + w_in; - - float out_buf0[4]; - float out_buf1[4]; - float* dout0 = dout_ch; - float* dout1 = dout0 + w_out; - - const float* weights_c = weights + c * weights_saptial_size; - for (int h = 0; h < h_out; h += 1) { - //! (h * 2 - 2) + 4 > h_in - 1 - if (h * 2 + 3 > h_in) { - switch (h * 2 + 3 - h_in) { - case 4: - din1 = zero_ptr; - case 3: - din2 = zero_ptr; - case 2: - din3 = zero_ptr; - case 1: - din4 = zero_ptr; - default: - break; - } - } - const float* din_ptr0 = din0; - const float* din_ptr1 = din1; - const float* din_ptr2 = din2; - const float* din_ptr3 = din3; - const float* din_ptr4 = din4; - - const float* weights_ptr = weights_c; - float* dout_ptr0 = dout0; - - float bias_c = 0.f; - if (flag_bias) { - bias_c = bias[c]; - } - float vbias[4] = {bias_c, bias_c, bias_c, bias_c}; - int* mask_ptr = mask; - const int s_8 = 8; - //! in r0/r4, r1, r2, r3: x 0 2 4 -- v8 v13 v18 v23 v28 - //! in r0/r4, r1, r2, r3: x 1 3 5 -- v9 v14 v19 v24 v29 - //! in r0/r4, r1, r2, r3: 0 2 4 6 -- v6 v11 v16 v21 v26 - //! in r0/r4, r1, r2, r3: 1 3 5 7 -- v7 v12 v17 v22 v27 - //! in r0/r4, r1, r2, r3: 2 4 6 8 -- v10 v15 v20 v25 v30 - //! out r0 -- v4 - asm volatile( - "movi v31.4s, #0x0\n" - "prfm pldl1keep, [%[din_ptr0]] \n" - "prfm pldl1keep, [%[din_ptr1]] \n" - "prfm pldl1keep, [%[din_ptr2]] \n" - "prfm pldl1keep, [%[din_ptr3]] \n" - "prfm pldl1keep, [%[din_ptr4]] \n" - "prfm pldl1keep, [%[weights]] \n" - "prfm pldl1keep, [%[mask]] \n" - - //! load mask - "ld2 {v0.4s, v1.4s}, [%[mask]], %[s_8] \n" - "ld2 {v2.4s, v3.4s}, [%[mask]], %[s_8] \n" - "ld2 {v4.4s, v5.4s}, [%[mask]] \n" - - //! 
load and extract input - "ld2 {v6.4s, v7.4s}, [%[din_ptr0]], #32 \n" - "ld2 {v11.4s, v12.4s}, [%[din_ptr1]], #32 \n" - "ld2 {v16.4s, v17.4s}, [%[din_ptr2]], #32 \n" - "ld2 {v21.4s, v22.4s}, [%[din_ptr3]], #32 \n" - "ld2 {v26.4s, v27.4s}, [%[din_ptr4]], #32 \n" - - "ext v8.16b, v31.16b, v6.16b, #12 \n" - "ext v9.16b, v31.16b, v7.16b, #12 \n" - "ext v13.16b, v31.16b, v11.16b, #12 \n" - "ext v14.16b, v31.16b, v12.16b, #12 \n" - - "ext v18.16b, v31.16b, v16.16b, #12 \n" - "ext v19.16b, v31.16b, v17.16b, #12 \n" - "ext v23.16b, v31.16b, v21.16b, #12 \n" - "ext v24.16b, v31.16b, v22.16b, #12 \n" - "ext v28.16b, v31.16b, v26.16b, #12 \n" - "ext v29.16b, v31.16b, v27.16b, #12 \n" - - "ext v10.16b, v6.16b, v31.16b, #4 \n" - "ext v15.16b, v11.16b, v31.16b, #4 \n" - "ext v20.16b, v16.16b, v31.16b, #4 \n" - "ext v25.16b, v21.16b, v31.16b, #4 \n" - "ext v30.16b, v26.16b, v31.16b, #4 \n" - - "bif v8.16b, v31.16b, v0.16b \n" - "bif v9.16b, v31.16b, v1.16b \n" - "bif v6.16b, v31.16b, v2.16b \n" - "bif v7.16b, v31.16b, v3.16b \n" - - "bif v13.16b, v31.16b, v0.16b \n" - "bif v14.16b, v31.16b, v1.16b \n" - "bif v11.16b, v31.16b, v2.16b \n" - "bif v12.16b, v31.16b, v3.16b \n" - - "bif v18.16b, v31.16b, v0.16b \n" - "bif v19.16b, v31.16b, v1.16b \n" - "bif v16.16b, v31.16b, v2.16b \n" - "bif v17.16b, v31.16b, v3.16b \n" - - "ld1 {v10.s}[3], [%[din_ptr0]] \n" - "ld1 {v15.s}[3], [%[din_ptr1]] \n" - "ld1 {v20.s}[3], [%[din_ptr2]] \n" - "ld1 {v25.s}[3], [%[din_ptr3]] \n" - "ld1 {v30.s}[3], [%[din_ptr4]] \n" - - "bif v23.16b, v31.16b, v0.16b \n" - "bif v24.16b, v31.16b, v1.16b \n" - "bif v21.16b, v31.16b, v2.16b \n" - "bif v22.16b, v31.16b, v3.16b \n" - - "bif v28.16b, v31.16b, v0.16b \n" - "bif v29.16b, v31.16b, v1.16b \n" - "bif v26.16b, v31.16b, v2.16b \n" - "bif v27.16b, v31.16b, v3.16b \n" - - "bif v10.16b, v31.16b, v4.16b \n" - "bif v15.16b, v31.16b, v4.16b \n" - "bif v20.16b, v31.16b, v4.16b \n" - "bif v25.16b, v31.16b, v4.16b \n" - "bif v30.16b, v31.16b, v4.16b \n" - - "ld1 {v4.4s}, [%[vbias]] \n" - "mov v5.16b, v31.16b \n" - - "ld1 {v0.4s, v1.4s}, [%[weights]], #32 \n" // load weights 0-7 - "ld1 {v2.4s, v3.4s}, [%[weights]], #32 \n" // load weights 8-15 - - //! 
compute - "fmla v4.4s, v8.4s, v0.s[0] \n" // out r0: w0 - "fmla v5.4s, v9.4s, v0.s[1] \n" // out r0: w1 - "fmla v4.4s, v6.4s, v0.s[2] \n" // out r0: w2 - "fmla v5.4s, v7.4s, v0.s[3] \n" // out r0: w3 - - "fmla v4.4s, v10.4s, v1.s[0] \n" // out r0: w4 - "fmla v5.4s, v13.4s, v1.s[1] \n" // out r0: w5 - "fmla v4.4s, v14.4s, v1.s[2] \n" // out r0: w6 - "fmla v5.4s, v11.4s, v1.s[3] \n" // out r0: w7 - - "ld1 {v6.4s, v7.4s}, [%[weights]], #32 \n" // load weights 16-23 - "ld1 {v8.s}[0], [%[weights]] \n" // load weights 24 - - "fmla v4.4s, v12.4s, v2.s[0] \n" // out r0: w8 - "fmla v5.4s, v15.4s, v2.s[1] \n" // out r0: w9 - "fmla v4.4s, v18.4s, v2.s[2] \n" // out r0: w10 - "fmla v5.4s, v19.4s, v2.s[3] \n" // out r0: w11 - - "fmla v4.4s, v16.4s, v3.s[0] \n" // out r0: w12 - "fmla v5.4s, v17.4s, v3.s[1] \n" // out r0: w13 - "fmla v4.4s, v20.4s, v3.s[2] \n" // out r0: w14 - "fmla v5.4s, v23.4s, v3.s[3] \n" // out r0: w15 - - "fmla v4.4s, v24.4s, v6.s[0] \n" // out r0: w16 - "fmla v5.4s, v21.4s, v6.s[1] \n" // out r0: w17 - "fmla v4.4s, v22.4s, v6.s[2] \n" // out r0: w18 - "fmla v5.4s, v25.4s, v6.s[3] \n" // out r0: w19 - - "fmla v4.4s, v28.4s, v7.s[0] \n" // out r0: w20 - "fmla v5.4s, v29.4s, v7.s[1] \n" // out r0: w21 - "fmla v4.4s, v26.4s, v7.s[2] \n" // out r0: w22 - "fmla v5.4s, v27.4s, v7.s[3] \n" // out r0: w23 - "fmla v4.4s, v30.4s, v8.s[0] \n" // out r0: w24 - - "fadd v4.4s, v4.4s, v5.4s \n" // add out to v4 - "fmax v4.4s, v4.4s, v31.4s \n" - "st1 {v4.4s}, [%[out_buf0]] \n" - - : [dout_ptr0] "+r"(dout_ptr0), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [mask] "+r"(mask_ptr), - [weights] "+r"(weights_ptr) - : [vbias] "r"(vbias), - [out_buf0] "r"(out_buf0), - [out_buf1] "r"(out_buf1), - [s_8] "r"(s_8) - : "memory", - "cc", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "v26", - "v27", - "v28", - "v29", - "v30", - "v31"); - for (int i = 0; i < w_out; ++i) { - dout_ptr0[i] = out_buf0[i]; - } - din0 = din2; - din1 = din3; - din2 = din4; - din3 = din2 + w_in; - din4 = din3 + w_in; - dout0 += w_out; - } - } - } -} - -#else - -//! 
larger depthwise, win >= 9; -void conv_depthwise_5x5s2p2(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - // printf("invoke 5x5s2p2 armv7\n"); - CHECK_GE(w_in, 9) << "only support win >= 9"; - int w_out_round = (w_out + 3) / 4 * 4; - int cnt = (w_out_round - 4) / 4; - int mid_cnt = cnt - 1; - int right_start = cnt * 2 * 4 - 2; - int mask_cnt = 12 - (w_in - right_start); - int mask[12]; - memset(mask, 0xff, 12 * sizeof(int)); - for (int i = 0; i < mask_cnt; ++i) { - mask[11 - i] = 0; - } - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - int in_spatial_size = w_in * h_in; - int out_spatial_size = w_out * h_out; - int weights_saptial_size = 25; - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * in_spatial_size * ch_in; - float* dout_batch = dout + n * out_spatial_size * ch_out; -#pragma omp parallel for - for (int c = 0; c < ch_in; ++c) { - const float* din_ch = din_batch + c * in_spatial_size; - float* dout_ch = dout_batch + c * out_spatial_size; - const float* din0 = zero_ptr; - const float* din1 = zero_ptr; - const float* din2 = din_ch; - const float* din3 = din2 + w_in; - const float* din4 = din3 + w_in; - - float out_buf0[4]; - float* dout0 = dout_ch; - - const float* weights_c = weights + c * weights_saptial_size; - float32x4_t w0 = vld1q_f32(weights_c); - float32x4_t w1 = vld1q_f32(weights_c + 4); - float32x4_t w2 = vld1q_f32(weights_c + 8); - float32x4_t w3 = vld1q_f32(weights_c + 12); - float32x4_t w4 = vld1q_f32(weights_c + 16); - float32x4_t w5 = vld1q_f32(weights_c + 20); - for (int h = 0; h < h_out; h += 1) { - //! (h * 2 - 2) + 4 > h_in - 1 - if (h * 2 + 3 > h_in) { - switch (h * 2 + 3 - h_in) { - case 4: - din1 = zero_ptr; - case 3: - din2 = zero_ptr; - case 2: - din3 = zero_ptr; - case 1: - din4 = zero_ptr; - default: - break; - } - } - const float* din_ptr0 = din0; - const float* din_ptr1 = din1; - const float* din_ptr2 = din2; - const float* din_ptr3 = din3; - const float* din_ptr4 = din4; - - const float* weights_ptr = weights_c + 24; - float* dout_ptr0 = dout0; - - float bias_c = 0.f; - if (flag_bias) { - bias_c = bias[c]; - } - float vbias[4] = {bias_c, bias_c, bias_c, bias_c}; - int* mask_ptr = mask; - int loop = mid_cnt; - const int s_8 = 8; - const int s_16 = 16; - - asm volatile( - "vmov.i32 q15, #0x0 \n" - "pld [%[din_ptr0]] \n" - "pld [%[din_ptr1]] \n" - "pld [%[din_ptr2]] \n" - "pld [%[din_ptr3]] \n" - "pld [%[din_ptr4]] \n" - "pld [%[mask]] \n" - - // left - "vld2.32 {d16-d19}, [%[din_ptr0]]! \n" - "vld1.32 {d26-d29}, [%[vbias]] \n" - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - "vmov.32 q14, q15 \n" - - // r0 - "vmla.f32 q13, q8, %f[w0][0] \n" - "vmla.f32 q14, q9, %f[w0][1] \n" - - "vld1.32 {d21[1]}, [%[din_ptr0]] \n" - "vld2.32 {d16-d19}, [%[din_ptr1]]! \n" - "sub %[din_ptr0], #8 \n" - - "vmla.f32 q13, q6, %e[w0][0] \n" - "vmla.f32 q14, q7, %e[w0][1] \n" - "vmla.f32 q13, q10, %e[w1][0] \n" - - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - - // r1 - "vmla.f32 q13, q8, %f[w1][1] \n" - "vmla.f32 q14, q9, %e[w2][0] \n" - - "vld1.32 {d21[1]}, [%[din_ptr1]] \n" - "vld2.32 {d16-d19}, [%[din_ptr2]]! 
\n" - "sub %[din_ptr1], #8 \n" - - "vmla.f32 q13, q6, %e[w1][1] \n" - "vmla.f32 q14, q7, %f[w1][0] \n" - "vmla.f32 q13, q10, %e[w2][1] \n" - - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - - // r2 - "vmla.f32 q13, q8, %e[w3][0] \n" - "vmla.f32 q14, q9, %e[w3][1] \n" - - "vld1.32 {d21[1]}, [%[din_ptr2]] \n" - "vld2.32 {d16-d19}, [%[din_ptr3]]! \n" - "sub %[din_ptr2], #8 \n" - - "vmla.f32 q13, q6, %f[w2][0] \n" - "vmla.f32 q14, q7, %f[w2][1] \n" - "vmla.f32 q13, q10, %f[w3][0] \n" - - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - - // r3 - "vmla.f32 q13, q8, %e[w4][1] \n" - "vmla.f32 q14, q9, %f[w4][0] \n" - - "vld1.32 {d21[1]}, [%[din_ptr3]] \n" - "vld2.32 {d16-d19}, [%[din_ptr4]]! \n" - "sub %[din_ptr3], #8 \n" - - "vmla.f32 q13, q6, %f[w3][1] \n" - "vmla.f32 q14, q7, %e[w4][0] \n" - "vmla.f32 q13, q10, %f[w4][1] \n" - - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - - // r4 - "vmla.f32 q13, q6, %e[w5][0] \n" - "vmla.f32 q14, q7, %e[w5][1] \n" - - "vld1.32 {d21[1]}, [%[din_ptr4]] \n" - "vld2.32 {d12-d15}, [%[din_ptr0]], %[s_8] \n" - "sub %[din_ptr4], #8 \n" - - "vmla.f32 q13, q8, %f[w5][0] \n" - "vmla.f32 q14, q9, %f[w5][1] \n" - - "vld2.32 {d16-d19}, [%[din_ptr0]], %[s_8] \n" - - "vmov.32 q12, %q[w0] \n" - "vld1.32 {%e[w0][0]}, [%[weights]] \n" - "vmla.f32 q13, q10, %e[w0][0] \n" - "vadd.f32 q13, q13, q14 \n" - "vmov.32 %q[w0], q12 \n" - "cmp %[mid_cnt], #1 \n" - "vld2.32 {d20-d23}, [%[din_ptr0]], %[s_16] \n" - "vst1.32 {d26-d27}, [%[dout_ptr0]]! \n" - "pld [%[din_ptr0]] \n" - "blt 2f \n" - - // mid - "1: \n" - "vld1.32 {d26-d27}, [%[vbias]] \n" - "vmov.32 q14, q15 \n" - - // r0 - "vmla.f32 q13, q6, %e[w0][0] \n" - "vmla.f32 q14, q7, %e[w0][1] \n" - - "vld2.32 {d12-d15}, [%[din_ptr1]], %[s_8] \n" - - "vmla.f32 q13, q8, %f[w0][0] \n" - "vmla.f32 q14, q9, %f[w0][1] \n" - - "vld2.32 {d16-d19}, [%[din_ptr1]], %[s_8] \n" - - "vmla.f32 q13, q10, %e[w1][0] \n" - - "vld2.32 {d20-d23}, [%[din_ptr1]], %[s_16] \n" - - // r1 - "vmla.f32 q13, q6, %e[w1][1] \n" - "vmla.f32 q14, q7, %f[w1][0] \n" - "pld [%[din_ptr1]] \n" - - "vld2.32 {d12-d15}, [%[din_ptr2]], %[s_8] \n" - - "vmla.f32 q13, q8, %f[w1][1] \n" - "vmla.f32 q14, q9, %e[w2][0] \n" - - "vld2.32 {d16-d19}, [%[din_ptr2]], %[s_8] \n" - - "vmla.f32 q13, q10, %e[w2][1] \n" - - "vld2.32 {d20-d23}, [%[din_ptr2]], %[s_16] \n" - - // r2 - "vmla.f32 q13, q6, %f[w2][0] \n" - "vmla.f32 q14, q7, %f[w2][1] \n" - "pld [%[din_ptr2]] \n" - - "vld2.32 {d12-d15}, [%[din_ptr3]], %[s_8] \n" - - "vmla.f32 q13, q8, %e[w3][0] \n" - "vmla.f32 q14, q9, %e[w3][1] \n" - - "vld2.32 {d16-d19}, [%[din_ptr3]], %[s_8] \n" - - "vmla.f32 q13, q10, %f[w3][0] \n" - - "vld2.32 {d20-d23}, [%[din_ptr3]], %[s_16] \n" - - // r3 - "vmla.f32 q13, q6, %f[w3][1] \n" - "vmla.f32 q14, q7, %e[w4][0] \n" - "pld [%[din_ptr3]] \n" - - "vld2.32 {d12-d15}, [%[din_ptr4]], %[s_8] \n" - - "vmla.f32 q13, q8, %e[w4][1] \n" - "vmla.f32 q14, q9, %f[w4][0] \n" - - "vld2.32 {d16-d19}, [%[din_ptr4]], %[s_8] \n" - - "vmla.f32 q13, q10, %f[w4][1] \n" - - "vld2.32 {d20-d23}, [%[din_ptr4]], %[s_16] \n" - - // r4 - "vmla.f32 q13, q6, %e[w5][0] \n" - "vmla.f32 q14, q7, %e[w5][1] \n" - "pld [%[din_ptr4]] \n" - - "vld2.32 {d12-d15}, [%[din_ptr0]], %[s_8] \n" - "vld1.32 {%e[w0][0]}, [%[weights]] \n" - - "vmla.f32 q13, q8, %f[w5][0] \n" - "vmla.f32 q14, q9, %f[w5][1] \n" - - "vld2.32 {d16-d19}, [%[din_ptr0]], %[s_8] \n" - - "vmla.f32 q13, q10, %e[w0][0] \n" - - "vld2.32 
{d20-d23}, [%[din_ptr0]], %[s_16] \n" - - "vmov.32 %q[w0], q12 \n" - "vadd.f32 q13, q13, q14 \n" - "subs %[mid_cnt], #1 \n" - "vst1.32 {d26-d27}, [%[dout_ptr0]]! \n" - "bne 1b \n" - - "2: \n" - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vld1.32 {d26-d27}, [%[vbias]] \n" - "vmov.32 q14, q15 \n" - - // r0 - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q13, q6, %e[w0][0] \n" - "vmla.f32 q14, q7, %e[w0][1] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vld2.32 {d12-d15}, [%[din_ptr1]], %[s_8] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q13, q8, %f[w0][0] \n" - "vmla.f32 q14, q9, %f[w0][1] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "sub %[mask], #16 \n" - "vld2.32 {d16-d19}, [%[din_ptr1]], %[s_8] \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q13, q10, %e[w1][0] \n" - - // r1 - "vld2.32 {d20-d23}, [%[din_ptr1]] \n" - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q13, q6, %e[w1][1] \n" - "vmla.f32 q14, q7, %f[w1][0] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vld2.32 {d12-d15}, [%[din_ptr2]], %[s_8] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q13, q8, %f[w1][1] \n" - "vmla.f32 q14, q9, %e[w2][0] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "sub %[mask], #16 \n" - "vld2.32 {d16-d19}, [%[din_ptr2]], %[s_8] \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q13, q10, %e[w2][1] \n" - - // r2 - "vld2.32 {d20-d23}, [%[din_ptr2]] \n" - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q13, q6, %f[w2][0] \n" - "vmla.f32 q14, q7, %f[w2][1] \n" - - "vld2.32 {d12-d15}, [%[din_ptr3]], %[s_8] \n" - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q13, q8, %e[w3][0] \n" - "vmla.f32 q14, q9, %e[w3][1] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "sub %[mask], #16 \n" - "vld2.32 {d16-d19}, [%[din_ptr3]], %[s_8] \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q13, q10, %f[w3][0] \n" - - // r3 - "vld2.32 {d20-d23}, [%[din_ptr3]] \n" - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q13, q6, %f[w3][1] \n" - "vmla.f32 q14, q7, %e[w4][0] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vld2.32 {d12-d15}, [%[din_ptr4]], %[s_8] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q13, q8, %e[w4][1] \n" - "vmla.f32 q14, q9, %f[w4][0] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "sub %[mask], #16 \n" - "vld2.32 {d16-d19}, [%[din_ptr4]], %[s_8] \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q13, q10, %f[w4][1] \n" - - // r4 - "vld2.32 {d20-d23}, [%[din_ptr4]] \n" - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q13, q6, %e[w5][0] \n" - "vmla.f32 q14, q7, %e[w5][1] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vld1.32 {d12[0]}, [%[weights]] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q13, q8, %f[w5][0] \n" - "vmla.f32 q14, q9, %f[w5][1] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q13, q10, d12[0] \n" - - "vadd.f32 q13, q13, q14 \n" - "vst1.32 {d26-d27}, [%[out_buf0]] \n" - - : [dout_ptr0] "+r"(dout_ptr0), - [mid_cnt] "+r"(loop), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [mask] "+r"(mask_ptr), - [weights] "+r"(weights_ptr) - : [w0] "w"(w0), - 
[w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [w5] "w"(w5), - [vbias] "r"(vbias), - [out_buf0] "r"(out_buf0), - [s_8] "r"(s_8), - [s_16] "r"(s_16) - : "memory", - "cc", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - - int remain_cnt = w_out - (mid_cnt + 1) * 4; - for (int i = 0; i < remain_cnt; ++i) { - dout_ptr0[i] = out_buf0[i]; - } - - din0 = din2; - din1 = din3; - din2 = din4; - din3 = din2 + w_in; - din4 = din3 + w_in; - dout0 += w_out; - } - } - } -} - -//! larger depthwise, win >= 9; -void conv_depthwise_5x5s2p2_relu(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - // printf("invoke 5x5s2p2 armv7\n"); - CHECK_GE(w_in, 9) << "only support win >= 9"; - int w_out_round = (w_out + 3) / 4 * 4; - int cnt = (w_out_round - 4) / 4; - int mid_cnt = cnt - 1; - int right_start = cnt * 2 * 4 - 2; - int mask_cnt = 12 - (w_in - right_start); - int mask[12]; - memset(mask, 0xff, 12 * sizeof(int)); - for (int i = 0; i < mask_cnt; ++i) { - mask[11 - i] = 0; - } - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - int in_spatial_size = w_in * h_in; - int out_spatial_size = w_out * h_out; - int weights_saptial_size = 25; - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * in_spatial_size * ch_in; - float* dout_batch = dout + n * out_spatial_size * ch_out; -#pragma omp parallel for - for (int c = 0; c < ch_in; ++c) { - const float* din_ch = din_batch + c * in_spatial_size; - float* dout_ch = dout_batch + c * out_spatial_size; - const float* din0 = zero_ptr; - const float* din1 = zero_ptr; - const float* din2 = din_ch; - const float* din3 = din2 + w_in; - const float* din4 = din3 + w_in; - - float out_buf0[4]; - float* dout0 = dout_ch; - - const float* weights_c = weights + c * weights_saptial_size; - float32x4_t w0 = vld1q_f32(weights_c); - float32x4_t w1 = vld1q_f32(weights_c + 4); - float32x4_t w2 = vld1q_f32(weights_c + 8); - float32x4_t w3 = vld1q_f32(weights_c + 12); - float32x4_t w4 = vld1q_f32(weights_c + 16); - float32x4_t w5 = vld1q_f32(weights_c + 20); - for (int h = 0; h < h_out; h += 1) { - //! (h * 2 - 2) + 4 > h_in - 1 - if (h * 2 + 3 > h_in) { - switch (h * 2 + 3 - h_in) { - case 4: - din1 = zero_ptr; - case 3: - din2 = zero_ptr; - case 2: - din3 = zero_ptr; - case 1: - din4 = zero_ptr; - default: - break; - } - } - const float* din_ptr0 = din0; - const float* din_ptr1 = din1; - const float* din_ptr2 = din2; - const float* din_ptr3 = din3; - const float* din_ptr4 = din4; - - const float* weights_ptr = weights_c + 24; - float* dout_ptr0 = dout0; - - float bias_c = 0.f; - if (flag_bias) { - bias_c = bias[c]; - } - float vbias[4] = {bias_c, bias_c, bias_c, bias_c}; - int* mask_ptr = mask; - int loop = mid_cnt; - const int s_8 = 8; - const int s_16 = 16; - - asm volatile( - "vmov.i32 q15, #0x0 \n" - "pld [%[din_ptr0]] \n" - "pld [%[din_ptr1]] \n" - "pld [%[din_ptr2]] \n" - "pld [%[din_ptr3]] \n" - "pld [%[din_ptr4]] \n" - "pld [%[mask]] \n" - - // left - "vld2.32 {d16-d19}, [%[din_ptr0]]! \n" - "vld1.32 {d26-d29}, [%[vbias]] \n" - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - "vmov.32 q14, q15 \n" - - // r0 - "vmla.f32 q13, q8, %f[w0][0] \n" - "vmla.f32 q14, q9, %f[w0][1] \n" - - "vld1.32 {d21[1]}, [%[din_ptr0]] \n" - "vld2.32 {d16-d19}, [%[din_ptr1]]! 
\n" - "sub %[din_ptr0], #8 \n" - - "vmla.f32 q13, q6, %e[w0][0] \n" - "vmla.f32 q14, q7, %e[w0][1] \n" - "vmla.f32 q13, q10, %e[w1][0] \n" - - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - - // r1 - "vmla.f32 q13, q8, %f[w1][1] \n" - "vmla.f32 q14, q9, %e[w2][0] \n" - - "vld1.32 {d21[1]}, [%[din_ptr1]] \n" - "vld2.32 {d16-d19}, [%[din_ptr2]]! \n" - "sub %[din_ptr1], #8 \n" - - "vmla.f32 q13, q6, %e[w1][1] \n" - "vmla.f32 q14, q7, %f[w1][0] \n" - "vmla.f32 q13, q10, %e[w2][1] \n" - - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - - // r2 - "vmla.f32 q13, q8, %e[w3][0] \n" - "vmla.f32 q14, q9, %e[w3][1] \n" - - "vld1.32 {d21[1]}, [%[din_ptr2]] \n" - "vld2.32 {d16-d19}, [%[din_ptr3]]! \n" - "sub %[din_ptr2], #8 \n" - - "vmla.f32 q13, q6, %f[w2][0] \n" - "vmla.f32 q14, q7, %f[w2][1] \n" - "vmla.f32 q13, q10, %f[w3][0] \n" - - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - - // r3 - "vmla.f32 q13, q8, %e[w4][1] \n" - "vmla.f32 q14, q9, %f[w4][0] \n" - - "vld1.32 {d21[1]}, [%[din_ptr3]] \n" - "vld2.32 {d16-d19}, [%[din_ptr4]]! \n" - "sub %[din_ptr3], #8 \n" - - "vmla.f32 q13, q6, %f[w3][1] \n" - "vmla.f32 q14, q7, %e[w4][0] \n" - "vmla.f32 q13, q10, %f[w4][1] \n" - - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - - // r4 - "vmla.f32 q13, q6, %e[w5][0] \n" - "vmla.f32 q14, q7, %e[w5][1] \n" - - "vld1.32 {d21[1]}, [%[din_ptr4]] \n" - "vld2.32 {d12-d15}, [%[din_ptr0]], %[s_8] \n" - "sub %[din_ptr4], #8 \n" - - "vmla.f32 q13, q8, %f[w5][0] \n" - "vmla.f32 q14, q9, %f[w5][1] \n" - - "vld2.32 {d16-d19}, [%[din_ptr0]], %[s_8] \n" - - "vmov.32 q12, %q[w0] \n" - "vld1.32 {%e[w0][0]}, [%[weights]] \n" - "vmla.f32 q13, q10, %e[w0][0] \n" - "vadd.f32 q13, q13, q14 \n" - "vmov.f32 %q[w0], q12 \n" - "vmax.f32 q13, q13, q15 \n" - "cmp %[mid_cnt], #1 \n" - "vld2.32 {d20-d23}, [%[din_ptr0]], %[s_16] \n" - "vst1.32 {d26-d27}, [%[dout_ptr0]]! 
\n" - "pld [%[din_ptr0]] \n" - "blt 2f \n" - - // mid - "1: \n" - "vld1.32 {d26-d27}, [%[vbias]] \n" - "vmov.32 q14, q15 \n" - - // r0 - "vmla.f32 q13, q6, %e[w0][0] \n" - "vmla.f32 q14, q7, %e[w0][1] \n" - - "vld2.32 {d12-d15}, [%[din_ptr1]], %[s_8] \n" - - "vmla.f32 q13, q8, %f[w0][0] \n" - "vmla.f32 q14, q9, %f[w0][1] \n" - - "vld2.32 {d16-d19}, [%[din_ptr1]], %[s_8] \n" - - "vmla.f32 q13, q10, %e[w1][0] \n" - - "vld2.32 {d20-d23}, [%[din_ptr1]], %[s_16] \n" - - // r1 - "vmla.f32 q13, q6, %e[w1][1] \n" - "vmla.f32 q14, q7, %f[w1][0] \n" - "pld [%[din_ptr1]] \n" - - "vld2.32 {d12-d15}, [%[din_ptr2]], %[s_8] \n" - - "vmla.f32 q13, q8, %f[w1][1] \n" - "vmla.f32 q14, q9, %e[w2][0] \n" - - "vld2.32 {d16-d19}, [%[din_ptr2]], %[s_8] \n" - - "vmla.f32 q13, q10, %e[w2][1] \n" - - "vld2.32 {d20-d23}, [%[din_ptr2]], %[s_16] \n" - - // r2 - "vmla.f32 q13, q6, %f[w2][0] \n" - "vmla.f32 q14, q7, %f[w2][1] \n" - "pld [%[din_ptr2]] \n" - - "vld2.32 {d12-d15}, [%[din_ptr3]], %[s_8] \n" - - "vmla.f32 q13, q8, %e[w3][0] \n" - "vmla.f32 q14, q9, %e[w3][1] \n" - - "vld2.32 {d16-d19}, [%[din_ptr3]], %[s_8] \n" - - "vmla.f32 q13, q10, %f[w3][0] \n" - - "vld2.32 {d20-d23}, [%[din_ptr3]], %[s_16] \n" - - // r3 - "vmla.f32 q13, q6, %f[w3][1] \n" - "vmla.f32 q14, q7, %e[w4][0] \n" - "pld [%[din_ptr3]] \n" - - "vld2.32 {d12-d15}, [%[din_ptr4]], %[s_8] \n" - - "vmla.f32 q13, q8, %e[w4][1] \n" - "vmla.f32 q14, q9, %f[w4][0] \n" - - "vld2.32 {d16-d19}, [%[din_ptr4]], %[s_8] \n" - - "vmla.f32 q13, q10, %f[w4][1] \n" - - "vld2.32 {d20-d23}, [%[din_ptr4]], %[s_16] \n" - - // r4 - "vmla.f32 q13, q6, %e[w5][0] \n" - "vmla.f32 q14, q7, %e[w5][1] \n" - "pld [%[din_ptr4]] \n" - - "vld2.32 {d12-d15}, [%[din_ptr0]], %[s_8] \n" - "vld1.32 {%e[w0][0]}, [%[weights]] \n" - - "vmla.f32 q13, q8, %f[w5][0] \n" - "vmla.f32 q14, q9, %f[w5][1] \n" - - "vld2.32 {d16-d19}, [%[din_ptr0]], %[s_8] \n" - - "vmla.f32 q13, q10, %e[w0][0] \n" - - "vld2.32 {d20-d23}, [%[din_ptr0]], %[s_16] \n" - - "vmov.32 %q[w0], q12 \n" - "vadd.f32 q13, q13, q14 \n" - "vmax.f32 q13, q13, q15 \n" - "subs %[mid_cnt], #1 \n" - "vst1.32 {d26-d27}, [%[dout_ptr0]]! 
\n" - "bne 1b \n" - - "2: \n" - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vld1.32 {d26-d27}, [%[vbias]] \n" - "vmov.32 q14, q15 \n" - - // r0 - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q13, q6, %e[w0][0] \n" - "vmla.f32 q14, q7, %e[w0][1] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vld2.32 {d12-d15}, [%[din_ptr1]], %[s_8] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q13, q8, %f[w0][0] \n" - "vmla.f32 q14, q9, %f[w0][1] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "sub %[mask], #16 \n" - "vld2.32 {d16-d19}, [%[din_ptr1]], %[s_8] \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q13, q10, %e[w1][0] \n" - - // r1 - "vld2.32 {d20-d23}, [%[din_ptr1]] \n" - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q13, q6, %e[w1][1] \n" - "vmla.f32 q14, q7, %f[w1][0] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vld2.32 {d12-d15}, [%[din_ptr2]], %[s_8] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q13, q8, %f[w1][1] \n" - "vmla.f32 q14, q9, %e[w2][0] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "sub %[mask], #16 \n" - "vld2.32 {d16-d19}, [%[din_ptr2]], %[s_8] \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q13, q10, %e[w2][1] \n" - - // r2 - "vld2.32 {d20-d23}, [%[din_ptr2]] \n" - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q13, q6, %f[w2][0] \n" - "vmla.f32 q14, q7, %f[w2][1] \n" - - "vld2.32 {d12-d15}, [%[din_ptr3]], %[s_8] \n" - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q13, q8, %e[w3][0] \n" - "vmla.f32 q14, q9, %e[w3][1] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "sub %[mask], #16 \n" - "vld2.32 {d16-d19}, [%[din_ptr3]], %[s_8] \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q13, q10, %f[w3][0] \n" - - // r3 - "vld2.32 {d20-d23}, [%[din_ptr3]] \n" - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q13, q6, %f[w3][1] \n" - "vmla.f32 q14, q7, %e[w4][0] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vld2.32 {d12-d15}, [%[din_ptr4]], %[s_8] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q13, q8, %e[w4][1] \n" - "vmla.f32 q14, q9, %f[w4][0] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "sub %[mask], #16 \n" - "vld2.32 {d16-d19}, [%[din_ptr4]], %[s_8] \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q13, q10, %f[w4][1] \n" - - // r4 - "vld2.32 {d20-d23}, [%[din_ptr4]] \n" - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q13, q6, %e[w5][0] \n" - "vmla.f32 q14, q7, %e[w5][1] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vld1.32 {d12[0]}, [%[weights]] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q13, q8, %f[w5][0] \n" - "vmla.f32 q14, q9, %f[w5][1] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q13, q10, d12[0] \n" - - "vadd.f32 q13, q13, q14 \n" - "vmax.f32 q13, q13, q15 \n" - "vst1.32 {d26-d27}, [%[out_buf0]] \n" - - : [dout_ptr0] "+r"(dout_ptr0), - [mid_cnt] "+r"(loop), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [mask] "+r"(mask_ptr), - [weights] "+r"(weights_ptr) - : [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [w5] "w"(w5), - [vbias] "r"(vbias), - [out_buf0] "r"(out_buf0), - 
[s_8] "r"(s_8), - [s_16] "r"(s_16) - : "memory", - "cc", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - - int remain_cnt = w_out - (mid_cnt + 1) * 4; - for (int i = 0; i < remain_cnt; ++i) { - dout_ptr0[i] = out_buf0[i]; - } - - din0 = din2; - din1 = din3; - din2 = din4; - din3 = din2 + w_in; - din4 = din3 + w_in; - dout0 += w_out; - } - } - } -} - -//! small depthwise, win < 9; -void conv_depthwise_5x5s2p2_s(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - CHECK_LT(w_in, 9) << "only support win < 9"; - int w_out_round = (w_out + 3) / 4 * 4; - int mask_cnt = 12 - w_in - 2; - int mask[12]; - memset(mask, 0xff, 12 * sizeof(int)); - for (int i = 0; i < mask_cnt; ++i) { - mask[11 - i] = 0; - } - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - int in_spatial_size = w_in * h_in; - int out_spatial_size = w_out * h_out; - int weights_saptial_size = 25; - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * in_spatial_size * ch_in; - float* dout_batch = dout + n * out_spatial_size * ch_out; -#pragma omp parallel for - for (int c = 0; c < ch_in; ++c) { - const float* din_ch = din_batch + c * in_spatial_size; - float* dout_ch = dout_batch + c * out_spatial_size; - const float* din0 = zero_ptr; - const float* din1 = zero_ptr; - const float* din2 = din_ch; - const float* din3 = din2 + w_in; - const float* din4 = din3 + w_in; - - float out_buf0[4]; - float out_buf1[4]; - float* dout0 = dout_ch; - float* dout1 = dout0 + w_out; - - const float* weights_c = weights + c * weights_saptial_size; - float32x4_t w0 = vld1q_f32(weights_c); - float32x4_t w1 = vld1q_f32(weights_c + 4); - float32x4_t w2 = vld1q_f32(weights_c + 8); - float32x4_t w3 = vld1q_f32(weights_c + 12); - float32x4_t w4 = vld1q_f32(weights_c + 16); - float32x4_t w5 = vld1q_f32(weights_c + 20); - for (int h = 0; h < h_out; h += 1) { - //! (h * 2 - 2) + 4 > h_in - 1 - if (h * 2 + 3 > h_in) { - switch (h * 2 + 3 - h_in) { - case 4: - din1 = zero_ptr; - case 3: - din2 = zero_ptr; - case 2: - din3 = zero_ptr; - case 1: - din4 = zero_ptr; - default: - break; - } - } - const float* din_ptr0 = din0; - const float* din_ptr1 = din1; - const float* din_ptr2 = din2; - const float* din_ptr3 = din3; - const float* din_ptr4 = din4; - - const float* weights_ptr = weights_c + 24; - float* dout_ptr0 = dout0; - - float bias_c = 0.f; - if (flag_bias) { - bias_c = bias[c]; - } - float vbias[4] = {bias_c, bias_c, bias_c, bias_c}; - int* mask_ptr = mask; - const int s_8 = 8; - - asm volatile( - "vmov.i32 q15, #0x0 \n" - "pld [%[din_ptr0]] \n" - "pld [%[din_ptr1]] \n" - "pld [%[din_ptr2]] \n" - "pld [%[din_ptr3]] \n" - "pld [%[din_ptr4]] \n" - "vld1.32 {d26-d27}, [%[vbias]] \n" - "vmov.32 q14, q15 \n" - "vld2.32 {d16-d19}, [%[din_ptr0]]! \n" - - // r0 - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - "vld1.32 {d21[1]}, [%[din_ptr0]] \n" - - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q13, q6, %e[w0][0] \n" - "vmla.f32 q14, q7, %e[w0][1] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q13, q8, %f[w0][0] \n" - "vmla.f32 q14, q9, %f[w0][1] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "vld2.32 {d16-d19}, [%[din_ptr1]]! 
\n" - "sub %[mask], #16 \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q13, q10, %e[w1][0] \n" - - // r1 - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - "vld1.32 {d21[1]}, [%[din_ptr1]] \n" - - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q14, q6, %e[w1][1] \n" - "vmla.f32 q13, q7, %f[w1][0] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q14, q8, %f[w1][1] \n" - "vmla.f32 q13, q9, %e[w2][0] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "vld2.32 {d16-d19}, [%[din_ptr2]]! \n" - "sub %[mask], #16 \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q14, q10, %e[w2][1] \n" - - // r2 - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - "vld1.32 {d21[1]}, [%[din_ptr2]] \n" - - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q13, q6, %f[w2][0] \n" - "vmla.f32 q14, q7, %f[w2][1] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q13, q8, %e[w3][0] \n" - "vmla.f32 q14, q9, %e[w3][1] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "vld2.32 {d16-d19}, [%[din_ptr3]]! \n" - "sub %[mask], #16 \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q13, q10, %f[w3][0] \n" - - // r3 - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - "vld1.32 {d21[1]}, [%[din_ptr3]] \n" - - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q14, q6, %f[w3][1] \n" - "vmla.f32 q13, q7, %e[w4][0] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q14, q8, %e[w4][1] \n" - "vmla.f32 q13, q9, %f[w4][0] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "vld2.32 {d16-d19}, [%[din_ptr4]]! \n" - "sub %[mask], #16 \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q14, q10, %f[w4][1] \n" - - // r4 - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - "vld1.32 {d21[1]}, [%[din_ptr4]] \n" - - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q13, q6, %e[w5][0] \n" - "vmla.f32 q14, q7, %e[w5][1] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vld1.32 {d12[0]}, [%[weights]] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q13, q8, %f[w5][0] \n" - "vmla.f32 q14, q9, %f[w5][1] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q13, q10, d12[0] \n" - - "vadd.f32 q13, q13, q14 \n" - "vst1.32 {d26-d27}, [%[out_buf0]] \n" - - : [dout_ptr0] "+r"(dout_ptr0), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [mask] "+r"(mask_ptr), - [weights] "+r"(weights_ptr) - : [vbias] "r"(vbias), - [out_buf0] "r"(out_buf0), - [s_8] "r"(s_8), - [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [w5] "w"(w5) - : "memory", - "cc", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - for (int i = 0; i < w_out; ++i) { - dout_ptr0[i] = out_buf0[i]; - } - din0 = din2; - din1 = din3; - din2 = din4; - din3 = din2 + w_in; - din4 = din3 + w_in; - dout0 += w_out; - } - } - } -} - -//! 
small depthwise, win < 9; -void conv_depthwise_5x5s2p2_relu_s(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - CHECK_LT(w_in, 9) << "only support win < 9\n"; - int w_out_round = (w_out + 3) / 4 * 4; - int mask_cnt = 12 - w_in - 2; - int mask[12]; - memset(mask, 0xff, 12 * sizeof(int)); - for (int i = 0; i < mask_cnt; ++i) { - mask[11 - i] = 0; - } - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - int in_spatial_size = w_in * h_in; - int out_spatial_size = w_out * h_out; - int weights_saptial_size = 25; - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * in_spatial_size * ch_in; - float* dout_batch = dout + n * out_spatial_size * ch_out; -#pragma omp parallel for - for (int c = 0; c < ch_in; ++c) { - const float* din_ch = din_batch + c * in_spatial_size; - float* dout_ch = dout_batch + c * out_spatial_size; - const float* din0 = zero_ptr; - const float* din1 = zero_ptr; - const float* din2 = din_ch; - const float* din3 = din2 + w_in; - const float* din4 = din3 + w_in; - - float out_buf0[4]; - float out_buf1[4]; - float* dout0 = dout_ch; - float* dout1 = dout0 + w_out; - - const float* weights_c = weights + c * weights_saptial_size; - float32x4_t w0 = vld1q_f32(weights_c); - float32x4_t w1 = vld1q_f32(weights_c + 4); - float32x4_t w2 = vld1q_f32(weights_c + 8); - float32x4_t w3 = vld1q_f32(weights_c + 12); - float32x4_t w4 = vld1q_f32(weights_c + 16); - float32x4_t w5 = vld1q_f32(weights_c + 20); - for (int h = 0; h < h_out; h += 1) { - //! (h * 2 - 2) + 4 > h_in - 1 - if (h * 2 + 3 > h_in) { - switch (h * 2 + 3 - h_in) { - case 4: - din1 = zero_ptr; - case 3: - din2 = zero_ptr; - case 2: - din3 = zero_ptr; - case 1: - din4 = zero_ptr; - default: - break; - } - } - const float* din_ptr0 = din0; - const float* din_ptr1 = din1; - const float* din_ptr2 = din2; - const float* din_ptr3 = din3; - const float* din_ptr4 = din4; - - const float* weights_ptr = weights_c + 24; - float* dout_ptr0 = dout0; - - float bias_c = 0.f; - if (flag_bias) { - bias_c = bias[c]; - } - float vbias[4] = {bias_c, bias_c, bias_c, bias_c}; - int* mask_ptr = mask; - const int s_8 = 8; - - asm volatile( - "vmov.i32 q15, #0x0 \n" - "pld [%[din_ptr0]] \n" - "pld [%[din_ptr1]] \n" - "pld [%[din_ptr2]] \n" - "pld [%[din_ptr3]] \n" - "pld [%[din_ptr4]] \n" - "vld1.32 {d26-d27}, [%[vbias]] \n" - "vmov.32 q14, q15 \n" - "vld2.32 {d16-d19}, [%[din_ptr0]]! \n" - - // r0 - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - "vld1.32 {d21[1]}, [%[din_ptr0]] \n" - - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q13, q6, %e[w0][0] \n" - "vmla.f32 q14, q7, %e[w0][1] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q13, q8, %f[w0][0] \n" - "vmla.f32 q14, q9, %f[w0][1] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "vld2.32 {d16-d19}, [%[din_ptr1]]! 
\n" - "sub %[mask], #16 \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q13, q10, %e[w1][0] \n" - - // r1 - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - "vld1.32 {d21[1]}, [%[din_ptr1]] \n" - - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q14, q6, %e[w1][1] \n" - "vmla.f32 q13, q7, %f[w1][0] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q14, q8, %f[w1][1] \n" - "vmla.f32 q13, q9, %e[w2][0] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "vld2.32 {d16-d19}, [%[din_ptr2]]! \n" - "sub %[mask], #16 \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q14, q10, %e[w2][1] \n" - - // r2 - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - "vld1.32 {d21[1]}, [%[din_ptr2]] \n" - - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q13, q6, %f[w2][0] \n" - "vmla.f32 q14, q7, %f[w2][1] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q13, q8, %e[w3][0] \n" - "vmla.f32 q14, q9, %e[w3][1] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "vld2.32 {d16-d19}, [%[din_ptr3]]! \n" - "sub %[mask], #16 \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q13, q10, %f[w3][0] \n" - - // r3 - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - "vld1.32 {d21[1]}, [%[din_ptr3]] \n" - - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q14, q6, %f[w3][1] \n" - "vmla.f32 q13, q7, %e[w4][0] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q14, q8, %e[w4][1] \n" - "vmla.f32 q13, q9, %f[w4][0] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "vld2.32 {d16-d19}, [%[din_ptr4]]! 
\n" - "sub %[mask], #16 \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q14, q10, %f[w4][1] \n" - - // r4 - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vext.32 q6, q15, q8, #3 \n" - "vext.32 q7, q15, q9, #3 \n" - "vext.32 q10, q8, q15, #1 \n" - "vld1.32 {d21[1]}, [%[din_ptr4]] \n" - - "vbif.32 q6, q15, q11 \n" - "vbif.32 q7, q15, q12 \n" - "vmla.f32 q13, q6, %e[w5][0] \n" - "vmla.f32 q14, q7, %e[w5][1] \n" - - "vld2.32 {d22-d25}, [%[mask]], %[s_8] \n" - "vld1.32 {d12[0]}, [%[weights]] \n" - "vbif.32 q8, q15, q11 \n" - "vbif.32 q9, q15, q12 \n" - "vmla.f32 q13, q8, %f[w5][0] \n" - "vmla.f32 q14, q9, %f[w5][1] \n" - - "vld2.32 {d22-d25}, [%[mask]] \n" - "vbif.32 q10, q15, q11 \n" - "vmla.f32 q13, q10, d12[0] \n" - - "vadd.f32 q13, q13, q14 \n" - "vmax.f32 q13, q13, q15 \n" - "vst1.32 {d26-d27}, [%[out_buf0]] \n" - - : [dout_ptr0] "+r"(dout_ptr0), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [mask] "+r"(mask_ptr), - [weights] "+r"(weights_ptr) - : [vbias] "r"(vbias), - [out_buf0] "r"(out_buf0), - [s_8] "r"(s_8), - [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [w5] "w"(w5) - : "memory", - "cc", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - for (int i = 0; i < w_out; ++i) { - dout_ptr0[i] = out_buf0[i]; - } - din0 = din2; - din1 = din3; - din2 = din4; - din3 = din2 + w_in; - din4 = din3 + w_in; - dout0 += w_out; - } - } - } -} -#endif // __aarch64__ - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_direct.cc b/lite/backends/arm/math/conv_direct.cc deleted file mode 100644 index 51526aa2b3..0000000000 --- a/lite/backends/arm/math/conv_direct.cc +++ /dev/null @@ -1,242 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/backends/arm/math/conv_direct.h" -#include "lite/backends/arm/math/conv_block_utils.h" -#include "lite/backends/arm/math/conv_impl.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template <> -bool DirectConv::create(const operators::ConvParam& param, - ARMContext* ctx) { - this->ctx_ = ctx; - auto x_dims = param.x->dims(); - auto w_dims = param.filter->dims(); - auto o_dims = param.output->dims(); - - int iw = x_dims[3]; // nchw - int ic = x_dims[1]; - int ow = o_dims[3]; - int oc = o_dims[1]; - int kw = w_dims[3]; - int sw = param.strides[1]; - // select dw conv kernel - const auto* w_data = param.filter->data(); - if (kw == 3 && sw == 1) { - VLOG(5) << "invoke 3x3s1 direct conv"; - impl_ = conv_3x3s1_direct_fp32; - - constexpr int cblock = 4; - int cround = (oc + cblock - 1) / cblock * cblock; - weights_trans_.Resize({cround, ic, kw, kw}); - float* transed_w_data = weights_trans_.mutable_data(); - - conv_trans_weights_numc(w_data, transed_w_data, oc, ic, cblock, kw * kw); - is_weights_transed_ = true; - } else if (kw == 3 && sw == 2) { - VLOG(5) << "invoke 3x3s2 direct conv"; - impl_ = conv_3x3s2_direct_fp32; - - constexpr int cblock = 4; - int cround = (oc + cblock - 1) / cblock * cblock; - weights_trans_.Resize({cround, ic, kw, kw}); - float* transed_w_data = weights_trans_.mutable_data(); - conv_trans_weights_numc(w_data, transed_w_data, oc, ic, cblock, kw * kw); - is_weights_transed_ = true; - } else { - LOG(ERROR) << "this type direct conv not impl"; - return false; - } - return true; -} - -template <> -bool DirectConv::init(const operators::ConvParam& param, - Context* ctx) { - this->ctx_ = ctx; - return create(param, ctx); -} - -template <> -bool DirectConv::run(const operators::ConvParam& param) { - // start timer - const auto* i_data = param.x->data(); - const auto* w_data = param.filter->data(); - const auto* b_data = param.bias ? param.bias->data() : nullptr; - auto* o_data = param.output->mutable_data(); - - if (is_weights_transed_ == true) { - w_data = weights_trans_.data(); - } - auto x_dims = param.x->dims(); - auto w_dims = param.filter->dims(); - auto o_dims = param.output->dims(); - - int iw = x_dims[3]; // nchw - int ih = x_dims[2]; - int ic = x_dims[1]; - int bs = x_dims[0]; - int oh = o_dims[2]; - int ow = o_dims[3]; - int oc = o_dims[1]; - - impl_(i_data, - o_data, - bs, - oc, - oh, - ow, - ic, - ih, - iw, - w_data, - b_data, - param, - this->ctx_); - - // timer end - return true; -} - -template -bool DirectConvInt8::create(const operators::ConvParam& param, - ARMContext* ctx) { - this->ctx_ = ctx; - auto x_dims = param.x->dims(); - auto w_dims = param.filter->dims(); - auto o_dims = param.output->dims(); - - int iw = x_dims[3]; // nchw - int ic = x_dims[1]; - int ow = o_dims[3]; - int oc = o_dims[1]; - int kw = w_dims[3]; - int sw = param.strides[1]; - // select dw conv kernel - w_scale_ = param.weight_scale; - //! 
- const auto* w_data = param.filter->data<int8_t>(); - if (Ptype_out == PRECISION(kInt8) || Ptype_out == PRECISION(kFloat)) { - CHECK_EQ(this->w_scale_.size(), oc) << "weights scale size must equal chout"; - float input_scale = param.input_scale; - for (auto& w_s : w_scale_) { - w_s *= input_scale; - if (Ptype_out == PRECISION(kInt8)) { - w_s /= param.output_scale; - } - } - } - if (kw == 3 && sw == 1) { - VLOG(5) << "invoke 3x3s1 direct conv"; - impl_int8_ = conv_3x3s1_direct_int8; - - constexpr int cblock = 4; - int inpad = 4; - int cround = (oc + cblock - 1) / cblock * cblock; - weights_trans_.Resize({cround, ic, kw, kw}); - int8_t* transed_w_data = weights_trans_.mutable_data<int8_t>(); - conv_trans_weights_numc(w_data, transed_w_data, oc, ic, cblock, kw * kw); - - int wout_round = ((ow + 3) / 4) * 4; - int win_round = wout_round * sw + inpad; - int row_out = 2; - int row_in = 4; - int tmp_size_out = wout_round * row_out * cblock; - int in_len = win_round * ic; - int tmp_size_in = row_in * in_len; - ctx_->ExtendWorkspace(ctx_->threads() * tmp_size_out + - (tmp_size_in + 3) / 4 * 4 + wout_round + win_round); - is_weights_transed_ = true; - - } else if (kw == 3 && sw == 2) { - VLOG(5) << "invoke 3x3s2 direct conv"; - impl_int8_ = conv_3x3s2_direct_int8; - - // cblock is queried from the s2 int8 kernel instead of fixed at 4 - int cblock = conv_3x3s2_direct_int8_c_num(); - int cround = (oc + cblock - 1) / cblock * cblock; - weights_trans_.Resize({cround, ic, kw, kw}); - int8_t* transed_w_data = weights_trans_.mutable_data<int8_t>(); - conv_trans_weights_numc(w_data, transed_w_data, oc, ic, cblock, kw * kw); - is_weights_transed_ = true; - - } else { - LOG(ERROR) << "this type of direct conv is not implemented"; - return false; - } - return true; -} - -template <PrecisionType Ptype_out> -bool DirectConvInt8<Ptype_out>::init(const operators::ConvParam& param, - Context* ctx) { - this->ctx_ = ctx; - return create(param, ctx); -} - -template <PrecisionType Ptype_out> -bool DirectConvInt8<Ptype_out>::run(const operators::ConvParam& param) { - const auto* i_data = param.x->data<int8_t>(); - const auto* w_data = param.filter->data<int8_t>(); - const auto* b_data = param.bias ? param.bias->data<int32_t>() : nullptr; - auto* o_data = param.output->mutable_data<int32_t>(); - if (is_weights_transed_ == true) { - w_data = weights_trans_.data<int8_t>(); - } - auto x_dims = param.x->dims(); - auto w_dims = param.filter->dims(); - auto o_dims = param.output->dims(); - - int iw = x_dims[3]; // nchw - int ih = x_dims[2]; - int ic = x_dims[1]; - int bs = x_dims[0]; - int oh = o_dims[2]; - int ow = o_dims[3]; - int oc = o_dims[1]; - - impl_int8_(i_data, - o_data, - bs, - oc, - oh, - ow, - ic, - ih, - iw, - w_data, - b_data, - param, - this->ctx_, - Ptype_out, - w_scale_.data()); - - // The output is materialized as int32 above for debug convenience; - // reinterpret it as int8 for the final output when required. - if (Ptype_out == PRECISION(kInt8)) param.output->mutable_data<int8_t>(); - return true; -} - -template class DirectConvInt8<PRECISION(kInt8)>; -template class DirectConvInt8<PRECISION(kFloat)>; -template class DirectConvInt8<PRECISION(kInt32)>; - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_direct.h b/lite/backends/arm/math/conv_direct.h deleted file mode 100644 index e6132dca5e..0000000000 --- a/lite/backends/arm/math/conv_direct.h +++ /dev/null @@ -1,107 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include "lite/backends/arm/math/conv_impl.h" -#include "lite/core/context.h" -#include "lite/core/target_wrapper.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template -class DirectConv : public ImplBase { - public: - typedef void (*conv_direct_impl)(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const float* weights, - const float* bias, - const operators::ConvParam& param, - Context* ctx); - - DirectConv() = default; - ~DirectConv() {} - - virtual bool init(const operators::ConvParam& param, - Context* ctx); - - virtual bool create(const operators::ConvParam& param, - Context* ctx); - - virtual bool run(const operators::ConvParam& param); - - protected: - bool is_weights_transed_{false}; - Tensor weights_trans_; - Tensor _tmp_out; - - private: - conv_direct_impl impl_{nullptr}; -}; - -template -class DirectConvInt8 - : public ImplBase { - public: - typedef void (*conv_direct_int8_impl)(const int8_t* din, - int32_t* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const int8_t* weights, - const int32_t* bias, - const operators::ConvParam& param, - Context* ctx, - PrecisionType out_type, - const float* scale); - - DirectConvInt8() = default; - ~DirectConvInt8() {} - - virtual bool init(const operators::ConvParam& param, - Context* ctx); - - virtual bool create(const operators::ConvParam& param, - Context* ctx); - - virtual bool run(const operators::ConvParam& param); - - private: - bool is_weights_transed_{false}; - Tensor weights_trans_; - Tensor _tmp_out; - conv_direct_int8_impl impl_int8_{nullptr}; - std::vector w_scale_; -}; - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_direct_3x3s1.cc b/lite/backends/arm/math/conv_direct_3x3s1.cc deleted file mode 100644 index 6991481ee1..0000000000 --- a/lite/backends/arm/math/conv_direct_3x3s1.cc +++ /dev/null @@ -1,1067 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
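conv_direct_3x3s1.cc, deleted below, sizes its output-row tile so that the packed input block plus one packed output block per thread stays within the L2 budget. A hedged restatement of that sizing rule as a standalone helper (the name pick_hout_r_block is ours; the kernel inlines this arithmetic):

```cpp
// Sketch of the row-tiling rule used by conv_3x3s1_direct_fp32 below:
// pick the largest hout_r_block (a multiple of the 2-row asm pass)
// such that
//   (hout_r_block + 2) * win_round * ic                  // packed input
// + hout_r_block * wout_round * hout_c_block * threads   // packed output
// <= l2_size (counted in floats).
static int pick_hout_r_block(int l2_size, int ic, int oh, int win_round,
                             int wout_round, int threads) {
  const int hout_c_block = 4;   // output channels packed per block
  const int hout_r_kernel = 2;  // output rows produced per asm pass
  int hr = (l2_size - 2 * win_round * ic) /
           (win_round * ic + hout_c_block * wout_round * threads);
  if (hr > oh) hr = oh;                        // no more rows than exist
  hr = (hr / hout_r_kernel) * hout_r_kernel;   // multiple of kernel rows
  if (hr < hout_r_kernel) hr = hout_r_kernel;  // at least one full pass
  return hr;
}
```

The denominator counts one packed input row plus the per-thread packed output it produces, so the division directly yields how many output rows fit; the two extra input rows in the numerator are the 3x3 halo.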
- -#include -#include "lite/backends/arm/math/conv_block_utils.h" -#include "lite/backends/arm/math/conv_impl.h" -#include "lite/core/context.h" -#include "lite/operators/op_params.h" -#ifdef ARM_WITH_OMP -#include -#endif - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void conv_3x3s1_direct_fp32(const float* i_data, - float* o_data, - int bs, - int oc, - int oh, - int ow, - int ic, - int ih, - int win, - const float* weights, - const float* bias, - const operators::ConvParam& param, - ARMContext* ctx) { - const int threads = ctx->threads(); - int l2_size = ctx->llc_size() / sizeof(float); - - const int pad_h = param.paddings[0]; - const int pad_w = param.paddings[1]; - const int hout_c_block = 4; - const int hout_r_kernel = 2; - const int wout_block = 4; - const int wout_round = ((ow + wout_block - 1) / wout_block) * wout_block; - const int win_round = wout_round + 2; - bool flag_relu = param.fuse_relu; - bool flag_bias = param.bias != nullptr; - // if (param.activation_param.has_active) { - // if (param.activation_param.active == Active_relu && - // fabs(param.activation_param.negative_slope) < 1e-6f) { - // flag_relu = true; - // } - // } - int hout_r_block = (l2_size - 2 * win_round * ic) / - (win_round * ic + hout_c_block * wout_round * threads); - hout_r_block = hout_r_block > oh ? oh : hout_r_block; - hout_r_block = (hout_r_block / hout_r_kernel) * hout_r_kernel; - hout_r_block = hout_r_block < hout_r_kernel ? hout_r_kernel : hout_r_block; - - const int hin_r_block = hout_r_block + 2; - - float* tmp_work_space = ctx->workspace_data(); - float ptr_zero[win_round]; // NOLINT - memset(ptr_zero, 0, sizeof(float) * win_round); - float ptr_write[wout_round]; // NOLINT - - int in_len = win_round * ic; - int pre_in_size = hin_r_block * in_len; - int pre_out_size = hout_c_block * hout_r_block * wout_round; - - float* pre_din = tmp_work_space; - - int size_in_channel = win * ih; - int size_out_channel = ow * oh; - int w_stride = ic * 9; // kernel_w * kernel_h; - int w_stride_chin = hout_c_block * 9; // kernel_w * kernel_h * - - int ws = -pad_w; - int we = ws + win_round; - int w_loop = wout_round / 4; - - int c_remain = oc - (oc / hout_c_block) * hout_c_block; - int c_round_down = (oc / hout_c_block) * hout_c_block; - - int out_row_stride = hout_c_block * wout_round; - for (int n = 0; n < bs; ++n) { - const float* din_batch = i_data + n * ic * size_in_channel; - float* dout_batch = o_data + n * oc * size_out_channel; - for (int h = 0; h < oh; h += hout_r_block) { - int h_kernel = hout_r_block; - if (h + hout_r_block > oh) { - h_kernel = oh - h; - } - int hs = h - pad_h; - int he = hs + h_kernel + 2; - prepack_input_nxw( - din_batch, pre_din, 0, ic, hs, he, ws, we, ic, win, ih, ptr_zero); -#pragma omp parallel for num_threads(threads) - for (int c = 0; c < oc - (hout_c_block - 1); c += hout_c_block) { -#ifdef ARM_WITH_OMP - float* pre_out = - pre_din + pre_in_size + omp_get_thread_num() * pre_out_size; -#else - float* pre_out = pre_din + pre_in_size; -#endif - const float* block_inr0 = pre_din; - const float* block_inr1 = block_inr0 + in_len; - const float* block_inr2 = block_inr1 + in_len; - const float* block_inr3 = block_inr2 + in_len; - - const float* weight_c = weights + c * w_stride; - const float* bias_ptr = ptr_zero; - if (flag_bias) { - bias_ptr = bias + c; - } - fill_packed_biasc4( - pre_out, bias_ptr, wout_round * hout_c_block * h_kernel); - - for (int hk = 0; hk < h_kernel; hk += hout_r_kernel) { - const float* wc0 = weight_c; - - const float* inr0 = 
block_inr0; - const float* inr1 = block_inr1; - const float* inr2 = block_inr2; - const float* inr3 = block_inr3; - - float* pre_out0 = pre_out + hk * out_row_stride; - float* pre_out1 = pre_out0 + out_row_stride; -#ifdef __aarch64__ - for (int i = 0; i < ic; ++i) { - float* ptr_out0 = pre_out0; - float* ptr_out1 = pre_out1; - - float32x4_t w0 = vld1q_f32(wc0); // w0, v23 - float32x4_t w1 = vld1q_f32(wc0 + 4); // w1, v24 - float32x4_t w2 = vld1q_f32(wc0 + 8); // w2, v25 - float32x4_t w3 = vld1q_f32(wc0 + 12); // w3, v26 - float32x4_t w4 = vld1q_f32(wc0 + 16); // w4, v27 - float32x4_t w5 = vld1q_f32(wc0 + 20); // w5, v28 - float32x4_t w6 = vld1q_f32(wc0 + 24); // w6, v29 - float32x4_t w7 = vld1q_f32(wc0 + 28); // w7, v30 - float32x4_t w8 = vld1q_f32(wc0 + 32); // w8, v31 - - const float* r0 = inr0; - const float* r1 = inr1; - const float* r2 = inr2; - const float* r3 = inr3; - - int cnt = w_loop; - asm volatile( - "ldp q15, q16, [%[ptr_out0]] \n" /* load outr00, - outr01*/ - "ldp q17, q18, [%[ptr_out0], #32]\n" /* load outr02, outr03*/ - "ldp q19, q20, [%[ptr_out1]] \n" /* load outr10, outr11*/ - "ldp q21, q22, [%[ptr_out1], #32]\n" /* load outr10, outr11*/ - "ldp q0, q1, [%[r0]], #16 \n" /* load input r0*/ - "ldp q2, q3, [%[r1]], #16 \n" /* load input r1*/ - "2: \n" /* main loop*/ - /* r0, r1, mul w0, get out r0, r1 */ - "fmla v15.4s , %[w0].4s, v0.s[0]\n" /* outr00 = w0 * r0[0]*/ - "fmla v16.4s , %[w0].4s, v0.s[1]\n" /* outr01 = w0 * r0[1]*/ - "fmla v17.4s , %[w0].4s, v0.s[2]\n" /* outr02 = w0 * r0[2]*/ - "fmla v18.4s , %[w0].4s, v0.s[3]\n" /* outr03 = w0 * r0[3]*/ - "fmla v19.4s , %[w0].4s, v2.s[0]\n" /* outr10 = w0 * r1[0]*/ - "fmla v20.4s , %[w0].4s, v2.s[1]\n" /* outr11 = w0 * r1[1]*/ - "fmla v21.4s , %[w0].4s, v2.s[2]\n" /* outr12 = w0 * r1[2]*/ - "fmla v22.4s , %[w0].4s, v2.s[3]\n" /* outr13 = w0 * r1[3]*/ - - /* r0, r1, mul w1, get out r0, r1 */ - "fmla v15.4s , %[w1].4s, v0.s[1]\n" /* outr00 = w1 * r0[1]*/ - "fmla v16.4s , %[w1].4s, v0.s[2]\n" /* outr01 = w1 * r0[2]*/ - "fmla v17.4s , %[w1].4s, v0.s[3]\n" /* outr02 = w1 * r0[3]*/ - "fmla v18.4s , %[w1].4s, v1.s[0]\n" /* outr03 = w1 * r0[4]*/ - "fmla v19.4s , %[w1].4s, v2.s[1]\n" /* outr10 = w1 * r1[1]*/ - "fmla v20.4s , %[w1].4s, v2.s[2]\n" /* outr11 = w1 * r1[2]*/ - "fmla v21.4s , %[w1].4s, v2.s[3]\n" /* outr12 = w1 * r1[3]*/ - "fmla v22.4s , %[w1].4s, v3.s[0]\n" /* outr13 = w1 * r1[4]*/ - - "ldp q4, q5, [%[r2]], #16 \n" /* load input r2*/ - - /* r0, r1, mul w2, get out r0, r1 */ - "fmla v15.4s , %[w2].4s, v0.s[2]\n" /* outr00 = w2 * r0[2]*/ - "fmla v16.4s , %[w2].4s, v0.s[3]\n" /* outr01 = w2 * r0[3]*/ - "fmla v17.4s , %[w2].4s, v1.s[0]\n" /* outr02 = w2 * r0[0]*/ - "fmla v18.4s , %[w2].4s, v1.s[1]\n" /* outr03 = w2 * r0[1]*/ - "fmla v19.4s , %[w2].4s, v2.s[2]\n" /* outr10 = w2 * r1[2]*/ - "fmla v20.4s , %[w2].4s, v2.s[3]\n" /* outr11 = w2 * r1[3]*/ - "fmla v21.4s , %[w2].4s, v3.s[0]\n" /* outr12 = w2 * r1[0]*/ - "fmla v22.4s , %[w2].4s, v3.s[1]\n" /* outr13 = w2 * r1[1]*/ - - /* r1, r2, mul w3, get out r0, r1 */ - "fmla v15.4s , %[w3].4s, v2.s[0]\n" /* outr00 = w3 * r1[0]*/ - "fmla v16.4s , %[w3].4s, v2.s[1]\n" /* outr01 = w3 * r1[1]*/ - "fmla v17.4s , %[w3].4s, v2.s[2]\n" /* outr02 = w3 * r1[2]*/ - "fmla v18.4s , %[w3].4s, v2.s[3]\n" /* outr03 = w3 * r1[3]*/ - "fmla v19.4s , %[w3].4s, v4.s[0]\n" /* outr10 = w3 * r2[0]*/ - "fmla v20.4s , %[w3].4s, v4.s[1]\n" /* outr11 = w3 * r2[1]*/ - "fmla v21.4s , %[w3].4s, v4.s[2]\n" /* outr12 = w3 * r2[2]*/ - "fmla v22.4s , %[w3].4s, v4.s[3]\n" /* outr13 = w3 * r2[3]*/ - - "ldp q0, q1, 
[%[r0]], #16 \n" /* load next input r0*/ - - /* r1, r2, mul w4, get out r0, r1 */ - "fmla v15.4s , %[w4].4s, v2.s[1]\n" /* outr00 = w4 * r1[1]*/ - "fmla v16.4s , %[w4].4s, v2.s[2]\n" /* outr01 = w4 * r1[2]*/ - "fmla v17.4s , %[w4].4s, v2.s[3]\n" /* outr02 = w4 * r1[3]*/ - "fmla v18.4s , %[w4].4s, v3.s[0]\n" /* outr03 = w4 * r1[4]*/ - "fmla v19.4s , %[w4].4s, v4.s[1]\n" /* outr10 = w4 * r2[1]*/ - "fmla v20.4s , %[w4].4s, v4.s[2]\n" /* outr11 = w4 * r2[2]*/ - "fmla v21.4s , %[w4].4s, v4.s[3]\n" /* outr12 = w4 * r2[3]*/ - "fmla v22.4s , %[w4].4s, v5.s[0]\n" /* outr13 = w4 * r2[4]*/ - - "ldp q6, q7, [%[r3]], #16 \n" /* load input r3*/ - - /* r1, r2, mul w5, get out r0, r1 */ - "fmla v15.4s , %[w5].4s, v2.s[2]\n" /* outr00 = w5 * r1[2]*/ - "fmla v16.4s , %[w5].4s, v2.s[3]\n" /* outr01 = w5 * r1[3]*/ - "fmla v17.4s , %[w5].4s, v3.s[0]\n" /* outr02 = w5 * r1[0]*/ - "fmla v18.4s , %[w5].4s, v3.s[1]\n" /* outr03 = w5 * r1[1]*/ - "fmla v19.4s , %[w5].4s, v4.s[2]\n" /* outr10 = w5 * r2[2]*/ - "fmla v20.4s , %[w5].4s, v4.s[3]\n" /* outr11 = w5 * r2[3]*/ - "fmla v21.4s , %[w5].4s, v5.s[0]\n" /* outr12 = w5 * r2[0]*/ - "fmla v22.4s , %[w5].4s, v5.s[1]\n" /* outr13 = w5 * r2[1]*/ - - /* r2, r3, mul w6, get out r0, r1 */ - "fmla v15.4s , %[w6].4s, v4.s[0]\n" /* outr00 = w6 * r2[0]*/ - "fmla v16.4s , %[w6].4s, v4.s[1]\n" /* outr01 = w6 * r2[1]*/ - "fmla v17.4s , %[w6].4s, v4.s[2]\n" /* outr02 = w6 * r2[2]*/ - "fmla v18.4s , %[w6].4s, v4.s[3]\n" /* outr03 = w6 * r2[3]*/ - "fmla v19.4s , %[w6].4s, v6.s[0]\n" /* outr10 = w6 * r3[0]*/ - "fmla v20.4s , %[w6].4s, v6.s[1]\n" /* outr11 = w6 * r3[1]*/ - "fmla v21.4s , %[w6].4s, v6.s[2]\n" /* outr12 = w6 * r3[2]*/ - "fmla v22.4s , %[w6].4s, v6.s[3]\n" /* outr13 = w6 * r3[3]*/ - - "ldp q2, q3, [%[r1]], #16 \n" /* load next input r1*/ - - /* r2, r3, mul w7, get out r0, r1 */ - "fmla v15.4s , %[w7].4s, v4.s[1]\n" /* outr00 = w7 * r2[1]*/ - "fmla v16.4s , %[w7].4s, v4.s[2]\n" /* outr01 = w7 * r2[2]*/ - "fmla v17.4s , %[w7].4s, v4.s[3]\n" /* outr02 = w7 * r2[3]*/ - "fmla v18.4s , %[w7].4s, v5.s[0]\n" /* outr03 = w7 * r2[4]*/ - "fmla v19.4s , %[w7].4s, v6.s[1]\n" /* outr10 = w7 * r3[1]*/ - "fmla v20.4s , %[w7].4s, v6.s[2]\n" /* outr11 = w7 * r3[2]*/ - "fmla v21.4s , %[w7].4s, v6.s[3]\n" /* outr12 = w7 * r3[3]*/ - "fmla v22.4s , %[w7].4s, v7.s[0]\n" /* outr13 = w7 * r3[4]*/ - - "subs %w[cnt], %w[cnt], #1 \n" /*loop count -1*/ - - /* r2, r3, mul w8, get out r0, r1 */ - "fmla v15.4s , %[w8].4s, v4.s[2]\n" /* outr00 = w8 * r2[2]*/ - "fmla v16.4s , %[w8].4s, v4.s[3]\n" /* outr01 = w8 * r2[3]*/ - "fmla v17.4s , %[w8].4s, v5.s[0]\n" /* outr02 = w8 * r2[0]*/ - "fmla v18.4s , %[w8].4s, v5.s[1]\n" /* outr03 = w8 * r2[1]*/ - - "stp q15, q16, [%[ptr_out0]], #32\n" /* save outr00, outr01*/ - "fmla v19.4s , %[w8].4s, v6.s[2]\n" /* outr10 = w8 * r3[2]*/ - "stp q17, q18, [%[ptr_out0]], #32\n" /* save outr02, outr03*/ - "fmla v20.4s , %[w8].4s, v6.s[3]\n" /* outr11 = w8 * r3[3]*/ - "ldp q15, q16, [%[ptr_out0]] \n" /* load outr00, outr01*/ - "fmla v21.4s , %[w8].4s, v7.s[0]\n" /* outr12 = w8 * r3[0]*/ - "ldp q17, q18, [%[ptr_out0], #32]\n" /* load outr02, outr03*/ - "fmla v22.4s , %[w8].4s, v7.s[1]\n" /* outr13 = w8 * r3[1]*/ - "stp q19, q20, [%[ptr_out1]], #32\n" /* save outr10, outr11*/ - "stp q21, q22, [%[ptr_out1]], #32\n" /* save outr12, outr13*/ - "ldp q19, q20, [%[ptr_out1]] \n" /* load outr10, outr11*/ - "ldp q21, q22, [%[ptr_out1], #32]\n" /* load outr12, outr13*/ - "bne 2b \n" /* jump to main loop*/ - - : [cnt] "+r"(cnt), - [r0] "+r"(r0), - [r1] "+r"(r1), - [r2] "+r"(r2), - [r3] 
"+r"(r3), - [ptr_out0] "+r"(ptr_out0), - [ptr_out1] "+r"(ptr_out1) - : [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [w5] "w"(w5), - [w6] "w"(w6), - [w7] "w"(w7), - [w8] "w"(w8) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22"); - - wc0 += 9 * hout_c_block; - inr0 += win_round; - inr1 += win_round; - inr2 += win_round; - inr3 += win_round; - } -#else // not __aarch64__ - for (int i = 0; i < ic; ++i) { - const float* wc0 = weight_c + i * w_stride_chin; - - float* ptr_out0 = pre_out0; - float* ptr_out1 = pre_out1; - - const float* r0 = inr0; - const float* r1 = inr1; - const float* r2 = inr2; - const float* r3 = inr3; - - int cnt = w_loop; - asm volatile( - "vld1.32 {d16-d19}, [%[ptr_out0]]! @ " - "load outr0, w0, w1, c0~c3\n" - "vld1.32 {d20-d23}, [%[ptr_out0]] @ load " - "outr0, w2, w3, c0~c3\n" - - /* load weights */ - "vld1.32 {d10-d13}, [%[wc0]]! @ load w0, " - "w1, to q5, q6\n" - "vld1.32 {d14-d15}, [%[wc0]]! @ load w2, " - "to q7\n" - - /* load r0, r1 */ - "vld1.32 {d0-d1}, [%[r0]]! @ load r0, " - "4 float\n" - "vld1.32 {d2}, [%[r0]] @ load r0, " - "2 float\n" - - "sub %[ptr_out0], %[ptr_out0], #32 @ ptr_out0 " - "- 32, to start address\n" - - /* main loop */ - "0: @ main " - "loop\n" - /* mul r0 with w0, w1, w2, get out r0 */ - "vld1.32 {d24-d27}, [%[ptr_out1]]! @ load " - "outr1, w0, w1, c0~c3\n" - "vmla.f32 q8, q5, d0[0] @ w0 * " - "inr00\n" - "vld1.32 {d28-d31}, [%[ptr_out1]] @ load " - "outr1, w2, w3, c0~c3\n" - "vmla.f32 q9, q5, d0[1] @ w0 * " - "inr01\n" - "vmla.f32 q10, q5, d1[0] @ w0 * " - "inr02\n" - "vmla.f32 q11, q5, d1[1] @ w0 * " - "inr03\n" - "vld1.32 {d3-d4}, [%[r1]]! @ load r1, " - "4 float\n" - "vmla.f32 q8, q6, d0[1] @ w1 * " - "inr01\n" - "vmla.f32 q9, q6, d1[0] @ w1 * " - "inr02\n" - "vmla.f32 q10, q6, d1[1] @ w1 * " - "inr03\n" - "vmla.f32 q11, q6, d2[0] @ w1 * " - "inr04\n" - "vld1.32 {d5}, [%[r1]] @ load r0, " - "2 float\n" - "vmla.f32 q8, q7, d1[0] @ w2 * " - "inr02\n" - "vmla.f32 q9, q7, d1[1] @ w2 * " - "inr03\n" - "vmla.f32 q10, q7, d2[0] @ w2 * " - "inr04\n" - "vmla.f32 q11, q7, d2[1] @ w2 * " - "inr05\n" - - "sub %[ptr_out1], %[ptr_out1], #32 @ ptr_out1 " - "- 32, to start address\n" - - /* mul r1 with w0, w1, w2, get out r1 */ - "vmla.f32 q12, q5, d3[0] @ w0 * " - "inr10\n" - "vmla.f32 q13, q5, d3[1] @ w0 * " - "inr11\n" - "vmla.f32 q14, q5, d4[0] @ w0 * " - "inr12\n" - "vmla.f32 q15, q5, d4[1] @ w0 * " - "inr13\n" - "vmla.f32 q12, q6, d3[1] @ w1 * " - "inr11\n" - "vmla.f32 q13, q6, d4[0] @ w1 * " - "inr12\n" - "vmla.f32 q14, q6, d4[1] @ w1 * " - "inr13\n" - "vmla.f32 q15, q6, d5[0] @ w1 * " - "inr14\n" - "vld1.32 {d10-d13}, [%[wc0]]! @ load w3, " - "w4, to q5, q6\n" - "vmla.f32 q12, q7, d4[0] @ w2 * " - "inr12\n" - "vmla.f32 q13, q7, d4[1] @ w2 * " - "inr13\n" - "vmla.f32 q14, q7, d5[0] @ w2 * " - "inr14\n" - "vmla.f32 q15, q7, d5[1] @ w2 * " - "inr15\n" - "vld1.32 {d14-d15}, [%[wc0]]! @ load w5, " - "to q7\n" - - /* mul r1 with w3, w4, w5, get out r0 */ - "vmla.f32 q8, q5, d3[0] @ w3 * " - "inr10\n" - "vmla.f32 q9, q5, d3[1] @ w3 * " - "inr11\n" - "vmla.f32 q10, q5, d4[0] @ w3 * " - "inr12\n" - "vmla.f32 q11, q5, d4[1] @ w3 * " - "inr13\n" - "vld1.32 {d0-d1}, [%[r2]]! 
@ load r2, " - "4 float\n" - "vmla.f32 q8, q6, d3[1] @ w4 * " - "inr11\n" - "vmla.f32 q9, q6, d4[0] @ w4 * " - "inr12\n" - "vmla.f32 q10, q6, d4[1] @ w4 * " - "inr13\n" - "vmla.f32 q11, q6, d5[0] @ w4 * " - "inr14\n" - "vld1.32 {d2}, [%[r2]] @ load r2, " - "2 float\n" - "vmla.f32 q8, q7, d4[0] @ w5 * " - "inr12\n" - "vmla.f32 q9, q7, d4[1] @ w5 * " - "inr13\n" - "vmla.f32 q10, q7, d5[0] @ w5 * " - "inr14\n" - "vmla.f32 q11, q7, d5[1] @ w5 * " - "inr15\n" - - /* mul r2 with w3, w4, w5, get out r1 */ - "vmla.f32 q12, q5, d0[0] @ w3 * " - "inr20\n" - "vmla.f32 q13, q5, d0[1] @ w3 * " - "inr21\n" - "vmla.f32 q14, q5, d1[0] @ w3 * " - "inr22\n" - "vmla.f32 q15, q5, d1[1] @ w3 * " - "inr23\n" - "vmla.f32 q12, q6, d0[1] @ w4 * " - "inr21\n" - "vmla.f32 q13, q6, d1[0] @ w4 * " - "inr22\n" - "vmla.f32 q14, q6, d1[1] @ w4 * " - "inr23\n" - "vmla.f32 q15, q6, d2[0] @ w4 * " - "inr24\n" - "vld1.32 {d10-d13}, [%[wc0]]! @ load w6, " - "w7, to q5, q6\n" - "vmla.f32 q12, q7, d1[0] @ w5 * " - "inr22\n" - "vmla.f32 q13, q7, d1[1] @ w5 * " - "inr23\n" - "vmla.f32 q14, q7, d2[0] @ w5 * " - "inr24\n" - "vmla.f32 q15, q7, d2[1] @ w5 * " - "inr25\n" - "vld1.32 {d14-d15}, [%[wc0]]! @ load w8, " - "to q7\n" - - "sub %[wc0], %[wc0], #144 @ wc0 - " - "144 to start address\n" - - /* mul r2 with w6, w7, w8, get out r0 */ - "vmla.f32 q8, q5, d0[0] @ w6 * " - "inr20\n" - "vmla.f32 q9, q5, d0[1] @ w6 * " - "inr21\n" - "vld1.32 {d3-d4}, [%[r3]]! @ load r3, " - "4 float\n" - "vmla.f32 q10, q5, d1[0] @ w6 * " - "inr22\n" - "vmla.f32 q11, q5, d1[1] @ w6 * " - "inr23\n" - "vmla.f32 q8, q6, d0[1] @ w7 * " - "inr21\n" - "vmla.f32 q9, q6, d1[0] @ w7 * " - "inr22\n" - "vld1.32 {d5}, [%[r3]] @ load r3, " - "2 float\n" - "vmla.f32 q10, q6, d1[1] @ w7 * " - "inr23\n" - "vmla.f32 q11, q6, d2[0] @ w7 * " - "inr24\n" - "vmla.f32 q8, q7, d1[0] @ w8 * " - "inr22\n" - "vmla.f32 q9, q7, d1[1] @ w8 * " - "inr23\n" - "vld1.32 {d0-d1}, [%[r0]]! @ load r0, " - "4 float\n" - "vmla.f32 q10, q7, d2[0] @ w8 * " - "inr24\n" - "vmla.f32 q11, q7, d2[1] @ w8 * " - "inr25\n" - "vld1.32 {d2}, [%[r0]] @ load r0, " - "2 float\n" - - /* mul r3 with w6, w7, w8, get out r1 */ - "vmla.f32 q12, q5, d3[0] @ w6 * " - "inr20\n" - "vmla.f32 q13, q5, d3[1] @ w6 * " - "inr21\n" - "vst1.32 {d16-d19}, [%[ptr_out0]]! @ save " - "r00, r01, c0~c3\n" - "vmla.f32 q14, q5, d4[0] @ w6 * " - "inr22\n" - "vmla.f32 q15, q5, d4[1] @ w6 * " - "inr23\n" - "vst1.32 {d20-d23}, [%[ptr_out0]]! @ save " - "r02, r03, c0~c3\n" - "vmla.f32 q12, q6, d3[1] @ w7 * " - "inr21\n" - "vmla.f32 q13, q6, d4[0] @ w7 * " - "inr22\n" - "vld1.32 {d16-d19}, [%[ptr_out0]]! @ load " - "outr0, w0, w1, c0~c3\n" - "vmla.f32 q14, q6, d4[1] @ w7 * " - "inr23\n" - "vmla.f32 q15, q6, d5[0] @ w7 * " - "inr24\n" - "vld1.32 {d10-d13}, [%[wc0]]! @ load w0, " - "w1, to q5, q6\n" - "vmla.f32 q12, q7, d4[0] @ w8 * " - "inr22\n" - "vmla.f32 q13, q7, d4[1] @ w8 * " - "inr23\n" - "vld1.32 {d20-d23}, [%[ptr_out0]] @ load " - "outr0, w2, w3, c0~c3\n" - "vmla.f32 q14, q7, d5[0] @ w8 * " - "inr24\n" - "vmla.f32 q15, q7, d5[1] @ w8 * " - "inr25\n" - - "vst1.32 {d24-d27}, [%[ptr_out1]]! @ save " - "r10, r11, c0~c3\n" - "vst1.32 {d28-d31}, [%[ptr_out1]]! @ save " - "r12, r13, c0~c3\n" - "vld1.32 {d14-d15}, [%[wc0]]! 
@ load w2, " - "to q7\n" - - "sub %[ptr_out0], %[ptr_out0], #32 @ ptr_out0 " - "- 32, to start address\n" - - "subs %[cnt], #1 @ loop " - "count--\n" - "bne 0b @ jump to " - "main loop\n" - - : [cnt] "+r"(cnt), - [r0] "+r"(r0), - [r1] "+r"(r1), - [r2] "+r"(r2), - [r3] "+r"(r3), - [ptr_out0] "+r"(ptr_out0), - [ptr_out1] "+r"(ptr_out1), - [wc0] "+r"(wc0) - : - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - - inr0 += win_round; - inr1 += win_round; - inr2 += win_round; - inr3 += win_round; - } -#endif // __aarch64__ - block_inr0 = block_inr2; - block_inr1 = block_inr3; - block_inr2 = block_inr1 + in_len; - block_inr3 = block_inr2 + in_len; - } - write_to_output_c4_fp32(pre_out, - dout_batch, - c, - c + hout_c_block, - h, - h + h_kernel, - 0, - wout_round, - oc, - oh, - ow, - flag_relu, - ptr_write); - } - const float* weight_remain_ptr = weights + c_round_down * w_stride; -#pragma omp parallel for num_threads(threads) - for (int c = 0; c < c_remain; ++c) { -#ifdef ARM_WITH_OMP - float* pre_out = - pre_din + pre_in_size + omp_get_thread_num() * pre_out_size; -#else - float* pre_out = pre_din + pre_in_size; -#endif - - int c_idx = c_round_down + c; - - int h_kernel = hout_r_block; - if (h + hout_r_block > oh) { - h_kernel = oh - h; - } - - const float* block_inr0 = pre_din; - const float* block_inr1 = block_inr0 + in_len; - const float* block_inr2 = block_inr1 + in_len; - const float* block_inr3 = block_inr2 + in_len; - - const float* bias_ptr = ptr_zero; - if (flag_bias) { - bias_ptr = bias + c_idx; - } - fill_bias(pre_out, bias_ptr, 1, wout_round * h_kernel); - - for (int hk = 0; hk < h_kernel; hk += hout_r_kernel) { - const float* wc0 = weight_remain_ptr; - - const float* inr0 = block_inr0; - const float* inr1 = block_inr1; - const float* inr2 = block_inr2; - const float* inr3 = block_inr3; - - float* pre_out0 = pre_out + hk * wout_round; - float* pre_out1 = pre_out0 + wout_round; -#ifdef __aarch64__ - for (int i = 0; i < ic; ++i) { - float* ptr_out0 = pre_out0; - float* ptr_out1 = pre_out1; - - float32x4_t w0 = vdupq_n_f32(wc0[c]); // w0, v23 - float32x4_t w1 = vdupq_n_f32(wc0[4 + c]); // w1, v24 - float32x4_t w2 = vdupq_n_f32(wc0[8 + c]); // w2, v25 - float32x4_t w3 = vdupq_n_f32(wc0[12 + c]); // w3, v26 - float32x4_t w4 = vdupq_n_f32(wc0[16 + c]); // w4, v27 - float32x4_t w5 = vdupq_n_f32(wc0[20 + c]); // w5, v28 - float32x4_t w6 = vdupq_n_f32(wc0[24 + c]); // w6, v29 - float32x4_t w7 = vdupq_n_f32(wc0[28 + c]); // w7, v30 - float32x4_t w8 = vdupq_n_f32(wc0[32 + c]); // w8, v31 - - const float* r0 = inr0; - const float* r1 = inr1; - const float* r2 = inr2; - const float* r3 = inr3; - - int cnt = w_loop; - asm volatile( - "ldr q21, [%[ptr_out0]] \n" /* load outr0, - w0~w3*/ - "ldr q22, [%[ptr_out1]] \n" /* load outr1, w0~w3*/ - "ldp q0, q1, [%[r0]], #16 \n" /* load input r0*/ - "ldp q2, q3, [%[r1]], #16 \n" /* load input r1*/ - "ldp q4, q5, [%[r2]], #16 \n" /* load input r2*/ - "ldp q6, q7, [%[r3]], #16 \n" /* load input r3*/ - "2: \n" /* main loop*/ - - "fmla v21.4s , %[w0].4s, v0.4s \n" /* outr0 = w0 * r0*/ - "fmla v22.4s , %[w0].4s, v2.4s \n" /* outr1 = w0 * r1*/ - - "ext v8.16b, v0.16b, v1.16b, #4 \n" /* shift r0 left 1*/ - "ext v10.16b, v2.16b, v3.16b, #4 \n" /* shift r1 left 1*/ - "ext v9.16b, v0.16b, v1.16b, #8 \n" /* shift r0 left 2*/ - "ext v11.16b, v2.16b, v3.16b, #8 \n" /* shift r1 left 2*/ - - "ldp q0, q1, [%[r0]], #16 \n" /* load input r0*/ - - "fmla v21.4s , %[w1].4s, 
v8.4s \n" /* outr0 = w1 * r1*/ - "fmla v22.4s , %[w1].4s, v10.4s \n" /* outr1 = w1 * r2*/ - - "fmla v21.4s , %[w2].4s, v9.4s \n" /* outr0 = w2 * r1*/ - "fmla v22.4s , %[w2].4s, v11.4s \n" /* outr1 = w2 * r2*/ - - "fmla v21.4s , %[w3].4s, v2.4s \n" /* outr0 = w3 * r1*/ - "fmla v22.4s , %[w3].4s, v4.4s \n" /* outr1 = w3 * r2*/ - - "ext v12.16b, v4.16b, v5.16b, #4\n" /* shift r2 left 1*/ - "ext v14.16b, v6.16b, v7.16b, #4\n" /* shift r3 left 1*/ - "ext v13.16b, v4.16b, v5.16b, #8\n" /* shift r2 left 2*/ - "ext v15.16b, v6.16b, v7.16b, #8\n" /* shift r3 left 2*/ - - "fmla v21.4s , %[w4].4s, v10.4s \n" /* outr0 = w4 * r1*/ - "fmla v22.4s , %[w4].4s, v12.4s \n" /* outr1 = w4 * r2*/ - - "fmla v21.4s , %[w5].4s, v11.4s \n" /* outr0 = w5 * r1*/ - "fmla v22.4s , %[w5].4s, v13.4s \n" /* outr1 = w5 * r2*/ - - "ldp q2, q3, [%[r1]], #16 \n" /* load input r0*/ - - "fmla v21.4s , %[w6].4s, v4.4s \n" /* outr0 = w6 * r2*/ - "fmla v22.4s , %[w6].4s, v6.4s \n" /* outr1 = w6 * r3*/ - - "ldp q4, q5, [%[r2]], #16 \n" /* load input r2*/ - - "fmla v21.4s , %[w7].4s, v12.4s \n" /* outr0 = w7 * r1*/ - "fmla v22.4s , %[w7].4s, v14.4s \n" /* outr1 = w7 * r2*/ - - "ldp q6, q7, [%[r3]], #16 \n" /* load input r3*/ - - "fmla v21.4s , %[w8].4s, v13.4s \n" /* outr0 = w8 * r1*/ - "fmla v22.4s , %[w8].4s, v15.4s \n" /* outr1 = w8 * r2*/ - - "str q21, [%[ptr_out0]], #16 \n" /*write output r0*/ - "str q22, [%[ptr_out1]], #16 \n" /*write output r1*/ - - "subs %w[cnt], %w[cnt], #1 \n" /*loop count -1*/ - - "ldr q21, [%[ptr_out0]] \n" /* load outr0, w0~w3*/ - "ldr q22, [%[ptr_out1]] \n" /* load outr1, w0~w3*/ - - "bne 2b \n" /* jump to main loop*/ - - : [cnt] "+r"(cnt), - [r0] "+r"(r0), - [r1] "+r"(r1), - [r2] "+r"(r2), - [r3] "+r"(r3), - [ptr_out0] "+r"(ptr_out0), - [ptr_out1] "+r"(ptr_out1) - : [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [w5] "w"(w5), - [w6] "w"(w6), - [w7] "w"(w7), - [w8] "w"(w8) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v21", - "v22"); - - wc0 += 9 * hout_c_block; - inr0 += win_round; - inr1 += win_round; - inr2 += win_round; - inr3 += win_round; - } -#else // not __aarch64__ - for (int i = 0; i < ic; ++i) { - float* ptr_out0 = pre_out0; - float* ptr_out1 = pre_out1; - - //! get valid weights of current output channel - float w_tmp[10] = {wc0[c], - wc0[c + 4], - wc0[c + 8], - wc0[c + 12], - wc0[c + 16], - wc0[c + 20], - wc0[c + 24], - wc0[c + 28], - wc0[c + 32], - 0.f}; - float32x4_t w0 = vld1q_f32(w_tmp); // w0, w1, w2, q0 - float32x4_t w1 = vld1q_f32(w_tmp + 3); // w3, w4, w5, q1 - float32x4_t w2 = vld1q_f32(w_tmp + 6); // w6, w7, w8, q2 - - const float* r0 = inr0; - const float* r1 = inr1; - const float* r2 = inr2; - const float* r3 = inr3; - int cnt = w_loop / 2; - if (cnt > 0) { - asm volatile( - "vld1.32 {d24-d27}, [%[ptr_out0]] @ " - "load or00, or01\n" - "vld1.32 {d6-d9}, [%[r0]]! 
@ load r0, 8 " - "float\n" - "vld1.32 {d10}, [%[r0]] @ load r0, 2 " - "float\n" - /* main loop */ - "0: @ main loop\n" - /* r0 * w0, w1, w2, get out r0*/ - "vld1.32 {d28-d31}, [%[ptr_out1]] @ load or10, " - "or11\n" - "vext.32 q8, q3, q4, #1 @ r0, shift " - "left 1, get 1, 2, 3, 4\n" - "vext.32 q9, q4, q5, #1 @ r0, shift " - "left 1, get 5, 6, 7, 8\n" - "vmla.f32 q12, q3, %e[w0][0] @ w00 * r0, " - "0, 1, 2, 3\n" - "vmla.f32 q13, q4, %e[w0][0] @ w00 * r0, " - "4, 5, 6, 7\n" - "vext.32 q10, q3, q4, #2 @ r0, shift " - "left 2, get 2, 3, 4, 5\n" - "vext.32 q11, q4, q5, #2 @ r0, shift " - "left 2, get 6, 7, 8, 9\n" - "vmla.f32 q12, q8, %e[w0][1] @ w01 * r0, " - "1, 2, 3, 4\n" - "vmla.f32 q13, q9, %e[w0][1] @ w01 * r0, " - "5, 6, 7, 8\n" - "vld1.32 {d6-d9}, [%[r1]]! @ load r1, 8 " - "float\n" - "vmla.f32 q12, q10, %f[w0][0] @ w02 * r0, " - "2, 3, 4, 5\n" - "vmla.f32 q13, q11, %f[w0][0] @ w02 * r0, " - "6, 7, 8, 9\n" - "vld1.32 {d10}, [%[r1]] @ load r1, 2 " - "float\n" - - /* r1 * w3, w4, w5, get out r0*/ - /* r1 * w0, w1, w2, get out r1*/ - "vmla.f32 q12, q3, %e[w1][0] @ w10 * r1, " - "0, 1, 2, 3\n" - "vmla.f32 q13, q4, %e[w1][0] @ w10 * r1, " - "4, 5, 6, 7\n" - "vext.32 q8, q3, q4, #1 @ r1, shift " - "left 1, get 1, 2, 3, 4\n" - "vext.32 q9, q4, q5, #1 @ r1, shift " - "left 1, get 5, 6, 7, 8\n" - "vmla.f32 q14, q3, %e[w0][0] @ w00 * r1, " - "0, 1, 2, 3\n" - "vmla.f32 q15, q4, %e[w0][0] @ w00 * r1, " - "4, 5, 6, 7\n" - "vext.32 q10, q3, q4, #2 @ r1, shift " - "left 2, get 2, 3, 4, 5\n" - "vext.32 q11, q4, q5, #2 @ r1, shift " - "left 2, get 6, 7, 8, 9\n" - "vmla.f32 q12, q8, %e[w1][1] @ w11 * r1, " - "1, 2, 3, 4\n" - "vmla.f32 q13, q9, %e[w1][1] @ w11 * r1, " - "5, 6, 7, 8\n" - "vmla.f32 q14, q8, %e[w0][1] @ w01 * r1, " - "1, 2, 3, 4\n" - "vmla.f32 q15, q9, %e[w0][1] @ w01 * r1, " - "5, 6, 7, 8\n" - "vld1.32 {d6-d9}, [%[r2]]! @ load r2, 8 " - "float\n" - "vmla.f32 q12, q10, %f[w1][0] @ w12 * r1, " - "2, 3, 4, 5\n" - "vmla.f32 q13, q11, %f[w1][0] @ w12 * r1, " - "6, 7, 8, 9\n" - "vmla.f32 q14, q10, %f[w0][0] @ w02 * r1, " - "2, 3, 4, 5\n" - "vmla.f32 q15, q11, %f[w0][0] @ w02 * r1, " - "6, 7, 8, 9\n" - "vld1.32 {d10}, [%[r2]] @ load r2, 2 " - "float\n" - - /* r2 * w6, w7, w8, get out r0*/ - /* r2 * w3, w4, w5, get out r1*/ - "vmla.f32 q12, q3, %e[w2][0] @ w20 * r2, " - "0, 1, 2, 3\n" - "vmla.f32 q13, q4, %e[w2][0] @ w20 * r2, " - "4, 5, 6, 7\n" - "vext.32 q8, q3, q4, #1 @ r2, shift " - "left 1, get 1, 2, 3, 4\n" - "vext.32 q9, q4, q5, #1 @ r2, shift " - "left 1, get 5, 6, 7, 8\n" - "vmla.f32 q14, q3, %e[w1][0] @ w10 * r2, " - "0, 1, 2, 3\n" - "vmla.f32 q15, q4, %e[w1][0] @ w10 * r2, " - "4, 5, 6, 7\n" - "vext.32 q10, q3, q4, #2 @ r2, shift " - "left 2, get 2, 3, 4, 5\n" - "vext.32 q11, q4, q5, #2 @ r2, shift " - "left 2, get 6, 7, 8, 9\n" - "vmla.f32 q12, q8, %e[w2][1] @ w21 * r2, " - "1, 2, 3, 4\n" - "vmla.f32 q13, q9, %e[w2][1] @ w21 * r2, " - "5, 6, 7, 8\n" - "vmla.f32 q14, q8, %e[w1][1] @ w11 * r2, " - "1, 2, 3, 4\n" - "vmla.f32 q15, q9, %e[w1][1] @ w11 * r2, " - "5, 6, 7, 8\n" - "vld1.32 {d6-d9}, [%[r3]]! 
@ load r3, 8 " - "float\n" - "vmla.f32 q12, q10, %f[w2][0] @ w22 * r2, " - "2, 3, 4, 5\n" - "vmla.f32 q13, q11, %f[w2][0] @ w22 * r2, " - "6, 7, 8, 9\n" - "vmla.f32 q14, q10, %f[w1][0] @ w12 * r2, " - "2, 3, 4, 5\n" - "vmla.f32 q15, q11, %f[w1][0] @ w12 * r2, " - "6, 7, 8, 9\n" - "vld1.32 {d10}, [%[r3]] @ load r3, 2 " - "float\n" - - /* r3 * w6, w7, w8, get out r1*/ - "vext.32 q8, q3, q4, #1 @ r3, shift " - "left 1, get 1, 2, 3, 4\n" - "vext.32 q9, q4, q5, #1 @ r3, shift " - "left 1, get 5, 6, 7, 8\n" - "vmla.f32 q14, q3, %e[w2][0] @ w20 * r3, " - "0, 1, 2, 3\n" - "vmla.f32 q15, q4, %e[w2][0] @ w20 * r3, " - "4, 5, 6, 7\n" - "vst1.32 {d24-d27}, [%[ptr_out0]]! @ save or00, " - "or01\n" - "vext.32 q10, q3, q4, #2 @ r3, shift " - "left 2, get 2, 3, 4, 5\n" - "vext.32 q11, q4, q5, #2 @ r3, shift " - "left 2, get 6, 7, 8, 9\n" - "vmla.f32 q14, q8, %e[w2][1] @ w21 * r3, " - "0, 1, 2, 3\n" - "vmla.f32 q15, q9, %e[w2][1] @ w21 * r3, " - "4, 5, 6, 7\n" - "vld1.32 {d24-d27}, [%[ptr_out0]] @ load or00, " - "or01\n" - "vld1.32 {d6-d9}, [%[r0]]! @ load r3, 8 " - "float\n" - "vmla.f32 q14, q10, %f[w2][0] @ w22 * r3, " - "2, 3, 4, 5\n" - "vmla.f32 q15, q11, %f[w2][0] @ w22 * r3, " - "6, 7, 8, 9\n" - "vld1.32 {d10}, [%[r0]] @ load r0, 2 " - "float\n" - "vst1.32 {d28-d31}, [%[ptr_out1]]! @ save or10, " - "or11\n" - - "subs %[cnt], #1 @loop count " - "-1\n" - "bne 0b @ jump to " - "main loop\n" - - : [cnt] "+r"(cnt), - [r0] "+r"(r0), - [r1] "+r"(r1), - [r2] "+r"(r2), - [r3] "+r"(r3), - [ptr_out0] "+r"(ptr_out0), - [ptr_out1] "+r"(ptr_out1) - : [w0] "w"(w0), [w1] "w"(w1), [w2] "w"(w2) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - r0 -= 8; - } - //! deal with remain ow - if (w_loop & 1) { - ptr_out0[0] += - r0[0] * w_tmp[0] + r0[1] * w_tmp[1] + r0[2] * w_tmp[2] + - r1[0] * w_tmp[3] + r1[1] * w_tmp[4] + r1[2] * w_tmp[5] + - r2[0] * w_tmp[6] + r2[1] * w_tmp[7] + r2[2] * w_tmp[8]; - - ptr_out0[1] += - r0[1] * w_tmp[0] + r0[2] * w_tmp[1] + r0[3] * w_tmp[2] + - r1[1] * w_tmp[3] + r1[2] * w_tmp[4] + r1[3] * w_tmp[5] + - r2[1] * w_tmp[6] + r2[2] * w_tmp[7] + r2[3] * w_tmp[8]; - - ptr_out0[2] += - r0[2] * w_tmp[0] + r0[3] * w_tmp[1] + r0[4] * w_tmp[2] + - r1[2] * w_tmp[3] + r1[3] * w_tmp[4] + r1[4] * w_tmp[5] + - r2[2] * w_tmp[6] + r2[3] * w_tmp[7] + r2[4] * w_tmp[8]; - - ptr_out0[3] += - r0[3] * w_tmp[0] + r0[4] * w_tmp[1] + r0[5] * w_tmp[2] + - r1[3] * w_tmp[3] + r1[4] * w_tmp[4] + r1[5] * w_tmp[5] + - r2[3] * w_tmp[6] + r2[4] * w_tmp[7] + r2[5] * w_tmp[8]; - - ptr_out1[0] += - r1[0] * w_tmp[0] + r1[1] * w_tmp[1] + r1[2] * w_tmp[2] + - r2[0] * w_tmp[3] + r2[1] * w_tmp[4] + r2[2] * w_tmp[5] + - r3[0] * w_tmp[6] + r3[1] * w_tmp[7] + r3[2] * w_tmp[8]; - - ptr_out1[1] += - r1[1] * w_tmp[0] + r1[2] * w_tmp[1] + r1[3] * w_tmp[2] + - r2[1] * w_tmp[3] + r2[2] * w_tmp[4] + r2[3] * w_tmp[5] + - r3[1] * w_tmp[6] + r3[2] * w_tmp[7] + r3[3] * w_tmp[8]; - - ptr_out1[2] += - r1[2] * w_tmp[0] + r1[3] * w_tmp[1] + r1[4] * w_tmp[2] + - r2[2] * w_tmp[3] + r2[3] * w_tmp[4] + r2[4] * w_tmp[5] + - r3[2] * w_tmp[6] + r3[3] * w_tmp[7] + r3[4] * w_tmp[8]; - - ptr_out1[3] += - r1[3] * w_tmp[0] + r1[4] * w_tmp[1] + r1[5] * w_tmp[2] + - r2[3] * w_tmp[3] + r2[4] * w_tmp[4] + r2[5] * w_tmp[5] + - r3[3] * w_tmp[6] + r3[4] * w_tmp[7] + r3[5] * w_tmp[8]; - } - - wc0 += 36; - inr0 += win_round; - inr1 += win_round; - inr2 += win_round; - inr3 += win_round; - } -#endif // __aarch64__ - block_inr0 = block_inr2; - block_inr1 = block_inr3; - block_inr2 = block_inr1 
+ in_len; - block_inr3 = block_inr2 + in_len; - } - write_to_output_c1_fp32(pre_out, - dout_batch, - c_idx, - c_idx + 1, - h, - h + h_kernel, - 0, - wout_round, - oc, - oh, - ow, - flag_relu, - ptr_write); - } - } - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_direct_3x3s2.cc b/lite/backends/arm/math/conv_direct_3x3s2.cc deleted file mode 100644 index 4bc9c5d25b..0000000000 --- a/lite/backends/arm/math/conv_direct_3x3s2.cc +++ /dev/null @@ -1,1209 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/arm/math/conv_block_utils.h" -#include "lite/backends/arm/math/conv_impl.h" -#include "lite/core/context.h" -#ifdef ARM_WITH_OMP -#include -#endif - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void conv_3x3s2_direct_fp32(const float* i_data, - float* o_data, - int bs, - int oc, - int oh, - int ow, - int ic, - int ih, - int win, - const float* weights, - const float* bias, - const operators::ConvParam& param, - ARMContext* ctx) { - //! 3x3s2 convolution, implemented by direct algorithm - //! prepack input to tmp buffer - //! write output to tmp buffer - const int threads = ctx->threads(); - int l2_size = ctx->llc_size() / sizeof(float); - const int pad_w = param.paddings[1]; - const int pad_h = param.paddings[0]; - const int hout_c_block = 4; - const int hout_r_kernel = 2; - const int wout_block = 4; - const int wout_round = ((ow + wout_block - 1) / wout_block) * wout_block; - const int win_round = wout_round * 2 /*stride_w*/ + 1; - bool flag_relu = param.fuse_relu; - bool flag_bias = param.bias != nullptr; - // if (param.activation_param.has_active) { - // if (param.activation_param.active == Active_relu && - // fabs(param.activation_param.negative_slope) < 1e-6f) { - // flag_relu = true; - // } - // } - //! get h block - //! win_round * ic * hin_r_block + wout_round * hout_c_block * hout_r_block - //! * threads = l2_size - //! win_round = 2 * wout_round + 1 - //! hin_r_block = 2 * hout_r_block + 1 - int hout_r_block = - (l2_size - 2 * wout_round * ic - ic) / - ((4 * wout_round + 2) * ic + wout_round * hout_c_block * threads); - hout_r_block = hout_r_block > oh ? oh : hout_r_block; - hout_r_block = (hout_r_block / hout_r_kernel) * hout_r_kernel; - hout_r_block = hout_r_block < hout_r_kernel ? hout_r_kernel : hout_r_block; - - const int hin_r_block = hout_r_block * 2 /*stride_h*/ + 1; - - float* tmp_work_space = ctx->workspace_data(); - float ptr_zero[win_round]; // NOLINT - memset(ptr_zero, 0, sizeof(float) * win_round); - float ptr_write[wout_round]; // NOLINT - - int in_len = win_round * ic; - int pre_in_size = hin_r_block * in_len; - int pre_out_size = hout_c_block * hout_r_block * wout_round; - - //! 
l2_cache start - float* pre_din = tmp_work_space; - - int size_in_channel = win * ih; - int size_out_channel = ow * oh; - int w_stride = ic * 9; /*kernel_w * kernel_h*/ - int w_stride_chin = hout_c_block * 9; // kernel_w * kernel_h * - - int ws = -pad_w; - int we = ws + win_round; - int w_loop = wout_round / 4; - - int c_remain = oc - (oc / hout_c_block) * hout_c_block; - int c_round_down = (oc / hout_c_block) * hout_c_block; - - int out_row_stride = hout_c_block * wout_round; - - for (int n = 0; n < bs; ++n) { - const float* din_batch = i_data + n * ic * size_in_channel; - float* dout_batch = o_data + n * oc * size_out_channel; - for (int h = 0; h < oh; h += hout_r_block) { - int h_kernel = hout_r_block; - if (h + hout_r_block > oh) { - h_kernel = oh - h; - } - - int hs = h * 2 /*stride_h*/ - pad_h; - int he = hs + h_kernel * 2 /*stride_h*/ + 1; - - prepack_input_nxw( - din_batch, pre_din, 0, ic, hs, he, ws, we, ic, win, ih, ptr_zero); - - const float* cblock_inr0 = pre_din; - const float* cblock_inr1 = cblock_inr0 + in_len; - const float* cblock_inr2 = cblock_inr1 + in_len; - const float* cblock_inr3 = cblock_inr2 + in_len; - const float* cblock_inr4 = cblock_inr3 + in_len; - -#pragma omp parallel for num_threads(threads) - for (int c = 0; c < c_round_down; c += hout_c_block) { -#ifdef ARM_WITH_OMP - float* pre_out = - pre_din + pre_in_size + omp_get_thread_num() * pre_out_size; -#else - float* pre_out = pre_din + pre_in_size; -#endif - const float* block_inr0 = cblock_inr0; - const float* block_inr1 = cblock_inr1; - const float* block_inr2 = cblock_inr2; - const float* block_inr3 = cblock_inr3; - const float* block_inr4 = cblock_inr4; - - const float* weight_c = weights + c * w_stride; - const float* bias_ptr = ptr_zero; - if (flag_bias) { - bias_ptr = bias + c; - } - fill_packed_biasc4( - pre_out, bias_ptr, wout_round * hout_c_block * h_kernel); - - for (int hk = 0; hk < h_kernel; hk += hout_r_kernel) { - const float* wc0 = weight_c; - - const float* inr0 = block_inr0; - const float* inr1 = block_inr1; - const float* inr2 = block_inr2; - const float* inr3 = block_inr3; - const float* inr4 = block_inr4; - - float* pre_out0 = pre_out + hk * out_row_stride; - float* pre_out1 = pre_out0 + out_row_stride; -#ifdef __aarch64__ - for (int i = 0; i < ic; ++i) { - float* ptr_out0 = pre_out0; - float* ptr_out1 = pre_out1; - - float32x4_t w0 = vld1q_f32(wc0); // w0, v23 - float32x4_t w1 = vld1q_f32(wc0 + 4); // w1, v24 - float32x4_t w2 = vld1q_f32(wc0 + 8); // w2, v25 - float32x4_t w3 = vld1q_f32(wc0 + 12); // w3, v26 - float32x4_t w4 = vld1q_f32(wc0 + 16); // w4, v27 - float32x4_t w5 = vld1q_f32(wc0 + 20); // w5, v28 - float32x4_t w6 = vld1q_f32(wc0 + 24); // w6, v29 - float32x4_t w7 = vld1q_f32(wc0 + 28); // w7, v30 - float32x4_t w8 = vld1q_f32(wc0 + 32); // w8, v31 - - const float* r0 = inr0; - const float* r1 = inr1; - const float* r2 = inr2; - const float* r3 = inr3; - const float* r4 = inr4; - - int cnt = w_loop; - asm volatile( - "ldp q15, q16, [%[ptr_out0]] \n" /* load outr00, - outr01*/ - "ldp q17, q18, [%[ptr_out0], #32]\n" /* load outr02, outr03*/ - - "ldp q0, q1, [%[r0]], #32 \n" /* load input r0*/ - "ldr d10, [%[r0]] \n" /* load input r0, 9th - element*/ - "ldp q4, q5, [%[r2]], #32 \n" /* load input r2*/ - "ldr d12, [%[r2]] \n" /* load input r2, 9th - element*/ - "2: \n" /* main loop*/ - /* r0, r2, mul w0, get out r0, r1 */ - "ldp q19, q20, [%[ptr_out1]] \n" /* load outr10, outr11*/ - "ldp q21, q22, [%[ptr_out1], #32]\n" /* load outr12, outr13*/ - "fmla v15.4s , %[w0].4s, v0.s[0]\n" 
/* outr00 = w0 * r0[0]*/ - "fmla v16.4s , %[w0].4s, v0.s[2]\n" /* outr01 = w0 * r0[2]*/ - "fmla v17.4s , %[w0].4s, v1.s[0]\n" /* outr02 = w0 * r0[4]*/ - "fmla v18.4s , %[w0].4s, v1.s[2]\n" /* outr03 = w0 * r0[6]*/ - "fmla v19.4s , %[w0].4s, v4.s[0]\n" /* outr10 = w0 * r2[0]*/ - "fmla v20.4s , %[w0].4s, v4.s[2]\n" /* outr11 = w0 * r2[2]*/ - "fmla v21.4s , %[w0].4s, v5.s[0]\n" /* outr12 = w0 * r2[4]*/ - "fmla v22.4s , %[w0].4s, v5.s[2]\n" /* outr13 = w0 * r2[6]*/ - - "ldp q2, q3, [%[r1]], #32 \n" /* load input r1*/ - - /* r2 mul w6, get out r0*/ - "fmla v15.4s , %[w6].4s, v4.s[0]\n" /* outr00 = w6 * r2[0]*/ - "fmla v16.4s , %[w6].4s, v4.s[2]\n" /* outr01 = w6 * r2[2]*/ - "fmla v17.4s , %[w6].4s, v5.s[0]\n" /* outr02 = w6 * r2[4]*/ - "fmla v18.4s , %[w6].4s, v5.s[2]\n" /* outr03 = w6 * r2[6]*/ - - "ldr d11, [%[r1]] \n" /* load input r1, 9th - element*/ - - /* r0, r2, mul w1, get out r0, r1 */ - "fmla v15.4s , %[w1].4s, v0.s[1]\n" /* outr00 = w1 * r0[1]*/ - "fmla v16.4s , %[w1].4s, v0.s[3]\n" /* outr01 = w1 * r0[3]*/ - "fmla v17.4s , %[w1].4s, v1.s[1]\n" /* outr02 = w1 * r0[5]*/ - "fmla v18.4s , %[w1].4s, v1.s[3]\n" /* outr03 = w1 * r0[7]*/ - "fmla v19.4s , %[w1].4s, v4.s[1]\n" /* outr10 = w1 * r2[1]*/ - "fmla v20.4s , %[w1].4s, v4.s[3]\n" /* outr11 = w1 * r2[3]*/ - "fmla v21.4s , %[w1].4s, v5.s[1]\n" /* outr12 = w1 * r2[5]*/ - "fmla v22.4s , %[w1].4s, v5.s[3]\n" /* outr13 = w1 * r2[7]*/ - - "ldp q6, q7, [%[r3]], #32 \n" /* load input r3*/ - - /* r2 mul w7, get out r0 */ - "fmla v15.4s , %[w7].4s, v4.s[1]\n" /* outr00 = w7 * r2[1]*/ - "fmla v16.4s , %[w7].4s, v4.s[3]\n" /* outr01 = w7 * r2[3]*/ - "fmla v17.4s , %[w7].4s, v5.s[1]\n" /* outr02 = w7 * r2[5]*/ - "fmla v18.4s , %[w7].4s, v5.s[3]\n" /* outr03 = w7 * r2[7]*/ - - "ldr d13, [%[r3]] \n" /* load input r3, 9th - element*/ - - /* r0, r2, mul w2, get out r0, r1 */ - "fmla v15.4s , %[w2].4s, v0.s[2]\n" /* outr00 = w2 * r0[2]*/ - "fmla v16.4s , %[w2].4s, v1.s[0]\n" /* outr01 = w2 * r0[4]*/ - "fmla v17.4s , %[w2].4s, v1.s[2]\n" /* outr02 = w2 * r0[6]*/ - "fmla v18.4s , %[w2].4s, v10.s[0]\n" /* outr03 = w2 * - r0[8]*/ - "fmla v19.4s , %[w2].4s, v4.s[2]\n" /* outr10 = w2 * r2[2]*/ - "fmla v20.4s , %[w2].4s, v5.s[0]\n" /* outr11 = w2 * r2[4]*/ - "fmla v21.4s , %[w2].4s, v5.s[2]\n" /* outr12 = w2 * r2[6]*/ - "fmla v22.4s , %[w2].4s, v12.s[0]\n" /* outr13 = w2 * - r2[8]*/ - - "ldp q8, q9, [%[r4]], #32 \n" /* load input r4*/ - - /* r2, mul w8, get out r0 */ - "fmla v15.4s , %[w8].4s, v4.s[2]\n" /* outr00 = w8 * r2[2]*/ - "fmla v16.4s , %[w8].4s, v5.s[0]\n" /* outr01 = w8 * r2[4]*/ - "fmla v17.4s , %[w8].4s, v5.s[2]\n" /* outr02 = w8 * r2[6]*/ - "fmla v18.4s , %[w8].4s, v12.s[0]\n" /* outr03 = w8 * - r2[8]*/ - - "ldr d14, [%[r4]] \n" /* load input r4, 9th - element*/ - - /* r1, r3, mul w3, get out r0, r1 */ - "fmla v15.4s , %[w3].4s, v2.s[0]\n" /* outr00 = w3 * r1[0]*/ - "fmla v16.4s , %[w3].4s, v2.s[2]\n" /* outr01 = w3 * r1[2]*/ - "fmla v17.4s , %[w3].4s, v3.s[0]\n" /* outr02 = w3 * r1[4]*/ - "fmla v18.4s , %[w3].4s, v3.s[2]\n" /* outr03 = w3 * r1[6]*/ - "fmla v19.4s , %[w3].4s, v6.s[0]\n" /* outr10 = w3 * r3[0]*/ - "fmla v20.4s , %[w3].4s, v6.s[2]\n" /* outr11 = w3 * r3[2]*/ - "fmla v21.4s , %[w3].4s, v7.s[0]\n" /* outr12 = w3 * r3[4]*/ - "fmla v22.4s , %[w3].4s, v7.s[2]\n" /* outr13 = w3 * r3[6]*/ - - "ldp q0, q1, [%[r0]], #32 \n" /* load input r0*/ - - /* r1, r3, mul w4, get out r0, r1 */ - "fmla v15.4s , %[w4].4s, v2.s[1]\n" /* outr00 = w4 * r1[1]*/ - "fmla v16.4s , %[w4].4s, v2.s[3]\n" /* outr01 = w4 * r1[3]*/ - "fmla v17.4s , %[w4].4s, 
v3.s[1]\n" /* outr02 = w4 * r1[5]*/ - "fmla v18.4s , %[w4].4s, v3.s[3]\n" /* outr03 = w4 * r1[7]*/ - "fmla v19.4s , %[w4].4s, v6.s[1]\n" /* outr10 = w4 * r3[1]*/ - "fmla v20.4s , %[w4].4s, v6.s[3]\n" /* outr11 = w4 * r3[3]*/ - "fmla v21.4s , %[w4].4s, v7.s[1]\n" /* outr12 = w4 * r3[5]*/ - "fmla v22.4s , %[w4].4s, v7.s[3]\n" /* outr13 = w4 * r3[7]*/ - - "ldr d10, [%[r0]] \n" /* load input r0, 9th - element*/ - - /* r1, r3, mul w5, get out r0, r1 */ - "fmla v15.4s , %[w5].4s, v2.s[2]\n" /* outr00 = w5 * r1[2]*/ - "fmla v16.4s , %[w5].4s, v3.s[0]\n" /* outr01 = w5 * r1[4]*/ - "fmla v17.4s , %[w5].4s, v3.s[2]\n" /* outr02 = w5 * r1[6]*/ - "fmla v18.4s , %[w5].4s, v11.s[0]\n" /* outr03 = w5 * - r1[8]*/ - - "ldp q4, q5, [%[r2]], #32 \n" /* load input r2*/ - "stp q15, q16, [%[ptr_out0]], #32\n" /* save outr00, outr01*/ - - "fmla v19.4s , %[w5].4s, v6.s[2]\n" /* outr10 = w5 * r3[2]*/ - "fmla v20.4s , %[w5].4s, v7.s[0]\n" /* outr11 = w5 * r3[4]*/ - "fmla v21.4s , %[w5].4s, v7.s[2]\n" /* outr12 = w5 * r3[6]*/ - "fmla v22.4s , %[w5].4s, v13.s[0]\n" /* outr13 = w5 * - r3[8]*/ - - "ldr d12, [%[r2]] \n" /* load input r2, 9th - element*/ - "stp q17, q18, [%[ptr_out0]], #32\n" /* save outr02, outr03*/ - - /* r4, mul w6, get out r1 */ - "fmla v19.4s , %[w6].4s, v8.s[0]\n" /* outr10 = w6 * r4[0]*/ - "fmla v20.4s , %[w6].4s, v8.s[2]\n" /* outr11 = w6 * r4[2]*/ - "fmla v21.4s , %[w6].4s, v9.s[0]\n" /* outr12 = w6 * r4[4]*/ - "fmla v22.4s , %[w6].4s, v9.s[2]\n" /* outr13 = w6 * r4[6]*/ - - "ldp q15, q16, [%[ptr_out0]] \n" /* load outr00, outr01*/ - - /* r4, mul w7, get out r1 */ - "fmla v19.4s , %[w7].4s, v8.s[1]\n" /* outr10 = w7 * r4[1]*/ - "fmla v20.4s , %[w7].4s, v8.s[3]\n" /* outr11 = w7 * r4[3]*/ - "fmla v21.4s , %[w7].4s, v9.s[1]\n" /* outr12 = w7 * r4[5]*/ - "fmla v22.4s , %[w7].4s, v9.s[3]\n" /* outr13 = w7 * r4[7]*/ - - "ldp q17, q18, [%[ptr_out0], #32]\n" /* load outr02, outr03*/ - - /* r4, mul w8, get out r1 */ - "fmla v19.4s , %[w8].4s, v8.s[2]\n" /* outr10 = w8 * r4[2]*/ - "fmla v20.4s , %[w8].4s, v9.s[0]\n" /* outr11 = w8 * r4[4]*/ - "fmla v21.4s , %[w8].4s, v9.s[2]\n" /* outr12 = w8 * r4[6]*/ - "fmla v22.4s , %[w8].4s, v14.s[0]\n" /* outr13 = w8 * - r4[8]*/ - - "subs %w[cnt], %w[cnt], #1 \n" /*loop count -1*/ - - "stp q19, q20, [%[ptr_out1]], #32\n" /* save outr10, outr11*/ - "stp q21, q22, [%[ptr_out1]], #32\n" /* save outr12, outr13*/ - - "bne 2b \n" /* jump to main loop*/ - - : [cnt] "+r"(cnt), - [r0] "+r"(r0), - [r1] "+r"(r1), - [r2] "+r"(r2), - [r3] "+r"(r3), - [r4] "+r"(r4), - [ptr_out0] "+r"(ptr_out0), - [ptr_out1] "+r"(ptr_out1) - : [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [w5] "w"(w5), - [w6] "w"(w6), - [w7] "w"(w7), - [w8] "w"(w8) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22"); - - wc0 += 9 * hout_c_block; - inr0 += win_round; - inr1 += win_round; - inr2 += win_round; - inr3 += win_round; - inr4 += win_round; - } -#else // not __aarch64__ - for (int i = 0; i < ic; ++i) { - const float* wc0 = weight_c + i * w_stride_chin; - - float* ptr_out0 = pre_out0; - float* ptr_out1 = pre_out1; - - const float* r0 = inr0; - const float* r1 = inr1; - const float* r2 = inr2; - const float* r3 = inr3; - const float* r4 = inr4; - - int cnt = w_loop; - asm volatile( - "vld1.32 {d16-d19}, [%[ptr_out0]]! 
@ " - "load outr0, w0, w1, c0~c3\n" - "vld1.32 {d20-d23}, [%[ptr_out0]] @ load " - "outr0, w2, w3, c0~c3\n" - - /* load weights */ - "vld1.32 {d10-d13}, [%[wc0]]! @ load w0, " - "w1, to q5, q6\n" - "vld1.32 {d14-d15}, [%[wc0]]! @ load w2, " - "to q7\n" - - /* load r0, r2 */ - "vld1.32 {d0-d3}, [%[r0]]! @ load r0, " - "8 float\n" - "vld1.32 {d8}, [%[r0]] @ load r0, " - "9th float\n" - - "sub %[ptr_out0], %[ptr_out0], #32 @ ptr_out0 " - "- 32, to start address\n" - - /* main loop */ - "0: @ main " - "loop\n" - /* mul r0, with w0, w1, w2 */ - "vld1.32 {d24-d27}, [%[ptr_out1]]! @ load " - "outr1, w0, w1, c0~c3\n" - "vmla.f32 q8, q5, d0[0] @ w0 * " - "inr00\n" - "vld1.32 {d28-d31}, [%[ptr_out1]] @ load " - "outr1, w2, w3, c0~c3\n" - "vmla.f32 q9, q5, d1[0] @ w0 * " - "inr02\n" - "vmla.f32 q10, q5, d2[0] @ w0 * " - "inr04\n" - "vmla.f32 q11, q5, d3[0] @ w0 * " - "inr06\n" - "vld1.32 {d4-d7}, [%[r2]]! @ load r2, " - "8 float\n" - "vmla.f32 q8, q6, d0[1] @ w1 * " - "inr01\n" - "vmla.f32 q9, q6, d1[1] @ w1 * " - "inr03\n" - "vmla.f32 q10, q6, d2[1] @ w1 * " - "inr05\n" - "vmla.f32 q11, q6, d3[1] @ w1 * " - "inr07\n" - "vld1.32 {d9}, [%[r2]] @ load r2, " - "9th float\n" - "vmla.f32 q8, q7, d1[0] @ w2 * " - "inr02\n" - "vmla.f32 q9, q7, d2[0] @ w2 * " - "inr04\n" - "vmla.f32 q10, q7, d3[0] @ w2 * " - "inr06\n" - "vmla.f32 q11, q7, d8[0] @ w2 * " - "inr08\n" - - "sub %[r2], %[r2], #32 @ r2 - 32, " - "load r2 twice\n" - - /* mul r2, with w0, w1, w2 */ - "vld1.32 {d0-d3}, [%[r1]]! @ load r1, " - "8 float\n" - "vmla.f32 q12, q5, d4[0] @ w0 * " - "inr20\n" - "vmla.f32 q13, q5, d5[0] @ w0 * " - "inr22\n" - "vmla.f32 q14, q5, d6[0] @ w0 * " - "inr24\n" - "vmla.f32 q15, q5, d7[0] @ w0 * " - "inr26\n" - "vld1.32 {d8}, [%[r1]] @ load r1, " - "9th float\n" - "vmla.f32 q12, q6, d4[1] @ w1 * " - "inr21\n" - "vmla.f32 q13, q6, d5[1] @ w1 * " - "inr23\n" - "vmla.f32 q14, q6, d6[1] @ w1 * " - "inr25\n" - "vmla.f32 q15, q6, d7[1] @ w1 * " - "inr27\n" - "vld1.32 {d10-d13}, [%[wc0]]! @ load w3, " - "w4, to q5, q6\n" - "vmla.f32 q12, q7, d5[0] @ w2 * " - "inr22\n" - "vmla.f32 q13, q7, d6[0] @ w2 * " - "inr24\n" - "vmla.f32 q14, q7, d7[0] @ w2 * " - "inr26\n" - "vmla.f32 q15, q7, d9[0] @ w2 * " - "inr28\n" - "vld1.32 {d14-d15}, [%[wc0]]! @ load w5, " - "to q7\n" - - /* mul r1, with w3, w4, w5 */ - "vmla.f32 q8, q5, d0[0] @ w3 * " - "inr10\n" - "vmla.f32 q9, q5, d1[0] @ w3 * " - "inr12\n" - "vmla.f32 q10, q5, d2[0] @ w3 * " - "inr14\n" - "vmla.f32 q11, q5, d3[0] @ w3 * " - "inr16\n" - "vld1.32 {d4-d7}, [%[r3]]! @ load r3, " - "8 float\n" - "vmla.f32 q8, q6, d0[1] @ w4 * " - "inr11\n" - "vmla.f32 q9, q6, d1[1] @ w4 * " - "inr13\n" - "vmla.f32 q10, q6, d2[1] @ w4 * " - "inr15\n" - "vmla.f32 q11, q6, d3[1] @ w4 * " - "inr17\n" - "vld1.32 {d9}, [%[r3]] @ load r3, " - "9th float\n" - "vmla.f32 q8, q7, d1[0] @ w5 * " - "inr12\n" - "vmla.f32 q9, q7, d2[0] @ w5 * " - "inr14\n" - "vmla.f32 q10, q7, d3[0] @ w5 * " - "inr16\n" - "vmla.f32 q11, q7, d8[0] @ w5 * " - "inr18\n" - - "sub %[ptr_out1], %[ptr_out1], #32 @ ptr_out1 " - "- 32, to start address\n" - - /* mul r3, with w3, w4, w5 */ - "vld1.32 {d0-d3}, [%[r2]]! 
@ load r2, " - "8 float\n" - "vmla.f32 q12, q5, d4[0] @ w3 * " - "inr30\n" - "vmla.f32 q13, q5, d5[0] @ w3 * " - "inr32\n" - "vmla.f32 q14, q5, d6[0] @ w3 * " - "inr34\n" - "vmla.f32 q15, q5, d7[0] @ w3 * " - "inr36\n" - "vld1.32 {d8}, [%[r2]] @ load r2, " - "9th float\n" - "vmla.f32 q12, q6, d4[1] @ w4 * " - "inr31\n" - "vmla.f32 q13, q6, d5[1] @ w4 * " - "inr33\n" - "vmla.f32 q14, q6, d6[1] @ w4 * " - "inr35\n" - "vmla.f32 q15, q6, d7[1] @ w4 * " - "inr37\n" - "vld1.32 {d10-d13}, [%[wc0]]! @ load w6, " - "w7, to q5, q6\n" - "vmla.f32 q12, q7, d5[0] @ w5 * " - "inr32\n" - "vmla.f32 q13, q7, d6[0] @ w5 * " - "inr34\n" - "vmla.f32 q14, q7, d7[0] @ w5 * " - "inr36\n" - "vmla.f32 q15, q7, d9[0] @ w5 * " - "inr38\n" - "vld1.32 {d14-d15}, [%[wc0]]! @ load w8, " - "to q7\n" - - /* mul r2, with w6, w7, w8 */ - "vmla.f32 q8, q5, d0[0] @ w6 * " - "inr20\n" - "vmla.f32 q9, q5, d1[0] @ w6 * " - "inr22\n" - "vmla.f32 q10, q5, d2[0] @ w6 * " - "inr24\n" - "vmla.f32 q11, q5, d3[0] @ w6 * " - "inr26\n" - "vld1.32 {d4-d7}, [%[r4]]! @ load r4, " - "8 float\n" - "vmla.f32 q8, q6, d0[1] @ w7 * " - "inr21\n" - "vmla.f32 q9, q6, d1[1] @ w7 * " - "inr23\n" - "vmla.f32 q10, q6, d2[1] @ w7 * " - "inr25\n" - "vmla.f32 q11, q6, d3[1] @ w7 * " - "inr27\n" - "vld1.32 {d9}, [%[r4]] @ load r4, " - "9th float\n" - "vmla.f32 q8, q7, d1[0] @ w8 * " - "inr22\n" - "vmla.f32 q9, q7, d2[0] @ w8 * " - "inr24\n" - "vmla.f32 q10, q7, d3[0] @ w8 * " - "inr26\n" - "vmla.f32 q11, q7, d8[0] @ w8 * " - "inr28\n" - - "sub %[wc0], %[wc0], #144 @ wc0 - " - "144 to start address\n" - - /* mul r4, with w6, w7, w8 */ - "vld1.32 {d0-d3}, [%[r0]]! @ load r0, " - "8 float\n" - "vmla.f32 q12, q5, d4[0] @ w3 * " - "inr40\n" - "vst1.32 {d16-d19}, [%[ptr_out0]]! @ save " - "r00, r01, c0~c3\n" - "vmla.f32 q13, q5, d5[0] @ w3 * " - "inr42\n" - "vst1.32 {d20-d23}, [%[ptr_out0]]! @ save " - "r02, r03, c0~c3\n" - "vmla.f32 q14, q5, d6[0] @ w3 * " - "inr44\n" - "vmla.f32 q15, q5, d7[0] @ w3 * " - "inr46\n" - "vld1.32 {d8}, [%[r0]] @ load " - "r0, 9th float\n" - "vmla.f32 q12, q6, d4[1] @ w4 * " - "inr41\n" - "vmla.f32 q13, q6, d5[1] @ w4 * " - "inr43\n" - "vmla.f32 q14, q6, d6[1] @ w4 * " - "inr45\n" - "vmla.f32 q15, q6, d7[1] @ w4 * " - "inr47\n" - "vld1.32 {d10-d13}, [%[wc0]]! @ load w0, " - "w1, to q5, q6\n" - "vmla.f32 q12, q7, d5[0] @ w5 * " - "inr42\n" - "vmla.f32 q13, q7, d6[0] @ w5 * " - "inr44\n" - "vmla.f32 q14, q7, d7[0] @ w5 * " - "inr46\n" - "vmla.f32 q15, q7, d9[0] @ w5 * " - "inr48\n" - "vld1.32 {d14-d15}, [%[wc0]]! @ load w2, " - "to q7\n" - - "vst1.32 {d24-d27}, [%[ptr_out1]]! @ save " - "r10, r11, c0~c3\n" - "vst1.32 {d28-d31}, [%[ptr_out1]]! @ save " - "r12, r13, c0~c3\n" - - "vld1.32 {d16-d19}, [%[ptr_out0]]! 
@ load " - "outr0, w0, w1, c0~c3\n" - "vld1.32 {d20-d23}, [%[ptr_out0]] @ load " - "outr0, w2, w3, c0~c3\n" - - "sub %[ptr_out0], %[ptr_out0], #32 @ ptr_out0 " - "- 32, to start address\n" - - "subs %[cnt], #1 @ loop " - "count--\n" - "bne 0b @ jump to " - "main loop\n" - - : [cnt] "+r"(cnt), - [r0] "+r"(r0), - [r1] "+r"(r1), - [r2] "+r"(r2), - [r3] "+r"(r3), - [r4] "+r"(r4), - [ptr_out0] "+r"(ptr_out0), - [ptr_out1] "+r"(ptr_out1), - [wc0] "+r"(wc0) - : - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - - inr0 += win_round; - inr1 += win_round; - inr2 += win_round; - inr3 += win_round; - inr4 += win_round; - } -#endif // __aarch64__ - block_inr0 = block_inr4; - block_inr1 = block_inr0 + in_len; - block_inr2 = block_inr1 + in_len; - block_inr3 = block_inr2 + in_len; - block_inr4 = block_inr3 + in_len; - } - - write_to_output_c4_fp32(pre_out, - dout_batch, - c, - c + hout_c_block, - h, - h + h_kernel, - 0, - wout_round, - oc, - oh, - ow, - flag_relu, - ptr_write); - } - -#pragma omp parallel for num_threads(threads) - for (int c = 0; c < c_remain; ++c) { -#ifdef ARM_WITH_OMP - float* pre_out = - pre_din + pre_in_size + omp_get_thread_num() * pre_out_size; -#else - float* pre_out = pre_din + pre_in_size; -#endif - - const float* block_inr0 = cblock_inr0; - const float* block_inr1 = cblock_inr1; - const float* block_inr2 = cblock_inr2; - const float* block_inr3 = cblock_inr3; - const float* block_inr4 = cblock_inr4; - - //! get weights ptr of remained - const float* weight_c = weights + c_round_down * w_stride; - - //! fill bias to one channel - const float* bias_ptr = ptr_zero; - if (flag_bias) { - bias_ptr = bias + c_round_down + c; - } - fill_bias(pre_out, bias_ptr, 1, wout_round * h_kernel); - - for (int hk = 0; hk < h_kernel; hk += hout_r_kernel) { - const float* wc0 = weight_c; - - const float* inr0 = block_inr0; - const float* inr1 = block_inr1; - const float* inr2 = block_inr2; - const float* inr3 = block_inr3; - const float* inr4 = block_inr4; - - float* pre_out0 = pre_out + hk * wout_round; - float* pre_out1 = pre_out0 + wout_round; -#ifdef __aarch64__ - for (int i = 0; i < ic; ++i) { - float* ptr_out0 = pre_out0; - float* ptr_out1 = pre_out1; - - //! 
get valid weights of current output channel - float32x4_t w0 = vdupq_n_f32(wc0[c]); // w0, v23 - float32x4_t w1 = vdupq_n_f32(wc0[c + 4]); // w1, v24 - float32x4_t w2 = vdupq_n_f32(wc0[c + 8]); // w2, v25 - float32x4_t w3 = vdupq_n_f32(wc0[c + 12]); // w3, v26 - float32x4_t w4 = vdupq_n_f32(wc0[c + 16]); // w4, v27 - float32x4_t w5 = vdupq_n_f32(wc0[c + 20]); // w5, v28 - float32x4_t w6 = vdupq_n_f32(wc0[c + 24]); // w6, v29 - float32x4_t w7 = vdupq_n_f32(wc0[c + 28]); // w7, v30 - float32x4_t w8 = vdupq_n_f32(wc0[c + 32]); // w8, v31 - - const float* r0 = inr0; - const float* r1 = inr1; - const float* r2 = inr2; - const float* r3 = inr3; - const float* r4 = inr4; - - int cnt = w_loop; - asm volatile( - "ldr q21, [%[ptr_out0]] \n" /* load outr00, - outr01, - outr02, - outr03*/ - - "ld2 {v0.4s, v1.4s}, [%[r0]], #32 \n" /* load input r0*/ - "ldr d10, [%[r0]] \n" /* load input r0, 9th - element*/ - "ld2 {v4.4s, v5.4s}, [%[r2]], #32 \n" /* load input r2*/ - "ldr d12, [%[r2]] \n" /* load input r2, 9th - element*/ - "2: \n" /* main loop*/ - /* r0, r2, mul w0, get out r0, r1 */ - "ldr q22, [%[ptr_out1]] \n" /* load outr10, outr11, - outr12, outr13*/ - - "fmla v21.4s , %[w0].4s, v0.4s \n" /* outr0 = w0 * r0[0, 2, - 4, 6]*/ - "fmla v22.4s , %[w0].4s, v4.4s \n" /* outr1 = w0 * r2[0, 2, - 4, 6]*/ - - "ld2 {v2.4s, v3.4s}, [%[r1]], #32 \n" /* load input r1*/ - - /* r2 mul w6, get out r0*/ - "fmla v21.4s , %[w6].4s, v4.4s \n" /* outr0 = w6 * r2[0, 2, - 4, 6]*/ - "ldr d11, [%[r1]] \n" /* load input r1, 9th - element*/ - - /* shift left 1 */ - "ext v15.16b, v0.16b, v10.16b, #4\n" /* shift left r0 1*/ - "ext v16.16b, v4.16b, v12.16b, #4\n" /* shift left r2 1*/ - - /* r0, r2, mul w1, get out r0, r1 */ - "fmla v21.4s , %[w1].4s, v1.4s \n" /* outr0 = w1 * r0[1, 3, - 5, 7]*/ - "fmla v22.4s , %[w1].4s, v5.4s \n" /* outr1 = w1 * r2[1, 3, - 5, 7]*/ - - "ld2 {v6.4s, v7.4s}, [%[r3]], #32 \n" /* load input r3*/ - - /* r2 mul w7, get out r0 */ - "fmla v21.4s , %[w7].4s, v5.4s \n" /* outr00 = w7 * r2[1, - 3, 5, 7]*/ - - "ldr d13, [%[r3]] \n" /* load input r3, 9th - element*/ - - /* r0, r2, mul w2, get out r0, r1 */ - "fmla v21.4s , %[w2].4s, v15.4s \n" /* outr0 = w2 * r0[2, 4, - 6, 8]*/ - "fmla v22.4s , %[w2].4s, v16.4s \n" /* outr1 = w2 * r2[2, 4, - 6, 8]*/ - - "ld2 {v8.4s, v9.4s}, [%[r4]], #32 \n" /* load input r4*/ - - /* r2, mul w8, get out r0 */ - "fmla v21.4s , %[w8].4s, v16.4s \n" /* outr00 = w8 * r2[2, - 4, 6, 8]*/ - - "ldr d14, [%[r4]] \n" /* load input r4, 9th - element*/ - - /* r1, r3, mul w3, get out r0, r1 */ - "fmla v21.4s , %[w3].4s, v2.4s \n" /* outr0 = w3 * r1[0, 2, - 4, 6]*/ - "fmla v22.4s , %[w3].4s, v6.4s \n" /* outr1 = w3 * r3[0, 2, - 4, 6]*/ - - /* shift left 1 */ - "ext v15.16b, v2.16b, v11.16b, #4\n" /* shift left r1 1*/ - "ext v16.16b, v6.16b, v13.16b, #4\n" /* shift left r3 1*/ - - "ld2 {v0.4s, v1.4s}, [%[r0]], #32 \n" /* load input r0*/ - - /* r1, r3, mul w4, get out r0, r1 */ - "fmla v21.4s , %[w4].4s, v3.4s \n" /* outr0 = w4 * r1[1, 3, - 5, 7]*/ - "fmla v22.4s , %[w4].4s, v7.4s \n" /* outr1 = w4 * r3[1, 3, - 5, 7]*/ - - "ldr d10, [%[r0]] \n" /* load input r0, 9th - element*/ - - /* r1, r3, mul w5, get out r0, r1 */ - "fmla v21.4s , %[w5].4s, v15.4s \n" /* outr0 = w5 * r1[2]*/ - "fmla v22.4s , %[w5].4s, v16.4s \n" /* outr1 = w5 * r1[4]*/ - - "ld2 {v4.4s, v5.4s}, [%[r2]], #32 \n" /* load input r2*/ - "ldr d12, [%[r2]] \n" /* load input r2, 9th - element*/ - "str q21, [%[ptr_out0]], #16 \n" /* save outr00, outr01*/ - - /* r4, mul w6, get out r1 */ - "fmla v22.4s , %[w6].4s, v8.4s \n" /* outr1 
= w6 * r4[0, 2, - 4, 6]*/ - - "ext v15.16b, v8.16b, v14.16b, #4\n" /* shift left r1 1*/ - "ldr q21, [%[ptr_out0]] \n" /* load outr0*/ - - /* r4, mul w7, get out r1 */ - "fmla v22.4s , %[w7].4s, v9.4s \n" /* outr1 = w7 * r4[1, 3, - 5, 7]*/ - - /* r4, mul w8, get out r1 */ - "fmla v22.4s , %[w8].4s, v15.4s \n" /* outr1 = w8 * r4[2, 4, - 6, 8]*/ - - "subs %w[cnt], %w[cnt], #1 \n" /*loop count -1*/ - "str q22, [%[ptr_out1]], #16 \n" /* save outr1*/ - "bne 2b \n" /* jump to main loop*/ - - : [cnt] "+r"(cnt), - [r0] "+r"(r0), - [r1] "+r"(r1), - [r2] "+r"(r2), - [r3] "+r"(r3), - [r4] "+r"(r4), - [ptr_out0] "+r"(ptr_out0), - [ptr_out1] "+r"(ptr_out1) - : [w0] "w"(w0), - [w1] "w"(w1), - [w2] "w"(w2), - [w3] "w"(w3), - [w4] "w"(w4), - [w5] "w"(w5), - [w6] "w"(w6), - [w7] "w"(w7), - [w8] "w"(w8) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v21", - "v22"); - - wc0 += 36; - inr0 += win_round; - inr1 += win_round; - inr2 += win_round; - inr3 += win_round; - inr4 += win_round; - } -#else // not __aarch64__ - for (int i = 0; i < ic; ++i) { - float* ptr_out0 = pre_out0; - float* ptr_out1 = pre_out1; - - //! get valid weights of current output channel - float w_tmp[12] = {wc0[c], - wc0[c + 4], - wc0[c + 8], - 0.f, - wc0[c + 12], - wc0[c + 16], - wc0[c + 20], - 0.f, - wc0[c + 24], - wc0[c + 28], - wc0[c + 32], - 0.f}; - float32x4_t w0 = vld1q_f32(w_tmp); // w0, w1, w2, q0 - float32x4_t w1 = vld1q_f32(w_tmp + 4); // w3, w4, w5, q1 - float32x4_t w2 = vld1q_f32(w_tmp + 8); // w6, w7, w8, q2 - - const float* r0 = inr0; - const float* r1 = inr1; - const float* r2 = inr2; - const float* r3 = inr3; - const float* r4 = inr4; - - int cnt = w_loop / 2; - if (cnt > 0) { - asm volatile( - /* main loop */ - "0: @ " - "main loop\n" - "vld1.32 {d24-d27}, [%[ptr_out0]] @ load or00, " - "or01\n" - "vld1.32 {d28-d31}, [%[ptr_out1]] @ load or10, " - "or11\n" - "vld2.32 {d6-d9}, [%[r2]]! @ load r2, 8 " - "float, interleave\n" - "vld2.32 {d10-d13}, [%[r2]]! @ load r2, 8 " - "float, interleave\n" - "vld1.32 {d22}, [%[r2]] @ load 16th " - "float\n" - - /* r2 * w2, r2 * w0, get or0, or1 */ - "vmla.f32 q12, q4, %e[w2][1] @ w21 * r2, " - "1, 3, 5, 7\n" - "vmla.f32 q13, q6, %e[w2][1] @ w21 * r2, " - "9, 11, 13, 15\n" - "vld2.32 {d14-d17}, [%[r0]]! @ load r0, 8 " - "float, interleave\n" - "vmla.f32 q14, q4, %e[w0][1] @ w01 * r2, " - "1, 3, 5, 7\n" - "vmla.f32 q15, q6, %e[w0][1] @ w01 * r2, " - "9, 11, 13, 15\n" - - "vext.32 q4, q3, q5, #1 @ r2, shift " - "left 1, get 2, 4, 6, 8\n" - "vext.32 q6, q5, q11, #1 @ r2, shift " - "left 1, get 10, 12, 14, 16\n" - - "vmla.f32 q12, q3, %e[w2][0] @ w20 * r2, " - "0, 2, 4, 6\n" - "vmla.f32 q13, q5, %e[w2][0] @ w20 * r2, " - "8, 10, 12, 14\n" - "vld2.32 {d18-d21}, [%[r0]]! @ load r0, 8 " - "float, interleave\n" - "vmla.f32 q14, q3, %e[w0][0] @ w00 * r2, " - "0, 2, 4, 6\n" - "vmla.f32 q15, q5, %e[w0][0] @ w00 * r2, " - "8, 10, 12, 14\n" - - "vld1.32 {d22}, [%[r0]] @ load 16th " - "float\n" - - "vmla.f32 q12, q4, %f[w2][0] @ w22 * r2, " - "2, 4, 6, 8\n" - "vmla.f32 q14, q4, %f[w0][0] @ w02 * r2, " - "2, 4, 6, 8\n" - "vld2.32 {d6-d9}, [%[r3]]! @ load r3, 8 " - "float, interleave\n" - "vmla.f32 q13, q6, %f[w2][0] @ w22 * r2, " - "10, 12, 14, 16\n" - "vmla.f32 q15, q6, %f[w0][0] @ w02 * r2, " - "10, 12, 14, 16\n" - "vld2.32 {d10-d13}, [%[r3]]! 
@ load r3, 8 " - "float, interleave\n" - - /* r0 * w0, get or0, r3 * w1, get or1*/ - "vmla.f32 q12, q8, %e[w0][1] @ w01 * r0, " - "1, 3, 5, 7\n" - "vmla.f32 q13, q10, %e[w0][1] @ w01 * r0, " - "9, 11, 13, 15\n" - "vext.32 q8, q7, q9, #1 @ r0, shift " - "left 1, get 2, 4, 6, 8\n" - "vext.32 q10, q9, q11, #1 @ r0, shift " - "left 1, get 10, 12, 14, 16\n" - "vld1.32 {d22}, [%[r3]] @ load 16th " - "float\n" - "vmla.f32 q14, q4, %e[w1][1] @ w11 * r3, " - "1, 3, 5, 7\n" - "vmla.f32 q15, q6, %e[w1][1] @ w11 * r3, " - "9, 11, 13, 15\n" - - "vmla.f32 q12, q7, %e[w0][0] @ w00 * r0, " - "0, 2, 4, 6\n" - "vmla.f32 q13, q9, %e[w0][0] @ w00 * r0, " - "8, 10, 12, 14\n" - "vext.32 q4, q3, q5, #1 @ r3, shift " - "left 1, get 2, 4, 6, 8\n" - "vext.32 q6, q5, q11, #1 @ r3, shift " - "left 1, get 10, 12, 14, 16\n" - "vmla.f32 q14, q3, %e[w1][0] @ w10 * r3, " - "0, 2, 4, 6\n" - "vmla.f32 q15, q5, %e[w1][0] @ w10 * r3, " - "8, 10, 12, 14\n" - - "vmla.f32 q12, q8, %f[w0][0] @ w02 * r0, " - "2, 4, 6, 8\n" - "vld2.32 {d14-d17}, [%[r1]]! @ load r1, 8 " - "float, interleave\n" - "vmla.f32 q13, q10,%f[w0][0] @ w02 * r0, " - "10, 12, 14, 16\n" - "vld2.32 {d18-d21}, [%[r1]]! @ load r1, 8 " - "float, interleave\n" - "vmla.f32 q14, q4, %f[w1][0] @ w12 * r3, " - "2, 4, 6, 8\n" - "vld2.32 {d6-d9}, [%[r4]]! @ load r4, 8 " - "float, interleave\n" - "vmla.f32 q15, q6, %f[w1][0] @ w12 * r3, " - "10, 12, 14, 16\n" - "vld2.32 {d10-d13}, [%[r4]]! @ load r4, 8 " - "float, interleave\n" - - "vld1.32 {d22}, [%[r1]] @ load 16th " - "float\n" - - /* r1 * w1, get or0, r4 * w2, get or1 */ - "vmla.f32 q12, q8, %e[w1][1] @ w11 * r1, " - "1, 3, 5, 7\n" - "vmla.f32 q13, q10, %e[w1][1] @ w11 * r1, " - "9, 11, 13, 15\n" - "vext.32 q8, q7, q9, #1 @ r1, shift " - "left 1, get 2, 4, 6, 8\n" - "vext.32 q10, q9, q11, #1 @ r1, shift " - "left 1, get 10, 12, 14, 16\n" - "vmla.f32 q14, q4, %e[w2][1] @ w21 * r4, " - "1, 3, 5, 7\n" - "vmla.f32 q15, q6, %e[w2][1] @ w21 * r4, " - "9, 11, 13, 15\n" - "vld1.32 {d22}, [%[r4]] @ load 16th " - "float\n" - - "vmla.f32 q12, q7, %e[w1][0] @ w10 * r1, " - "0, 2, 4, 6\n" - "vmla.f32 q13, q9, %e[w1][0] @ w10 * r1, " - "8, 10, 12, 14\n" - "vext.32 q4, q3, q5, #1 @ r1, shift " - "left 1, get 2, 4, 6, 8\n" - "vext.32 q6, q5, q11, #1 @ r1, shift " - "left 1, get 10, 12, 14, 16\n" - "vmla.f32 q14, q3, %e[w2][0] @ w20 * r4, " - "0, 2, 4, 6\n" - "vmla.f32 q15, q5, %e[w2][0] @ w20 * r4, " - "8, 10, 12, 14\n" - - "vmla.f32 q12, q8, %f[w1][0] @ w12 * r1, " - "2, 4, 6, 8\n" - "vmla.f32 q13, q10, %f[w1][0] @ w12 * r1, " - "10, 12, 14, 16\n" - "vmla.f32 q14, q4, %f[w2][0] @ w22 * r4, " - "2, 4, 6, 8\n" - "vmla.f32 q15, q6, %f[w2][0] @ w22 * r4, " - "10, 12, 14, 16\n" - - "vst1.32 {d24-d27}, [%[ptr_out0]]! @ save or0\n" - "vst1.32 {d28-d31}, [%[ptr_out1]]! @ save or0\n" - - "subs %[cnt], #1 @loop count " - "-1\n" - "bne 0b @ jump to " - "main loop\n" - - : [cnt] "+r"(cnt), - [r0] "+r"(r0), - [r1] "+r"(r1), - [r2] "+r"(r2), - [r3] "+r"(r3), - [r4] "+r"(r4), - [ptr_out0] "+r"(ptr_out0), - [ptr_out1] "+r"(ptr_out1) - : [w0] "w"(w0), [w1] "w"(w1), [w2] "w"(w2) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } - //! 
deal with remain ow - if (w_loop & 1) { - ptr_out0[0] += - r0[0] * w_tmp[0] + r0[1] * w_tmp[1] + r0[2] * w_tmp[2] + - r1[0] * w_tmp[4] + r1[1] * w_tmp[5] + r1[2] * w_tmp[6] + - r2[0] * w_tmp[8] + r2[1] * w_tmp[9] + r2[2] * w_tmp[10]; - - ptr_out0[1] += - r0[2] * w_tmp[0] + r0[3] * w_tmp[1] + r0[4] * w_tmp[2] + - r1[2] * w_tmp[4] + r1[3] * w_tmp[5] + r1[4] * w_tmp[6] + - r2[2] * w_tmp[8] + r2[3] * w_tmp[9] + r2[4] * w_tmp[10]; - - ptr_out0[2] += - r0[4] * w_tmp[0] + r0[5] * w_tmp[1] + r0[6] * w_tmp[2] + - r1[4] * w_tmp[4] + r1[5] * w_tmp[5] + r1[6] * w_tmp[6] + - r2[4] * w_tmp[8] + r2[5] * w_tmp[9] + r2[6] * w_tmp[10]; - - ptr_out0[3] += - r0[6] * w_tmp[0] + r0[7] * w_tmp[1] + r0[8] * w_tmp[2] + - r1[6] * w_tmp[4] + r1[7] * w_tmp[5] + r1[8] * w_tmp[6] + - r2[6] * w_tmp[8] + r2[7] * w_tmp[9] + r2[8] * w_tmp[10]; - - ptr_out1[0] += - r2[0] * w_tmp[0] + r2[1] * w_tmp[1] + r2[2] * w_tmp[2] + - r3[0] * w_tmp[4] + r3[1] * w_tmp[5] + r3[2] * w_tmp[6] + - r4[0] * w_tmp[8] + r4[1] * w_tmp[9] + r4[2] * w_tmp[10]; - - ptr_out1[1] += - r2[2] * w_tmp[0] + r2[3] * w_tmp[1] + r2[4] * w_tmp[2] + - r3[2] * w_tmp[4] + r3[3] * w_tmp[5] + r3[4] * w_tmp[6] + - r4[2] * w_tmp[8] + r4[3] * w_tmp[9] + r4[4] * w_tmp[10]; - - ptr_out1[2] += - r2[4] * w_tmp[0] + r2[5] * w_tmp[1] + r2[6] * w_tmp[2] + - r3[4] * w_tmp[4] + r3[5] * w_tmp[5] + r3[6] * w_tmp[6] + - r4[4] * w_tmp[8] + r4[5] * w_tmp[9] + r4[6] * w_tmp[10]; - - ptr_out1[3] += - r2[6] * w_tmp[0] + r2[7] * w_tmp[1] + r2[8] * w_tmp[2] + - r3[6] * w_tmp[4] + r3[7] * w_tmp[5] + r3[8] * w_tmp[6] + - r4[6] * w_tmp[8] + r4[7] * w_tmp[9] + r4[8] * w_tmp[10]; - } - - wc0 += 36; - inr0 += win_round; - inr1 += win_round; - inr2 += win_round; - inr3 += win_round; - inr4 += win_round; - } -#endif // __aarch64__ - block_inr0 = block_inr4; - block_inr1 = block_inr0 + in_len; - block_inr2 = block_inr1 + in_len; - block_inr3 = block_inr2 + in_len; - block_inr4 = block_inr3 + in_len; - } - write_to_output_c1_fp32(pre_out, - dout_batch, - c + c_round_down, - c + c_round_down + 1, - h, - h + h_kernel, - 0, - wout_round, - oc, - oh, - ow, - flag_relu, - ptr_write); - } - } - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_gemmlike.cc b/lite/backends/arm/math/conv_gemmlike.cc deleted file mode 100644 index 1dd102db1e..0000000000 --- a/lite/backends/arm/math/conv_gemmlike.cc +++ /dev/null @@ -1,285 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
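
[Editor's note] The armv7 and aarch64 blocks above close out conv_direct_3x3s2.cc: a direct 3x3, stride-2 convolution that processes four output channels per block, plus a per-channel remainder path whose odd-width tail is computed in scalar code. The sketch below is an editorial reference for the arithmetic those kernels vectorize, not part of the deleted sources; the names (r0..r2, w, out, out_w) are illustrative, and padding is omitted because the deleted code pre-pads its input rows before entering the kernels.

// One output row of a 3x3 convolution with stride 2: out[j] accumulates
// the dot product of the 3x3 window anchored at input column 2 * j over
// three consecutive input rows r0, r1, r2; w holds the nine filter taps.
// The second output row of each block reuses r2 as its top row, exactly
// as ptr_out1 does in the remainder code above.
inline void conv3x3s2_row_ref(const float* r0,
                              const float* r1,
                              const float* r2,
                              const float* w,
                              float* out,
                              int out_w) {
  for (int j = 0; j < out_w; ++j) {
    const int c = 2 * j;  // stride-2 window anchor
    out[j] += r0[c] * w[0] + r0[c + 1] * w[1] + r0[c + 2] * w[2] +
              r1[c] * w[3] + r1[c + 1] * w[4] + r1[c + 2] * w[5] +
              r2[c] * w[6] + r2[c + 1] * w[7] + r2[c + 2] * w[8];
  }
}
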
- -#include "lite/backends/arm/math/conv_gemmlike.h" -#include -#include "lite/backends/arm/math/gemm_prepacked_int8.h" -#include "lite/backends/arm/math/packed_sgemm.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -/********************* Gemmlike Conv Precision Is Float ***********************/ -template <> -bool GemmLikeConv::create(const operators::ConvParam& param, - ARMContext* ctx) { - this->ctx_ = ctx; - auto x_dims = param.x->dims(); - auto w_dims = param.filter->dims(); - auto o_dims = param.output->dims(); - - int iw = x_dims[3]; // nchw - int ih = x_dims[2]; - int ic = x_dims[1]; - int ow = o_dims[3]; - int oh = o_dims[2]; - int oc = o_dims[1]; - int kw = w_dims[3]; - int kh = w_dims[2]; - int sw = param.strides[1]; - int sh = param.strides[0]; - int pw = param.paddings[1]; - int ph = param.paddings[0]; - int dw = param.dilations[1]; - int dh = param.dilations[0]; - - int m = oc / param.groups; - int k = ic * kh * kw / param.groups; - int n = oh * ow; - bool kps_equal = (pw == ph) && (sw == sh) && (kw == kh); - bool ks_equal = (sw == sh) && (kw == kh); - //! select conv gemmlike kernel - if (kw == 1 && sw == 1 && pw == 0 && kps_equal) { - //! 1x1s1p0 gemmlike conv - impl_ = conv1x1s1_gemm; - } else { - //! otherwise case - if (kw == 3 && sw == 1 && n > 1 && ks_equal) { - idx_data_.Resize({1, 1, 1, n * kh * kw}); - int* idx_out = idx_data_.mutable_data(); - for (int i = 0; i < oh; ++i) { - for (int j = 0; j < ow; ++j) { - compute_offset(idx_out, i, j, kh, kw, ih, iw, ph, pw, dh, dw); - idx_out += kh * kw; - } - } - } - //! im2col gemmlike conv - impl_ = conv_im2col_gemm; - this->ctx_->ExtendWorkspace(k * n * sizeof(float)); - } - - if (n > 1) { - int hblock = get_hblock(this->ctx_->arch()); - int m_roundup = hblock * ((m + hblock - 1) / hblock); - int group_size_round_up = ((m_roundup * k + 15) / 16) * 16; - float* w_trans_ptr = nullptr; - weights_trans_.Resize({1, 1, 1, group_size_round_up * param.groups}); - w_trans_ptr = weights_trans_.mutable_data(); - const auto* w_data = param.filter->data(); - for (int g = 0; g < param.groups; ++g) { - const float* weights_group = w_data + g * m * k; - float* weights_trans_ptr = w_trans_ptr + g * group_size_round_up; - prepackA(weights_trans_ptr, - weights_group, - 1.f, - k, - 0, - m, - 0, - k, - false, - this->ctx_); - } - is_weights_transed_ = true; - } - return true; -} - -template <> -bool GemmLikeConv::init(const operators::ConvParam& param, - ARMContext* ctx) { - this->ctx_ = ctx; - return create(param, ctx); -} - -template <> -bool GemmLikeConv::run(const operators::ConvParam& param) { - // start timer - const auto* i_data = param.x->data(); - const auto* w_data = param.filter->data(); - const auto* b_data = param.bias ? 
param.bias->data() : nullptr; - auto* o_data = param.output->mutable_data(); - const int* idx_data = idx_data_.mutable_data(); - - if (is_weights_transed_) { - w_data = weights_trans_.data(); - } - auto x_dims = param.x->dims(); - auto w_dims = param.filter->dims(); - auto o_dims = param.output->dims(); - - int iw = x_dims[3]; // nchw - int ih = x_dims[2]; - int ic = x_dims[1]; - int bs = x_dims[0]; - int oh = o_dims[2]; - int ow = o_dims[3]; - int oc = o_dims[1]; - - impl_(i_data, - o_data, - bs, - oc, - oh, - ow, - ic, - ih, - iw, - w_data, - b_data, - param, - this->ctx_, - idx_data); - - // timer end - return true; -} - -/********************* Gemmlike Conv Precision Is Int8 ************************/ -template -bool GemmLikeConvInt8::create(const operators::ConvParam& param, - ARMContext* ctx) { - this->ctx_ = ctx; - auto x_dims = param.x->dims(); - auto w_dims = param.filter->dims(); - auto o_dims = param.output->dims(); - - int iw = x_dims[3]; // nchw - int ih = x_dims[2]; - int ic = x_dims[1]; - int ow = o_dims[3]; - int oh = o_dims[2]; - int oc = o_dims[1]; - int kw = w_dims[3]; - int kh = w_dims[2]; - int sw = param.strides[1]; - int sh = param.strides[0]; - int pw = param.paddings[1]; - int ph = param.paddings[0]; - int dw = param.dilations[1]; - int dh = param.dilations[0]; - - int m = oc / param.groups; - int k = ic * kh * kw / param.groups; - int n = oh * ow; - w_scale_ = param.weight_scale; - //! update weights scale - if (Ptype_out == PRECISION(kInt8) || Ptype_out == PRECISION(kFloat)) { - CHECK_EQ(this->w_scale_.size(), oc) << "weights scale size must be chout"; - float input_scale = param.input_scale; - for (auto& w_s : w_scale_) { - w_s *= input_scale; - if (Ptype_out == PRECISION(kInt8)) { - w_s /= param.output_scale; - } - } - } - - bool kps_equal = (pw == ph) && (sw == sh) && (kw == kh); - bool ks_equal = (sw == sh) && (kw == kh); - //! select conv gemmlike kernel - if (kw == 1 && sw == 1 && pw == 0 && kps_equal) { - //! 1x1s1p0 gemmlike conv - impl_int8_ = conv1x1s1_gemm_int8; - } else { - //! otherwise case - if (kw == 3 && sw == 1 && n > 1 && ks_equal) { - idx_data_.Resize({1, 1, 1, n * kh * kw}); - int* idx_out = idx_data_.mutable_data(); - for (int i = 0; i < oh; ++i) { - for (int j = 0; j < ow; ++j) { - compute_offset(idx_out, i, j, kh, kw, ih, iw, ph, pw, dh, dw); - idx_out += kh * kw; - } - } - } - //! im2col gemmlike conv - impl_int8_ = conv_im2col_gemm_int8; - this->ctx_->ExtendWorkspace(k * n); - } - - if (n > 1) { - prepackA_int8(&this->weights_trans_, - *param.filter, - m, - k, - param.groups, - false, - this->ctx_); - this->is_weights_transed_ = true; - } - return true; -} - -template -bool GemmLikeConvInt8::init(const operators::ConvParam& param, - ARMContext* ctx) { - this->ctx_ = ctx; - return create(param, ctx); -} - -template -bool GemmLikeConvInt8::run(const operators::ConvParam& param) { - const auto* i_data = param.x->data(); - const auto* w_data = param.filter->data(); - const auto* b_data = param.bias ? 
param.bias->data() : nullptr; - auto* o_data = param.output->mutable_data(); - const int32_t* idx_data = idx_data_.mutable_data(); - - if (this->is_weights_transed_ == true) { - w_data = this->weights_trans_.template data(); - } - auto x_dims = param.x->dims(); - auto w_dims = param.filter->dims(); - auto o_dims = param.output->dims(); - - int iw = x_dims[3]; // nchw - int ih = x_dims[2]; - int ic = x_dims[1]; - int bs = x_dims[0]; - int oh = o_dims[2]; - int ow = o_dims[3]; - int oc = o_dims[1]; - - impl_int8_(i_data, - o_data, - bs, - oc, - oh, - ow, - ic, - ih, - iw, - w_data, - b_data, - param, - this->ctx_, - Ptype_out, - this->w_scale_.data(), - idx_data); - - return true; -} - -template class GemmLikeConvInt8; -template class GemmLikeConvInt8; -template class GemmLikeConvInt8; - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_gemmlike.h b/lite/backends/arm/math/conv_gemmlike.h deleted file mode 100644 index 5986b5c2c8..0000000000 --- a/lite/backends/arm/math/conv_gemmlike.h +++ /dev/null @@ -1,108 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include "lite/backends/arm/math/conv_impl.h" -#include "lite/core/context.h" -#include "lite/core/target_wrapper.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template -class GemmLikeConv - : public ImplBase { - public: - typedef void (*conv_im2col_gemm_impl)(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const float* weights, - const float* bias, - const operators::ConvParam& param, - ARMContext* ctx, - const int* idx_ptr); - - GemmLikeConv() = default; - ~GemmLikeConv() {} - - virtual bool init(const operators::ConvParam& param, ARMContext* ctx) { - LOG(FATAL) << "GemmLikeConv::init() not implemented."; - } - - virtual bool create(const operators::ConvParam& param, ARMContext* ctx) { - LOG(FATAL) << "GemmLikeConv::create() not implemented."; - } - - virtual bool run(const operators::ConvParam& param) { - LOG(FATAL) << "GemmLikeConv::run() not implemented."; - } - - protected: - bool is_weights_transed_{false}; - Tensor idx_data_; - Tensor weights_trans_; - - private: - conv_im2col_gemm_impl impl_{nullptr}; -}; - -template -class GemmLikeConvInt8 : public GemmLikeConv { - public: - typedef void (*conv_im2col_gemm_int8_impl)(const int8_t* din, - int32_t* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const int8_t* weights, - const int32_t* bias, - const operators::ConvParam& param, - ARMContext* ctx, - PrecisionType out_type, - const float* scale, - const int* idx_ptr); - - GemmLikeConvInt8() = default; - ~GemmLikeConvInt8() {} - - virtual bool init(const operators::ConvParam& param, ARMContext* ctx); - - virtual bool create(const operators::ConvParam& param, ARMContext* ctx); - - virtual bool 
run(const operators::ConvParam& param); - - private: - conv_im2col_gemm_int8_impl impl_int8_{nullptr}; - std::vector w_scale_; -}; - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_impl.cc b/lite/backends/arm/math/conv_impl.cc deleted file mode 100644 index dbea9d643e..0000000000 --- a/lite/backends/arm/math/conv_impl.cc +++ /dev/null @@ -1,900 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// #include "saber/funcs/impl/arm/neon/impl/conv_arm_depthwise.h" -// #include "saber/funcs/impl/arm/neon/impl/conv_arm_impl.h" -// #include "saber/funcs/impl/arm/neon/impl/gemm_prepacked_int8.h" -// #include "saber/funcs/impl/arm/neon/impl/gemv_arm_int8.h" -// #include "saber/funcs/impl/arm/neon/impl/sgemv_arm.h" - -#include "lite/backends/arm/math/conv_impl.h" -#include -#include "lite/backends/arm/math/gemm_prepacked_int8.h" -#include "lite/backends/arm/math/gemv_arm_int8.h" -#include "lite/backends/arm/math/packed_sgemm.h" -#include "lite/backends/arm/math/sgemv.h" -#include "lite/core/context.h" -#include "lite/core/target_wrapper.h" -#include "lite/operators/op_params.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -/** - * \brief neon implementation to add bias - * @param tensor - * @param bias - * @param channel - * @param channel_size - */ -void fill_bias(float* tensor, - const float* bias, - int channel, - int channel_size) { - if (tensor == nullptr) { - return; - } - float* data = tensor; - - for (int j = 0; j < channel; ++j) { - float32x4_t vdata = vdupq_n_f32(bias[j]); - int i = 0; - for (; i < channel_size - 3; i += 4) { - vst1q_f32(data + i, vdata); - } - for (; i < channel_size; i++) { - data[i] = bias[j]; - } - data += channel_size; - } -} - -void fill_bias_int8(int* tensor, - const int* bias, - int channel, - int channel_size) { - if (tensor == nullptr) { - return; - } - int* data = tensor; - for (int j = 0; j < channel; ++j) { - int32x4_t vdata = vdupq_n_s32(bias[j]); - int i = 0; - for (; i < channel_size - 3; i += 4) { - vst1q_s32(data + i, vdata); - } - for (; i < channel_size; i++) { - data[i] = bias[j]; - } - data += channel_size; - } -} - -/** - * \brief inline funcs used in im2col - * @param a - * @param b - * @return - */ -inline bool is_a_ge_zero_and_a_lt_b(int a, int b) { - return static_cast(a) < static_cast(b); -} - -/** - * \brief normal im2col function for gemm conv - * @tparam dtype - * @param data_im - * @param channels - * @param height - * @param width - * @param kernel_size - * @param pad - * @param stride - * @param data_col - */ -template -void im2col(const Dtype* data_im, - const int channels, - const int height, - const int width, - const int kernel_h, - const int kernel_w, - const int pad_h, - const int pad_w, - const int stride_h, - const int stride_w, - const int dilation_h, - const int dilation_w, - Dtype* data_col) { - const int output_h = - (height + 2 * pad_h - 
(dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; - const int output_w = - (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; - const int channel_size = height * width; - for (int channel = channels; channel--; data_im += channel_size) { - for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) { - for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) { - int input_row = -pad_h + kernel_row * dilation_h; - for (int output_rows = output_h; output_rows; output_rows--) { - if (!is_a_ge_zero_and_a_lt_b(input_row, height)) { - for (int output_cols = output_w; output_cols; output_cols--) { - *(data_col++) = 0; - } - } else { - int input_col = -pad_w + kernel_col * dilation_w; - for (int output_col = output_w; output_col; output_col--) { - if (is_a_ge_zero_and_a_lt_b(input_col, width)) { - *(data_col++) = data_im[input_row * width + input_col]; - } else { - *(data_col++) = 0; - } - input_col += stride_w; - } - } - input_row += stride_h; - } - } - } - } -} -void compute_offset(int* idx_out, - int h, - int w, - int kernel_h, - int kernel_w, - int height, - int width, - int pad_h, - int pad_w, - int dilation_h, - int dilation_w) { - int idx_h[kernel_h]; // NOLINT - int idx_w[kernel_w]; // NOLINT - for (int i = 0; i < kernel_h; ++i) { - idx_h[i] = h - pad_h + i * dilation_h; - } - for (int i = 0; i < kernel_w; ++i) { - idx_w[i] = w - pad_w + i * dilation_w; - } - for (int k_h = 0; k_h < kernel_h; ++k_h) { - for (int k_w = 0; k_w < kernel_w; ++k_w) { - idx_out[k_h * kernel_w + k_w] = - (idx_h[k_h] >= 0 && idx_w[k_w] >= 0 && idx_h[k_h] < height && - idx_w[k_w] < width) - ? idx_h[k_h] * width + idx_w[k_w] - : -1; - } - } -} -template -void im2col3x3(const Dtype* data_im, - const int channels, - const int height, - const int width, - const int kernel_h, - const int kernel_w, - const int pad_h, - const int pad_w, - const int stride_h, - const int stride_w, - const int dilation_h, - const int dilation_w, - Dtype* data_col, - const int* idx) { - const int output_h = - (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; - const int output_w = - (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; - int kernel_stride = kernel_h * kernel_w; - int in_channel_stride = height * width; - const int* idx_out = idx; - Dtype* data_col_ptr = data_col; - - bool flag_continue = false; - if (dilation_h == 1 && dilation_w == 1) { - flag_continue = true; - } - - for (int o = 0; o < output_h * output_w; o += 1) { - const Dtype* data_im_ptr = data_im; - - // int* idx_out_d = idx_out; - - int idx_out_d0 = idx_out[0]; - int idx_out_d1 = idx_out[1]; - int idx_out_d2 = idx_out[2]; - int idx_out_d3 = idx_out[3]; - int idx_out_d4 = idx_out[4]; - int idx_out_d5 = idx_out[5]; - int idx_out_d6 = idx_out[6]; - int idx_out_d7 = idx_out[7]; - int idx_out_d8 = idx_out[8]; - - for (int i = 0; i < channels; i += 1) { - if (idx_out_d0 >= 0 && idx_out_d2 >= 0 && idx_out_d6 >= 0 && - idx_out_d8 >= 0) { - if (flag_continue) { - memcpy( - data_col_ptr, data_im_ptr + idx_out_d0, kernel_w * sizeof(Dtype)); - memcpy(data_col_ptr + kernel_w, - data_im_ptr + idx_out_d3, - kernel_w * sizeof(Dtype)); - memcpy(data_col_ptr + kernel_w + kernel_w, - data_im_ptr + idx_out_d6, - kernel_w * sizeof(Dtype)); - } else { - data_col_ptr[0] = data_im_ptr[idx_out_d0]; - data_col_ptr[1] = data_im_ptr[idx_out_d1]; - data_col_ptr[2] = data_im_ptr[idx_out_d2]; - data_col_ptr[3] = data_im_ptr[idx_out_d3]; - data_col_ptr[4] = data_im_ptr[idx_out_d4]; - data_col_ptr[5] = data_im_ptr[idx_out_d5]; - 
data_col_ptr[6] = data_im_ptr[idx_out_d6]; - data_col_ptr[7] = data_im_ptr[idx_out_d7]; - data_col_ptr[8] = data_im_ptr[idx_out_d8]; - } - } else { - data_col_ptr[0] = (idx_out_d0 < 0) ? 0 : data_im_ptr[idx_out_d0]; - data_col_ptr[1] = (idx_out_d1 < 0) ? 0 : data_im_ptr[idx_out_d1]; - data_col_ptr[2] = (idx_out_d2 < 0) ? 0 : data_im_ptr[idx_out_d2]; - data_col_ptr[3] = (idx_out_d3 < 0) ? 0 : data_im_ptr[idx_out_d3]; - data_col_ptr[4] = (idx_out_d4 < 0) ? 0 : data_im_ptr[idx_out_d4]; - data_col_ptr[5] = (idx_out_d5 < 0) ? 0 : data_im_ptr[idx_out_d5]; - data_col_ptr[6] = (idx_out_d6 < 0) ? 0 : data_im_ptr[idx_out_d6]; - data_col_ptr[7] = (idx_out_d7 < 0) ? 0 : data_im_ptr[idx_out_d7]; - data_col_ptr[8] = (idx_out_d8 < 0) ? 0 : data_im_ptr[idx_out_d8]; - } - data_im_ptr += height * width; - data_col_ptr += kernel_stride; - } - // data_col_ptr += channels * kernel_stride; - // idx_out += kernel_stride * 2; - idx_out += kernel_stride; - } -} - -/** - * \brief convolution function for kernel size 1x1, stride size 1, gemm - * implementation - */ -void conv1x1s1_gemm(const float* i_data, - float* o_data, - int num, - int oc, - int oh, - int ow, - int ic, - int ih, - int win, - const float* weights, - const float* bias, - const operators::ConvParam& param, - ARMContext* ctx, - const int* idx_ptr) { - int channel_size_out = ow * oh; - int channel_size_in = win * ih; - - const int group = param.groups; - const int m = oc / group; - const int n = oh * ow; - const int k = ic / group; - - bool flag_relu = param.fuse_relu; - bool flag_bias = param.bias != nullptr; - // if (param.activation_param.has_active) { - // if (param.activation_param.active == Active_relu && - // fabs(param.activation_param.negative_slope) < 1e-6f) { - // flag_relu = true; - // } - // } - int hblock = get_hblock(ctx->arch()); - int m_roundup = hblock * ((m + hblock - 1) / hblock); - int weights_size_per_group = m * k; - if (n > 1) { - weights_size_per_group = ((m_roundup * k + 15) / 16) * 16; - } - - // int weights_size_per_group = m_roundup * k;//oc * ic / (group * - // group); - //! 
use gemv when the output channel size = 1 - for (int b = 0; b < num; ++b) { - // dC - for (int g = 0; g < group; ++g) { - float* dout_group = - static_cast(o_data) + (b * oc + g * m) * channel_size_out; - const float* din_group = static_cast(i_data) + - (b * ic + g * k) * channel_size_in; - const float* weights_group = - static_cast(weights) + g * weights_size_per_group; - const float* bias_group = static_cast(bias) + g * m; - - if (n == 1) { - sgemv(weights_group, - din_group, - dout_group, - false, - m, - k, - flag_bias, - bias_group, - flag_relu); - } else { - sgemm_prepack(false, - m, - n, - k, - weights_group, - din_group, - n, - 0.f, - dout_group, - n, - bias_group, - flag_bias, - flag_relu, - ctx); - } - } - } -} - -void conv1x1s1_gemm_int8(const int8_t* i_data, - int32_t* o_data, - int num, - int oc, - int oh, - int ow, - int ic, - int ih, - int win, - const int8_t* weights, - const int32_t* bias, - const operators::ConvParam& param, - ARMContext* ctx, - PrecisionType out_type, - const float* scale, - const int32_t* idx_ptr) { - int group = param.groups; - int channel_size_out = ow * oh; - int channel_size_in = win * ih; - const int m = oc / group; - const int n = oh * ow; - const int k = ic / group; - int hblock = get_hblock_int8(ctx); - int k_roundup = ROUNDUP(k, KBLOCK_INT8); - int m_roundup = ROUNDUP(m, hblock); - int weights_size_per_group = m * k; - if (n > 1) { - weights_size_per_group = ((m_roundup * k_roundup + 15) / 16) * 16; - } - bool flag_relu = param.fuse_relu; - bool flag_bias = param.bias != nullptr; - //! use gemv when the output channel size = 1 - for (int b = 0; b < num; ++b) { - // dC - for (int g = 0; g < group; ++g) { - signed char* dout_group = - reinterpret_cast(o_data) + - (b * oc + g * m) * channel_size_out * PrecisionTypeLength(out_type); - const int8_t* din_group = i_data + (b * ic + g * k) * channel_size_in; - const int8_t* weights_group = weights + g * weights_size_per_group; - const int* bias_group = bias + g * m; - const float* scale_group = scale + g * m; - if (n == 1) { - if (out_type == PRECISION(kFloat)) { - gemv_int8(weights_group, - din_group, - reinterpret_cast(dout_group), - false, - m, - k, - scale_group, - flag_bias, - bias_group, - flag_relu); - } else if (out_type == PRECISION(kInt8)) { // int8 - gemv_int8(weights_group, - din_group, - dout_group, - false, - m, - k, - scale_group, - flag_bias, - bias_group, - flag_relu); - } else { - gemv_int8(weights_group, - din_group, - reinterpret_cast(dout_group), - false, - m, - k, - scale_group, - flag_bias, - bias_group, - flag_relu); - } - } else { - if (out_type == PRECISION(kFloat)) { - gemm_prepack_int8(weights_group, - din_group, - bias_group, - reinterpret_cast(dout_group), - m, - n, - k, - flag_bias, - flag_relu, - false, - scale_group, - ctx); - } else if (out_type == PRECISION(kInt8)) { // int8 - gemm_prepack_int8(weights_group, - din_group, - bias_group, - dout_group, - m, - n, - k, - flag_bias, - flag_relu, - false, - scale_group, - ctx); - } else { - gemm_prepack_int8(weights_group, - din_group, - bias_group, - reinterpret_cast(dout_group), - m, - n, - k, - flag_bias, - flag_relu, - false, - scale_group, - ctx); - } - } - } - } -} - -/** - * \brief convolution function for kernel size 3x3, stride size 2, gemm - * implementation - */ -void conv_im2col_gemm(const float* i_data, - float* o_data, - int num, - int oc, - int oh, - int ow, - int ic, - int ih, - int win, - const float* weights, - const float* bias, - const operators::ConvParam& param, - ARMContext* ctx, - const int* idx_ptr) { 
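// [Editor's note: these comments are an editorial addition, not part of
// the deleted file.] conv_im2col_gemm lowers one group of the convolution
// to a plain matrix multiply:
//
//   C[m x n] = A[m x k] * B[k x n] (+ bias),
//   m = oc / group, n = oh * ow, k = (ic / group) * kh * kw,
//
// where each column of B is one output pixel's flattened receptive field,
// produced by im2col(). Worked example: ic = 3, oc = 16, a 3x3 kernel,
// group = 1, on a 224x224 input with pad 1 and stride 1 gives m = 16,
// k = 27, n = 50176. Two fast paths appear below: when n == 1 the product
// degenerates to a matrix-vector multiply (sgemv), and for 3x3 stride-1
// kernels im2col3x3() gathers inputs through the precomputed idx_ptr
// offsets and stores one receptive field per row, which is why that path
// passes ldb = k and flag_im2col2 as the transposed-B flag to
// sgemm_prepack().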
- const int group = param.groups; - auto filter_dims = param.filter->dims(); - const int kernel_h = filter_dims[2]; - const int kernel_w = filter_dims[3]; // nchw - const int m = oc / group; - const int n = oh * ow; - const int k = ic * kernel_h * kernel_w / group; - const int chin_per_group = ic / group; - int channel_size_out = ow * oh; - int channel_size_in = win * ih; - bool flag_relu = param.fuse_relu; - bool flag_bias = param.bias != nullptr; - // if (param.activation_param.has_active) { - // if (param.activation_param.active == Active_relu && - // fabs(param.activation_param.negative_slope) < 1e-6f) { - // flag_relu = true; - // } - // } - int hblock = get_hblock(ctx->arch()); - int m_roundup = hblock * ((m + hblock - 1) / hblock); - int weights_size_per_group = m * k; - if (n > 1) { - weights_size_per_group = ((m_roundup * k + 15) / 16) * 16; - } - - bool flag_im2col2 = (kernel_h == 3 && kernel_w == 3 && - param.strides[0] == 1 && param.strides[1] == 1 && n > 1); - - float* tmp_work_space = - ctx->workspace_data() + ctx->llc_size() / sizeof(float); - - //! use gemv when the output channel size = 1 - for (int b = 0; b < num; ++b) { - // dC - for (int g = 0; g < group; ++g) { - float* dout_group = o_data + (b * oc + g * m) * channel_size_out; - const float* din_group = - i_data + (b * ic + g * chin_per_group) * channel_size_in; - const float* weights_group = weights + g * weights_size_per_group; - const float* bias_group = bias + g * m; - float* dB = tmp_work_space; - - if (flag_im2col2) { - im2col3x3(din_group, - chin_per_group, - ih, - win, - kernel_h, - kernel_w, - param.paddings[0], - param.paddings[1], - param.strides[0], - param.strides[1], - param.dilations[0], - param.dilations[1], - dB, - idx_ptr); - } else { - im2col(din_group, - chin_per_group, - ih, - win, - kernel_h, - kernel_w, - param.paddings[0], - param.paddings[1], - param.strides[0], - param.strides[1], - param.dilations[0], - param.dilations[1], - dB); - } - if (n == 1) { - sgemv(weights_group, - dB, - dout_group, - false, - m, - k, - flag_bias, - bias_group, - flag_relu); - } else { - int ldb = n; - if (flag_im2col2) { - ldb = k; - } - sgemm_prepack(flag_im2col2, - m, - n, - k, - weights_group, - dB, - ldb, - 0.f, - dout_group, - n, - bias_group, - flag_bias, - flag_relu, - ctx); - } - } - } -} - -void conv_im2col_gemm_int8(const int8_t* i_data, - int32_t* o_data, - int num, - int oc, - int oh, - int ow, - int ic, - int ih, - int win, - const int8_t* weights, - const int32_t* bias, - const operators::ConvParam& param, - ARMContext* ctx, - PrecisionType out_type, - const float* scale, - const int32_t* idx_ptr) { - int group = param.groups; - auto filter_dims = param.filter->dims(); - int kernel_h = filter_dims[2]; - int kernel_w = filter_dims[3]; - int stride_h = param.strides[0]; - int stride_w = param.strides[1]; - int dila_h = param.dilations[0]; - int dila_w = param.dilations[1]; - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; - const int m = oc / group; - const int n = oh * ow; - const int k = ic * kernel_h * kernel_w / group; - const int chin_per_group = ic / group; - int channel_size_out = ow * oh; - int channel_size_in = win * ih; - bool flag_relu = param.fuse_relu; - bool flag_bias = param.bias != nullptr; - - int hblock = get_hblock_int8(ctx); - int k_roundup = ROUNDUP(k, KBLOCK_INT8); - int m_roundup = ROUNDUP(m, hblock); - int weights_size_per_group = m * k; - if (n > 1) { - weights_size_per_group = ((m_roundup * k_roundup + 15) / 16) * 16; - } - - bool flag_im2col2 = (kernel_h == 3 
&& kernel_w == 3 && stride_h == 1 && - stride_w == 1 && n > 1); - - int8_t* tmp_work_space = - ctx->workspace_data() + ctx->llc_size() / sizeof(int8_t); - - //! use gemv when the output channel size = 1 - for (int b = 0; b < num; ++b) { - // dC - for (int g = 0; g < group; ++g) { - signed char* dout_group = - reinterpret_cast(o_data) + - (b * oc + g * m) * channel_size_out * PrecisionTypeLength(out_type); - const int8_t* din_group = static_cast(i_data) + - (b * ic + g * chin_per_group) * channel_size_in; - const int8_t* weights_group = - static_cast(weights) + g * weights_size_per_group; - const int* bias_group = static_cast(bias) + g * m; - int8_t* dB = tmp_work_space; - const float* scale_group = scale + g * m; - - if (flag_im2col2) { - im2col3x3(din_group, - chin_per_group, - ih, - win, - kernel_h, - kernel_w, - pad_h, - pad_w, - stride_h, - stride_w, - dila_h, - dila_w, - dB, - idx_ptr); - - } else { - im2col(din_group, - chin_per_group, - ih, - win, - kernel_h, - kernel_w, - pad_h, - pad_w, - stride_h, - stride_w, - dila_h, - dila_w, - dB); - } - if (n == 1) { - if (out_type == PRECISION(kFloat)) { - gemv_int8(weights_group, - dB, - reinterpret_cast(dout_group), - false, - m, - k, - scale_group, - flag_bias, - bias_group, - flag_relu); - } else if (out_type == PRECISION(kInt8)) { // int8 - gemv_int8(weights_group, - dB, - dout_group, - false, - m, - k, - scale_group, - flag_bias, - bias_group, - flag_relu); - } else { - gemv_int8(weights_group, - dB, - reinterpret_cast(dout_group), - false, - m, - k, - scale_group, - flag_bias, - bias_group, - flag_relu); - } - } else { - if (out_type == PRECISION(kFloat)) { - gemm_prepack_int8(weights_group, - dB, - bias_group, - reinterpret_cast(dout_group), - m, - n, - k, - flag_bias, - flag_relu, - flag_im2col2, - scale_group, - ctx); - } else if (out_type == PRECISION(kInt8)) { // int8 - gemm_prepack_int8(weights_group, - dB, - bias_group, - dout_group, - m, - n, - k, - flag_bias, - flag_relu, - flag_im2col2, - scale_group, - ctx); - } else { - gemm_prepack_int8(weights_group, - dB, - bias_group, - reinterpret_cast(dout_group), - m, - n, - k, - flag_bias, - flag_relu, - flag_im2col2, - scale_group, - ctx); - } - } - } - } -} - -void conv_depthwise_3x3(const float* i_data, - float* o_data, - int num, - int oc, - int oh, - int ow, - int ic, - int ih, - int win, - const float* weights, - const float* bias, - const operators::ConvParam& param, - ARMContext* ctx) { - int pad = param.paddings[1]; - int stride = param.strides[1]; - bool flag_relu = param.fuse_relu; - bool flag_bias = param.bias != nullptr; - // if (param.activation_param.has_active) { - // if (param.activation_param.active == Active_relu && - // fabs(param.activation_param.negative_slope) < 1e-6f) { - // flag_relu = true; - // } - // } - if (pad == 1) { - conv_depthwise_3x3p1(i_data, - o_data, - num, - oc, - oh, - ow, - ic, - ih, - win, - weights, - bias, - stride, - flag_bias, - flag_relu, - ctx); - } else if (pad == 0 && ih > 2) { - conv_depthwise_3x3p0(i_data, - o_data, - num, - oc, - oh, - ow, - ic, - ih, - win, - weights, - bias, - stride, - flag_bias, - flag_relu, - ctx); - } else { - LOG(FATAL) << "unsupport this type 3x3 dw conv"; - } -} - -void conv_depthwise_5x5(const float* i_data, - float* o_data, - int num, - int oc, - int oh, - int ow, - int ic, - int ih, - int win, - const float* weights, - const float* bias, - const operators::ConvParam& param, - ARMContext* ctx) { - int pad = param.paddings[1]; - int stride = param.strides[1]; - bool flag_relu = param.fuse_relu; - bool 
flag_bias = param.bias != nullptr; - // if (param.activation_param.has_active && - // fabs(param.activation_param.negative_slope) < 1e-6f) { - // if (param.activation_param.active == Active_relu) { - // flag_relu = true; - // } - // } - if (pad == 2 && stride == 2) { - conv_depthwise_5x5s2(i_data, - o_data, - num, - oc, - oh, - ow, - ic, - ih, - win, - weights, - bias, - pad, - flag_bias, - flag_relu, - ctx); - } else if (stride == 1) { - conv_depthwise_5x5s1(i_data, - o_data, - num, - oc, - oh, - ow, - ic, - ih, - win, - weights, - bias, - pad, - flag_bias, - flag_relu, - ctx); - } else { - LOG(FATAL) << "unsupport this type 5x5 dw conv"; - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_impl.h b/lite/backends/arm/math/conv_impl.h deleted file mode 100644 index 38d799bb4c..0000000000 --- a/lite/backends/arm/math/conv_impl.h +++ /dev/null @@ -1,423 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "lite/core/context.h" -#include "lite/core/target_wrapper.h" -#include "lite/operators/op_params.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -// TODO(TJ): move to somewhere else common -template -class ImplBase { - public: - ImplBase() {} - virtual ~ImplBase() {} - - virtual bool create(const Param& param, Context* ctx) { return false; } - - virtual bool init(const Param& param, Context* ctx) { return false; } - - virtual bool run(const Param& param) { return false; } - // void set_op_name(const char* name){_op_name = name;} - // const char* get_op_name() { return _op_name.c_str();} - - protected: - Param* param_; - Context* ctx_; -}; - -void conv_3x3s1_direct_fp32(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const float* weights, - const float* bias, - const operators::ConvParam& param, - Context* ctx); - -void conv_3x3s1_direct_int8(const int8_t* din, - int32_t* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const int8_t* weights, - const int32_t* bias, - const operators::ConvParam& param, - Context* ctx, - PrecisionType out_type, - const float* scale); - -void conv_3x3s1_direct_int7(const int8_t* din, - int32_t* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const int8_t* weights, - const int32_t* bias, - const operators::ConvParam& param, - Context* ctx, - PrecisionType out_type, - const float* scale); - -void conv_3x3s2_direct_fp32(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const float* weights, - const float* bias, - const operators::ConvParam& param, - Context* ctx); - -int conv_3x3s2_direct_int8_c_num(); - -void conv_3x3s2_direct_int8(const int8_t* din, - int32_t* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int 
hin, - int win, - const int8_t* weights, - const int32_t* bias, - const operators::ConvParam& param, - Context* ctx, - PrecisionType out_type, - const float* scale); - -void conv_1x5s1_direct(const void* din, - void* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const void* weights, - const void* bias, - int group, - int kernel_w, - int kernel_h, - int stride_w, - int stride_h, - int dila_w, - int dila_h, - int pad_w, - int pad_h, - bool flag_bias, - bool flag_relu, - Context& ctx, - void* work_space, - const void* idx_ptr); - -void conv_5x1s1_direct(const void* din, - void* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const void* weights, - const void* bias, - int group, - int kernel_w, - int kernel_h, - int stride_w, - int stride_h, - int dila_w, - int dila_h, - int pad_w, - int pad_h, - bool flag_bias, - bool flag_relu, - Context& ctx, - void* work_space, - const void* idx_ptr); - -void conv1x1s1_gemm(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const float* weights, - const float* bias, - const operators::ConvParam& param, - Context* ctx, - const int* idx_ptr); - -void conv1x1s1_gemm_int8(const int8_t* din, - int32_t* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const int8_t* weights, - const int32_t* bias, - const operators::ConvParam& param, - Context* ctx, - PrecisionType out_type, - const float* scale, - const int32_t* idx_ptr); - -void conv_im2col_gemm(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const float* weights, - const float* bias, - const operators::ConvParam& param, - Context* ctx, - const int* idx_ptr); - -void conv_im2col_gemm_int8(const int8_t* din, - int32_t* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const int8_t* weights, - const int32_t* bias, - const operators::ConvParam& param, - Context* ctx, - PrecisionType out_type, - const float* scale, - const int32_t* idx_ptr); - -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias - */ - -void conv_depthwise_3x3p0(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - int stride, - bool flag_bias, - bool flag_relu, - ARMContext* ctx); - -void conv_depthwise_3x3p1(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - int stride, - bool flag_bias, - bool flag_relu, - ARMContext* ctx); - -void conv_depthwise_5x5s1(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const float* weights, - const float* bias, - int pad, - bool flag_bias, - bool flag_relu, - ARMContext* ctx); - -void conv_depthwise_5x5s2(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const float* weights, - const float* bias, - int pad, - bool flag_bias, - bool flag_relu, - ARMContext* ctx); - -void conv_depthwise_3x3(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const float* weights, - const float* bias, - const operators::ConvParam& param, - Context* ctx); - -void 
conv_depthwise_3x3_int8(const int8_t* din, - int32_t* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const int8_t* weights, - const int32_t* bias, - const operators::ConvParam& param, - Context* ctx, - PrecisionType out_type, - const float* scale); - -void conv_depthwise_3x3_int7(const int8_t* din, - int32_t* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - int8_t* weights, - const int32_t* bias, - const operators::ConvParam& param, - Context* ctx, - PrecisionType out_type, - const float* scale); - -void conv_depthwise_5x5(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const float* weights, - const float* bias, - const operators::ConvParam& param, - Context* ctx); - -void conv_depthwise_5x5_int8(const int8_t* din, - int32_t* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const int8_t* weights, - const int32_t* bias, - const operators::ConvParam& param, - Context* ctx, - PrecisionType out_type, - const float* scale); - -void conv_winograd3x3(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const float* weights, - const float* bias, - const operators::ConvParam& param, - Context* ctx); - -void winograd_transform_weights( - void* dout, const void* din, int ch_out, int ch_in, void* work_space); - -void compute_offset(int* idx_out, - int h, - int w, - int kernel_h, - int kernel_w, - int height, - int width, - int pad_h, - int pad_w, - int dilation_h, - int dilation_w); - -void fill_bias(float* tensor, const float* bias, int channel, int channel_size); - -void fill_bias_int8(int* tensor, - const int* bias, - int channel, - int channel_size); - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_winograd.cc b/lite/backends/arm/math/conv_winograd.cc deleted file mode 100644 index 43ad9e2cd8..0000000000 --- a/lite/backends/arm/math/conv_winograd.cc +++ /dev/null @@ -1,141 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
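
[Editor's note] The WinogradConv implementation that follows uses the F(6x6, 3x3) scheme: each 8x8 input tile d is transformed as Bt * d * B, multiplied element-wise with the transformed weights G * g * Gt, and mapped back to a 6x6 output tile via At * (.) * A. The 64 per-position element-wise products therefore become 64 independent GEMMs of shape oc x (tile_h * tile_w) with depth ic, which is what create() prepacks weights for and run() dispatches. The helper below is an editorial sketch of the tiling arithmetic create() performs; the names are illustrative and nothing here comes from the deleted sources.

// Tile bookkeeping for F(6x6, 3x3) Winograd: ceil-divide the output into
// 6x6 tiles; every tile needs one 8x8 transformed block per channel.
struct WinoF63Tiling {
  int tile_h;
  int tile_w;
  int tiles;           // GEMM n dimension: one column per output tile
  int trans_per_chan;  // floats of transform buffer needed per channel
};

inline WinoF63Tiling wino_f63_tiling(int oh, int ow) {
  WinoF63Tiling t;
  t.tile_w = (ow + 5) / 6;  // ceil(ow / 6)
  t.tile_h = (oh + 5) / 6;  // ceil(oh / 6)
  t.tiles = t.tile_h * t.tile_w;
  t.trans_per_chan = 8 * 8 * t.tiles;
  return t;
}
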
- -#include "lite/backends/arm/math/conv_winograd.h" -#include -#include "lite/backends/arm/math/conv_impl.h" -#include "lite/backends/arm/math/packed_sgemm.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template <> -bool WinogradConv::create(const operators::ConvParam& param, - ARMContext* ctx) { - this->ctx_ = ctx; - auto x_dims = param.x->dims(); - auto w_dims = param.filter->dims(); - auto o_dims = param.output->dims(); - - int iw = x_dims[3]; // nchw - int ic = x_dims[1]; - int ow = o_dims[3]; - int oh = o_dims[2]; - int oc = o_dims[1]; - int kw = w_dims[3]; - int sw = param.strides[1]; - if (kw == 3) { - is_weights_transed_ = true; - int tile_w = (ow + 5) / 6; - int tile_h = (oh + 5) / 6; - int size_tile = tile_h * tile_w; - int size_trans_channel = 8 * 8 * size_tile; - int max_ch = ic > oc ? ic : oc; - - const int m_wino = oc; - const int n_wino = size_tile; - int hblock = get_hblock(this->ctx_->arch()); - int m_round = hblock * ((m_wino + hblock - 1) / hblock); - weights_trans_.Resize({1, 1, 1, 8 * 8 * m_round * ic}); - this->ctx_->ExtendWorkspace((size_trans_channel * max_ch * 2 + n_wino) * - sizeof(float)); - auto weights_wino = - static_cast(malloc(sizeof(float) * 8 * 8 * oc * ic)); - void* trans_tmp_ptr = malloc(sizeof(float) * 8 * 8 * oc * ic); - if (weights_wino && trans_tmp_ptr) { - winograd_transform_weights( - weights_wino, param.filter->data(), oc, ic, trans_tmp_ptr); - auto weights_trans = weights_trans_.mutable_data(); - for (int i = 0; i < 64; ++i) { - float* packed_weights = weights_trans + i * m_round * ic; - const float* weights_wino_ptr = weights_wino + i * oc * ic; - prepackA(packed_weights, - weights_wino_ptr, - 1.f, - ic, - 0, - m_wino, - 0, - ic, - false, - this->ctx_); - } - impl_ = conv_winograd3x3; - free(trans_tmp_ptr); - free(weights_wino); - return true; - } - free(trans_tmp_ptr); - free(weights_wino); - } else { - LOG(ERROR) << "this type winograd conv not impl"; - } - return false; -} - -template <> -bool WinogradConv::init(const operators::ConvParam& param, - Context* ctx) { - this->ctx_ = ctx; - return create(param, ctx); -} - -template <> -bool WinogradConv::run(const operators::ConvParam& param) { - // start timer - const auto* i_data = param.x->data(); - const auto* w_data = param.filter->data(); - const auto* b_data = param.bias ? param.bias->data() : nullptr; - auto* o_data = param.output->mutable_data(); - - if (is_weights_transed_) { - w_data = weights_trans_.data(); - } - - auto x_dims = param.x->dims(); - auto w_dims = param.filter->dims(); - auto o_dims = param.output->dims(); - - int iw = x_dims[3]; // nchw - int ih = x_dims[2]; - int ic = x_dims[1]; - int bs = x_dims[0]; - int oh = o_dims[2]; - int ow = o_dims[3]; - int oc = o_dims[1]; - - impl_(i_data, - o_data, - bs, - oc, - oh, - ow, - ic, - ih, - iw, - w_data, - b_data, - param, - this->ctx_); - - // timer end - return true; -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_winograd.h b/lite/backends/arm/math/conv_winograd.h deleted file mode 100644 index 1ae5edb0aa..0000000000 --- a/lite/backends/arm/math/conv_winograd.h +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include "lite/backends/arm/math/conv_impl.h" -#include "lite/core/context.h" -#include "lite/core/target_wrapper.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template -class WinogradConv - : public ImplBase { - public: - typedef void (*conv_winograd_impl)(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const float* weights, - const float* bias, - const operators::ConvParam& param, - Context* ctx); - - WinogradConv() = default; - ~WinogradConv() {} - - virtual bool init(const operators::ConvParam& param, - Context* ctx); - - virtual bool create(const operators::ConvParam& param, - Context* ctx); - - virtual bool run(const operators::ConvParam& param); - - private: - conv_winograd_impl impl_{nullptr}; - bool is_weights_transed_{false}; - Tensor weights_trans_; -}; - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_winograd_3x3.cc b/lite/backends/arm/math/conv_winograd_3x3.cc deleted file mode 100644 index 87f51381e6..0000000000 --- a/lite/backends/arm/math/conv_winograd_3x3.cc +++ /dev/null @@ -1,479 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/arm/math/conv_impl.h" -#include "lite/backends/arm/math/packed_sgemm.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void transpose(float* data_out, const float* data_in, int w_in, int h_in); -void transform_input_f6x6(float* dout, const float* din); -void transform_output_f6x6(float* output, const float* din, float bias); -void conv_winograd3x3(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const float* weights, - const float* bias, - const operators::ConvParam& param, - ARMContext* ctx) { - int threads = ctx->threads(); - - const int pad_h = param.paddings[0]; - const int pad_w = param.paddings[1]; - int size_in_channel = win * hin; - int size_out_channel = wout * hout; - bool flag_relu = param.fuse_relu; - bool flag_bias = param.bias != nullptr; - - //! transform input - int tile_w = (wout + 5) / 6; - int tile_h = (hout + 5) / 6; - int size_tile = tile_h * tile_w; - int size_trans_channel = 8 * 8 * size_tile; - int max_ch = chin > chout ? chin : chout; - - int m = chout; - int n = size_tile; - int k = chin; - - float* tmp_work_space = - ctx->workspace_data() + ctx->llc_size() / sizeof(float); - - //! 
tmp data buffer for input transform - float* tmp_data1 = tmp_work_space; - //! tmp data buffer for dot mul - float* tmp_data2 = tmp_data1 + size_trans_channel * max_ch; - - for (int i = 0; i < num; ++i) { - const float* din_batch = din + i * chin * size_in_channel; - float* dout_batch = dout + i * chout * size_out_channel; - -//! transform input Bt * data * B -#pragma omp parallel for num_threads(threads) - for (int j = 0; j < chin; ++j) { - const float* din_channel = din_batch + j * size_in_channel; - float* data_trans_channel = tmp_data1 + j * size_trans_channel; - - for (int h = 0; h < tile_h; h++) { - for (int w = 0; w < tile_w; w++) { - //! prepare data 8x8 - //! row 8 - float data_in_tmp[8][8] = {0.f}; - // memset(data_in_tmp[0], 0, sizeof(float) * 64); - for (int j = 0; j < 8; ++j) { - int start_row = h * 6 + j - pad_h; - if (start_row >= 0 && start_row < hin) { - for (int k = 0; k < 8; ++k) { - int start_col = w * 6 + k - pad_w; - if (start_col >= 0 && start_col < win) { - data_in_tmp[j][k] = din_channel[start_row * win + start_col]; - } - } - } - } - transform_input_f6x6(data_trans_channel, data_in_tmp[0]); - data_trans_channel += 64; - } - } - } - //! end of transform input - - //////////////////////////////////////////////////////////////////////////////// - //! dot mul - //! transpose input, convert from ch_in * tile_h * tile_w * 64 to - //! 64 * ch_in * tile_h * tile_w - int hblock = get_hblock(ctx->arch()); - int m_round = hblock * ((chout + hblock - 1) / hblock); - int stride_a = m_round * chin; - int stride_b = chin * size_tile; - int stride_c = chout * size_tile; - transpose(tmp_data2, tmp_data1, 64, stride_b); - - //! gemm - // #pragma omp parallel for - for (int l = 0; l < 64; ++l) { - const float* ptr_a = weights + l * stride_a; - const float* ptr_b = tmp_data2 + l * stride_b; - float* ptr_c = tmp_data1 + l * stride_c; - sgemm_prepack(false, - chout, - size_tile, - chin, - ptr_a, - ptr_b, - size_tile, - 0.f, - ptr_c, - size_tile, - nullptr, - false, - false, - ctx); - } - - //! transpose output, convert from 64 * ch_out * tile_h * tile_w to - //! ch_out * tile_h * tile_w * 64 - transpose(tmp_data2, tmp_data1, stride_c, 64); -//! end of dot mul - -/////////////////////////////////////////////////////////////////////////////// -//! transform output -#pragma omp parallel for - for (int i = 0; i < chout; ++i) { - float bias_value = flag_bias ? bias[i] : 0.f; - float* dout_tmp = tmp_data2 + i * size_trans_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - for (int h = 0; h < tile_h; ++h) { - for (int w = 0; w < tile_w; ++w) { - float out_tmp[6][6]; - - transform_output_f6x6(out_tmp[0], dout_tmp, bias_value); - dout_tmp += 64; - - for (int j = 0; j < 6; ++j) { - int end_row = h * 6 + j; - if (end_row < hout) { - for (int k = 0; k < 6; ++k) { - int end_col = w * 6 + k; - if (end_col < wout) { - if (flag_relu) { - dout_channel[end_row * wout + end_col] = - out_tmp[j][k] > 0.f ? out_tmp[j][k] : 0.f; - } else { - dout_channel[end_row * wout + end_col] = out_tmp[j][k]; - } - } - } - } - } - } - } - } - //! 
end of transform output - } -} - -/** - * \brief transpose with arm neon optimization - * @param data_out - * @param data_in - * @param w_in - * @param h_in - */ -void transpose(float* data_out, const float* data_in, int w_in, int h_in) { - int nw = w_in >> 2; - int nh = h_in >> 2; - int size_in = w_in * h_in; - - float* ptr_out = data_out; - const float* ptr_in = data_in; -#pragma omp parallel for - for (int h = 0; h < nh; h++) { - const float* ptr_din_row = ptr_in + h * 4 * w_in; - for (int w = 0; w < nw; w++) { - float* data_out_ptr = ptr_out + w * 4 * h_in + h * 4; - const float* din0 = ptr_din_row; - const float* din1 = din0 + w_in; - const float* din2 = din1 + w_in; - const float* din3 = din2 + w_in; - - float* dout0 = data_out_ptr; - float* dout1 = dout0 + h_in; - float* dout2 = dout1 + h_in; - float* dout3 = dout2 + h_in; -#ifdef __aarch64__ - asm("ldr q0, [%[in0]] \n" /*load input 0*/ - "ldr q1, [%[in1]] \n" - "ldr q2, [%[in2]] \n" - "ldr q3, [%[in3]] \n" - "trn1 v4.4s, v0.4s, v1.4s \n" - "trn2 v5.4s, v0.4s, v1.4s \n" - "trn1 v6.4s, v2.4s, v3.4s \n" - "trn2 v7.4s, v2.4s, v3.4s \n" - "trn1 v8.2d, v4.2d, v6.2d \n" - "trn1 v9.2d, v5.2d, v7.2d \n" - "trn2 v10.2d, v4.2d, v6.2d \n" - "trn2 v11.2d, v5.2d, v7.2d \n" - "str q8, [%[out0]] \n" - "str q9, [%[out1]] \n" - "str q10, [%[out2]] \n" - "str q11, [%[out3]] \n" - : - : [out0] "r"(dout0), - [out1] "r"(dout1), - [out2] "r"(dout2), - [out3] "r"(dout3), - [in0] "r"(din0), - [in1] "r"(din1), - [in2] "r"(din2), - [in3] "r"(din3) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11"); -#else - asm("vld1.32 {d0, d1}, [%[in0]] \n" - "vld1.32 {d2, d3}, [%[in1]] \n" - "vld1.32 {d4, d5}, [%[in2]] \n" - "vld1.32 {d6, d7}, [%[in3]] \n" - "vtrn.32 q0, q1 \n" - "vtrn.32 q2, q3 \n" - "vswp d1, d4 \n" - "vswp d3, d6 \n" - "vst1.32 {d0, d1}, [%[out0]] \n" - "vst1.32 {d2, d3}, [%[out1]] \n" - "vst1.32 {d4, d5}, [%[out2]] \n" - "vst1.32 {d6, d7}, [%[out3]] \n" - : - : [out0] "r"(dout0), - [out1] "r"(dout1), - [out2] "r"(dout2), - [out3] "r"(dout3), - [in0] "r"(din0), - [in1] "r"(din1), - [in2] "r"(din2), - [in3] "r"(din3) - : "q0", "q1", "q2", "q3"); -#endif - ptr_din_row += 4; - } - } - // remian - for (int h = 0; h < h_in; h++) { - for (int w = nw * 4; w < w_in; w++) { - const float* data_in_ptr = ptr_in + h * w_in + w; - float* data_out_ptr = ptr_out + w * h_in + h; - *data_out_ptr = *data_in_ptr; - } - } - for (int w = 0; w < w_in; w++) { - for (int h = nh * 4; h < h_in; h++) { - const float* data_in_ptr = ptr_in + h * w_in + w; - float* data_out_ptr = ptr_out + w * h_in + h; - *data_out_ptr = *data_in_ptr; - } - } -} - -/** - * \brief winograd transform conv3x3 weights, f63 - * this is done in op initialization or creation, only do once - * dout = G * g * GT, where G is the transform coeff, g is the input weights - * @param dout - * @param din - * @param ch_out - * @param ch_in - * @param work_space - */ -void winograd_transform_weights( - void* dout, const void* din, int ch_out, int ch_in, void* work_space) { - const float coeff[8][3] = {{1.0f, 0.0f, 0.0f}, - {-2.0f / 9, -2.0f / 9, -2.0f / 9}, - {-2.0f / 9, 2.0f / 9, -2.0f / 9}, - {1.0f / 90, 1.0f / 45, 2.0f / 45}, - {1.0f / 90, -1.0f / 45, 2.0f / 45}, - {32.0f / 45, 16.0f / 45, 8.0f / 45}, - {32.0f / 45, -16.0f / 45, 8.0f / 45}, - {0.0f, 0.0f, 1.0f}}; - - float* ptr_out = static_cast(work_space); - - for (int i = 0; i < ch_out; i++) { - for (int j = 0; j < ch_in; j++) { - const float* kernel0 = - static_cast(din) + (i * ch_in + j) * 9; - float* 
ptr_channel = ptr_out + (i * ch_in + j) * 64; - - //! transform kernel, transposed - const float* k0 = kernel0; - const float* k1 = kernel0 + 3; - const float* k2 = kernel0 + 6; - - //! h - float tmp[8][3]; - for (int i = 0; i < 8; i++) { - tmp[i][0] = - k0[0] * coeff[i][0] + k0[1] * coeff[i][1] + k0[2] * coeff[i][2]; - tmp[i][1] = - k1[0] * coeff[i][0] + k1[1] * coeff[i][1] + k1[2] * coeff[i][2]; - tmp[i][2] = - k2[0] * coeff[i][0] + k2[1] * coeff[i][1] + k2[2] * coeff[i][2]; - } - - //! v - for (int j = 0; j < 8; j++) { - float* tmpp = &tmp[j][0]; - for (int i = 0; i < 8; i++) { - ptr_channel[j * 8 + i] = tmpp[0] * coeff[i][0] + - tmpp[1] * coeff[i][1] + - tmpp[2] * coeff[i][2]; - } - } - } - } - transpose(static_cast(dout), ptr_out, 64, ch_out * ch_in); -} - -/** - * \brief winograd conv, transform input, f6x3 - * dout = BT * d * B, whrer B is the transform - * BT = 1 0 -21/4 0 21/4 0 -1 0 - * 0 1 1 -17/4 -17/4 1 1 0 - * 0 -1 1 17/4 -17/4 -1 1 0 - * 0 1/2 1/4 -5/2 -5/4 2 1 0 - * 0 -1/2 1/4 5/2 -5/4 -2 1 0 - * 0 2 4 -5/2 -5 1/2 1 0 - * 0 -2 4 5/2 -5 -1/2 1 0 - * 0 -1 0 21/4 0 -21/4 0 1 - * @param dout - * @param din - */ -void transform_input_f6x6(float* dout, const float* din) { - float tmp[8][8]; - //! BT * d - for (int m = 0; m < 8; m++) { - tmp[0][m] = din[0] - din[6] + (din[4] - din[2]) * 5.25f; - tmp[7][m] = din[7] - din[1] + (din[3] - din[5]) * 5.25f; - - float tmp12a = din[2] + din[6] - din[4] * 4.25f; - float tmp12b = din[1] + din[5] - din[3] * 4.25f; - - tmp[1][m] = tmp12a + tmp12b; - tmp[2][m] = tmp12a - tmp12b; - - float tmp34a = din[6] + din[2] * 0.25f - din[4] * 1.25f; - float tmp34b = din[1] * 0.5f - din[3] * 2.5f + din[5] * 2.f; - - tmp[3][m] = tmp34a + tmp34b; - tmp[4][m] = tmp34a - tmp34b; - - float tmp56a = din[6] + (din[2] - din[4] * 1.25f) * 4.f; - float tmp56b = din[1] * 2.f - din[3] * 2.5f + din[5] * 0.5f; - - tmp[5][m] = tmp56a + tmp56b; - tmp[6][m] = tmp56a - tmp56b; - - din += 8; - } - - for (int m = 0; m < 8; m++) { - const float* tmp0 = tmp[m]; - - dout[0] = tmp0[0] - tmp0[6] + (tmp0[4] - tmp0[2]) * 5.25f; - dout[7] = tmp0[7] - tmp0[1] + (tmp0[3] - tmp0[5]) * 5.25f; - - float tmp12a = tmp0[2] + tmp0[6] - tmp0[4] * 4.25f; - float tmp12b = tmp0[1] + tmp0[5] - tmp0[3] * 4.25f; - - dout[1] = tmp12a + tmp12b; - dout[2] = tmp12a - tmp12b; - - float tmp34a = tmp0[6] + tmp0[2] * 0.25f - tmp0[4] * 1.25f; - float tmp34b = tmp0[1] * 0.5f - tmp0[3] * 2.5f + tmp0[5] * 2.f; - - dout[3] = tmp34a + tmp34b; - dout[4] = tmp34a - tmp34b; - - float tmp56a = tmp0[6] + (tmp0[2] - tmp0[4] * 1.25f) * 4.f; - float tmp56b = tmp0[1] * 2.f - tmp0[3] * 2.5f + tmp0[5] * 0.5f; - - dout[5] = tmp56a + tmp56b; - dout[6] = tmp56a - tmp56b; - - dout += 8; - } -} - -/** - * \brief winograd conv, transform output, f63 - * out = AT * din * A - * AT = 1 1 1 1 1 1 1 0 - * 0 1 -1 2 -2 1/2 -1/2 0 - * 0 1 1 4 4 1/4 1/4 0 - * 0 1 -1 8 -8 1/8 -1/8 0 - * 0 1 1 16 16 1/16 1/16 0 - * 0 1 -1 32 -32 1/32 -1/32 1 - * @param output - * @param din - * @param bias - */ -void transform_output_f6x6(float* output, const float* din, float bias) { - float tmp[6][8]; - for (int m = 0; m < 8; m++) { - float tmp024a = din[1] + din[2]; - float tmp135a = din[1] - din[2]; - - float tmp024b = din[3] + din[4]; - float tmp135b = din[3] - din[4]; - - float tmp024c = din[5] + din[6]; - float tmp135c = din[5] - din[6]; - - tmp[0][m] = din[0] + tmp024a + tmp024b + tmp024c; - tmp[2][m] = tmp024a + tmp024b * 4 + tmp024c * 0.25f; - tmp[4][m] = tmp024a + tmp024b * 16 + tmp024c * 0.0625f; - - tmp[1][m] = tmp135a + tmp135b * 2 + tmp135c 
* 0.5f; - tmp[3][m] = tmp135a + tmp135b * 8 + tmp135c * 0.125f; - tmp[5][m] = din[7] + tmp135a + tmp135b * 32 + tmp135c * 0.03125f; - - din += 8; - } - - for (int m = 0; m < 6; m++) { - const float* tmp0 = tmp[m]; - - float tmp024a = tmp0[1] + tmp0[2]; - float tmp135a = tmp0[1] - tmp0[2]; - - float tmp024b = tmp0[3] + tmp0[4]; - float tmp135b = tmp0[3] - tmp0[4]; - - float tmp024c = tmp0[5] + tmp0[6]; - float tmp135c = tmp0[5] - tmp0[6]; - - output[0] = bias + tmp0[0] + tmp024a + tmp024b + tmp024c; - output[2] = bias + tmp024a + tmp024b * 4 + tmp024c * 0.25f; - output[4] = bias + tmp024a + tmp024b * 16 + tmp024c * 0.0625f; - - output[1] = bias + tmp135a + tmp135b * 2 + tmp135c * 0.5f; - output[3] = bias + tmp135a + tmp135b * 8 + tmp135c * 0.125f; - output[5] = bias + tmp0[7] + tmp135a + tmp135b * 32 + tmp135c * 0.03125f; - - output += 6; - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/decode_bboxes.cc b/lite/backends/arm/math/decode_bboxes.cc deleted file mode 100644 index 12ee42ebb3..0000000000 --- a/lite/backends/arm/math/decode_bboxes.cc +++ /dev/null @@ -1,651 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
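Aside: the deleted decode_bboxes.cc that follows vectorizes four SSD-style box-decoding variants (corner, center_size, and corner_size, each with or without the prior-box variances applied to the regressed offsets). As a scalar reference for the NEON kernels below, here is a minimal sketch of the center_size path that applies the variances; it is not part of the patch, and the helper name is illustrative only.

#include <cmath>

// Decode one box: prior is (xmin, ymin, xmax, ymax), loc holds the
// regressed offsets (dx, dy, dw, dh), var holds the per-coordinate
// variances that scale the offsets before decoding.
static void decode_center_size_ref(const float* loc, const float* prior,
                                   const float* var, float* bbox) {
  float pw = prior[2] - prior[0];            // prior width
  float ph = prior[3] - prior[1];            // prior height
  float pcx = 0.5f * (prior[0] + prior[2]);  // prior center x
  float pcy = 0.5f * (prior[1] + prior[3]);  // prior center y
  float cx = var[0] * loc[0] * pw + pcx;     // decoded center
  float cy = var[1] * loc[1] * ph + pcy;
  float w = std::exp(var[2] * loc[2]) * pw;  // decoded size
  float h = std::exp(var[3] * loc[3]) * ph;
  bbox[0] = cx - 0.5f * w;  // back to corner form
  bbox[1] = cy - 0.5f * h;
  bbox[2] = cx + 0.5f * w;
  bbox[3] = cy + 0.5f * h;
}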
-
-#include "lite/backends/arm/math/decode_bboxes.h"
-#include "lite/backends/arm/math/funcs.h"
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-template <typename T>
-void decode_bbox_corner_variance_kernel(const int batch_num,
-                                        const T* loc_data,
-                                        const T* prior_data,
-                                        const T* variance,
-                                        const int num_priors,
-                                        const bool share_location,
-                                        const int num_loc_classes,
-                                        const int background_label_id,
-                                        T* bbox_data);
-
-template <typename T>
-void decode_bbox_corner_no_variance_kernel(const int batch_num,
-                                           const T* loc_data,
-                                           const T* prior_data,
-                                           const T* variance,
-                                           const int num_priors,
-                                           const bool share_location,
-                                           const int num_loc_classes,
-                                           const int background_label_id,
-                                           T* bbox_data);
-
-template <typename T>
-void decode_bbox_center_variance_kernel(const int batch_num,
-                                        const T* loc_data,
-                                        const T* prior_data,
-                                        const T* variance,
-                                        const int num_priors,
-                                        const bool share_location,
-                                        const int num_loc_classes,
-                                        const int background_label_id,
-                                        T* bbox_data);
-
-template <typename T>
-void decode_bbox_center_no_variance_kernel(const int batch_num,
-                                           const float* loc_data,
-                                           const float* prior_data,
-                                           const float* variance,
-                                           const int num_priors,
-                                           const bool share_location,
-                                           const int num_loc_classes,
-                                           const int background_label_id,
-                                           float* bbox_data);
-
-template <typename T>
-void decode_bbox_corner_size_variance_kernel(const int batch_num,
-                                             const T* loc_data,
-                                             const T* prior_data,
-                                             const T* variance,
-                                             const int num_priors,
-                                             const bool share_location,
-                                             const int num_loc_classes,
-                                             const int background_label_id,
-                                             T* bbox_data);
-
-template <typename T>
-void decode_bbox_corner_size_no_variance_kernel(const int batch_num,
-                                                const T* loc_data,
-                                                const T* prior_data,
-                                                const T* variance,
-                                                const int num_priors,
-                                                const bool share_location,
-                                                const int num_loc_classes,
-                                                const int background_label_id,
-                                                T* bbox_data);
-
-template <>
-void decode_bbox_corner_variance_kernel(const int batch_num,
-                                        const float* loc_data,
-                                        const float* prior_data,
-                                        const float* variance,
-                                        const int num_priors,
-                                        const bool share_location,
-                                        const int num_loc_classes,
-                                        const int background_label_id,
-                                        float* bbox_data) {
-  if (!share_location) {
-    CHECK_EQ(share_location, true)
-        << "ERROR: decode boxes without share_location is unimplemented\n";
-    return;
-  }
-
-  int cnt = num_priors / 4;
-  int len_batch = num_priors * 4;
-
-  for (int n = 0; n < batch_num; ++n) {
-    const float* ptr_loc_batch = loc_data + n * len_batch;
-    float* ptr_bbox_batch = bbox_data + n * len_batch;
-#pragma omp parallel for
-    for (int i = 0; i < cnt; ++i) {
-      int idx = i * 16;
-      const float* ptr_loc = ptr_loc_batch + idx;
-      const float* ptr_prior = prior_data + idx;
-      float* ptr_bbox = ptr_bbox_batch + idx;
-
-      float32x4_t vloc1 = vld1q_f32(ptr_loc);
-      float32x4_t vloc2 = vld1q_f32(ptr_loc + 4);
-      float32x4_t vloc3 = vld1q_f32(ptr_loc + 8);
-      float32x4_t vloc4 = vld1q_f32(ptr_loc + 12);
-
-      float32x4_t vprior1 = vld1q_f32(ptr_prior);
-      float32x4_t vprior2 = vld1q_f32(ptr_prior + 4);
-      float32x4_t vprior3 = vld1q_f32(ptr_prior + 8);
-      float32x4_t vprior4 = vld1q_f32(ptr_prior + 12);
-
-      vst1q_f32(ptr_bbox, vaddq_f32(vloc1, vprior1));
-      vst1q_f32(ptr_bbox + 4, vaddq_f32(vloc2, vprior2));
-      vst1q_f32(ptr_bbox + 8, vaddq_f32(vloc3, vprior3));
-      vst1q_f32(ptr_bbox + 12, vaddq_f32(vloc4, vprior4));
-    }
-#pragma omp parallel for
-    for (int i = cnt * 4; i < num_priors; i++) {
-      int idx = i * 4;
-      float32x4_t vloc = vld1q_f32(ptr_loc_batch + idx);
-      float32x4_t vprior = vld1q_f32(prior_data + idx);
-      vst1q_f32(ptr_bbox_batch + idx, vaddq_f32(vloc, vprior));
-
} - } -} - -template <> -void decode_bbox_corner_no_variance_kernel(const int batch_num, - const float* loc_data, - const float* prior_data, - const float* variance, - const int num_priors, - const bool share_location, - const int num_loc_classes, - const int background_label_id, - float* bbox_data) { - if (!share_location) { - CHECK_EQ(share_location, true) - << "ERROR: decode boxes without share_location is unimplemented\n"; - return; - } - - int cnt = num_priors / 4; - int len_batch = num_priors * 4; - - for (int n = 0; n < batch_num; ++n) { - const float* ptr_loc_batch = loc_data + n * len_batch; - float* ptr_bbox_batch = bbox_data + n * len_batch; - -#pragma omp parallel for - for (int i = 0; i < cnt; ++i) { - int idx = i * 16; - const float* ptr_loc = ptr_loc_batch + idx; - const float* ptr_prior = prior_data + idx; - const float* ptr_var = variance + idx; - float* ptr_bbox = ptr_bbox_batch + idx; - - float32x4_t vloc1 = vld1q_f32(ptr_loc); - float32x4_t vprior1 = vld1q_f32(ptr_prior); - float32x4_t vvar1 = vld1q_f32(ptr_var); - float32x4_t vout1 = vmulq_f32(vloc1, vvar1); - - float32x4_t vloc2 = vld1q_f32(ptr_loc + 4); - float32x4_t vprior2 = vld1q_f32(ptr_prior + 4); - float32x4_t vvar2 = vld1q_f32(ptr_var + 4); - float32x4_t vout2 = vmulq_f32(vloc2, vvar2); - - float32x4_t vloc3 = vld1q_f32(ptr_loc + 8); - float32x4_t vprior3 = vld1q_f32(ptr_prior + 8); - float32x4_t vvar3 = vld1q_f32(ptr_var + 8); - float32x4_t vout3 = vmulq_f32(vloc3, vvar3); - - float32x4_t vloc4 = vld1q_f32(ptr_loc + 12); - float32x4_t vprior4 = vld1q_f32(ptr_prior + 12); - float32x4_t vvar4 = vld1q_f32(ptr_var + 12); - float32x4_t vout4 = vmulq_f32(vloc4, vvar4); - - vst1q_f32(ptr_bbox, vaddq_f32(vout1, vprior1)); - vst1q_f32(ptr_bbox + 4, vaddq_f32(vout2, vprior2)); - vst1q_f32(ptr_bbox + 8, vaddq_f32(vout3, vprior3)); - vst1q_f32(ptr_bbox + 12, vaddq_f32(vout4, vprior4)); - } - - for (int i = cnt * 4; i < num_priors; i++) { - int idx = i * 4; - float32x4_t vloc = vld1q_f32(ptr_loc_batch + idx); - float32x4_t vprior = vld1q_f32(prior_data + idx); - float32x4_t vvar = vld1q_f32(variance + idx); - float32x4_t vout = vmulq_f32(vloc, vvar); - vst1q_f32(ptr_bbox_batch + idx, vaddq_f32(vout, vprior)); - } - } -} - -template <> -void decode_bbox_center_variance_kernel(const int batch_num, - const float* loc_data, - const float* prior_data, - const float* variance, - const int num_priors, - const bool share_location, - const int num_loc_classes, - const int background_label_id, - float* bbox_data) { - if (!share_location) { - CHECK_EQ(share_location, true) - << "ERROR: decode boxes without share_location is unimplemented\n"; - return; - } - - int cnt = num_priors / 4; - //! vprior 0: xmin, 1: ymin, 2: xmax, 3: ymax - //! vloc 0: xmin, 1: ymin, 2: xmax, 3: ymax - //! 
vvar - float32x4_t vhalf = vdupq_n_f32(0.5f); - - int len_batch = num_priors * 4; - - for (int n = 0; n < batch_num; ++n) { - const float* ptr_loc_batch = loc_data + n * len_batch; - float* ptr_bbox_batch = bbox_data + n * len_batch; - -#pragma omp parallel for - for (int i = 0; i < cnt; ++i) { - int idx = i * 16; - const float* ptr_loc = ptr_loc_batch + idx; - const float* ptr_prior = prior_data + idx; - float* ptr_bbox = ptr_bbox_batch + idx; - - float32x4x4_t vprior = vld4q_f32(ptr_prior); - float32x4x4_t vloc = vld4q_f32(ptr_loc); - float32x4_t vprior_width = vsubq_f32(vprior.val[2], vprior.val[0]); - float32x4_t vprior_height = vsubq_f32(vprior.val[3], vprior.val[1]); - float32x4_t vprior_cx = - vmulq_f32(vaddq_f32(vprior.val[0], vprior.val[2]), vhalf); - float32x4_t vprior_cy = - vmulq_f32(vaddq_f32(vprior.val[1], vprior.val[3]), vhalf); - - float32x4_t vdec_bbx_cx = - vaddq_f32(vmulq_f32(vloc.val[0], vprior_width), vprior_cx); - float32x4_t vdec_bbx_cy = - vaddq_f32(vmulq_f32(vloc.val[1], vprior_height), vprior_cy); - float32x4_t vdec_bbx_w = exp_ps(vloc.val[2]); - float32x4_t vdec_bbx_h = exp_ps(vloc.val[3]); - vprior_width = vmulq_f32(vprior_width, vhalf); - vprior_height = vmulq_f32(vprior_height, vhalf); - vdec_bbx_w = vmulq_f32(vdec_bbx_w, vprior_width); - vdec_bbx_h = vmulq_f32(vdec_bbx_h, vprior_height); - - vloc.val[0] = vsubq_f32(vdec_bbx_cx, vdec_bbx_w); - vloc.val[1] = vsubq_f32(vdec_bbx_cy, vdec_bbx_h); - vloc.val[2] = vaddq_f32(vdec_bbx_cx, vdec_bbx_w); - vloc.val[3] = vaddq_f32(vdec_bbx_cy, vdec_bbx_h); - - vst4q_f32(ptr_bbox, vloc); - } -#pragma omp parallel for - for (int i = cnt * 4; i < num_priors; i++) { - int idx = i * 4; - float p_xmin = prior_data[idx]; - float p_ymin = prior_data[idx + 1]; - float p_xmax = prior_data[idx + 2]; - float p_ymax = prior_data[idx + 3]; - float prior_width = p_xmax - p_xmin; - float prior_height = p_ymax - p_ymin; - float prior_center_x = (p_xmin + p_xmax) / 2.f; - float prior_center_y = (p_ymin + p_ymax) / 2.f; - - float xmin = ptr_loc_batch[idx]; - float ymin = ptr_loc_batch[idx + 1]; - float xmax = ptr_loc_batch[idx + 2]; - float ymax = ptr_loc_batch[idx + 3]; - - //! variance is encoded in target, we simply need to retore the offset - //! predictions. - float decode_bbox_center_x = xmin * prior_width + prior_center_x; - float decode_bbox_center_y = ymin * prior_height + prior_center_y; - float decode_bbox_width = expf(xmax) * prior_width; - float decode_bbox_height = expf(ymax) * prior_height; - - ptr_bbox_batch[idx] = decode_bbox_center_x - decode_bbox_width / 2.f; - ptr_bbox_batch[idx + 1] = decode_bbox_center_y - decode_bbox_height / 2.f; - ptr_bbox_batch[idx + 2] = decode_bbox_center_x + decode_bbox_width / 2.f; - ptr_bbox_batch[idx + 3] = decode_bbox_center_y + decode_bbox_height / 2.f; - } - } -} - -template <> -void decode_bbox_center_no_variance_kernel(const int batch_num, - const float* loc_data, - const float* prior_data, - const float* variance, - const int num_priors, - const bool share_location, - const int num_loc_classes, - const int background_label_id, - float* bbox_data) { - if (!share_location) { - CHECK_EQ(share_location, true) - << "ERROR: decode boxes without share_location is unimplemented\n"; - return; - } - - int cnt = num_priors / 4; - //! vprior 0: xmin, 1: ymin, 2: xmax, 3: ymax - //! vloc 0: xmin, 1: ymin, 2: xmax, 3: ymax - //! 
vvar - float32x4_t vhalf = vdupq_n_f32(0.5f); - - int len_batch = num_priors * 4; - - for (int n = 0; n < batch_num; ++n) { - const float* ptr_loc_batch = loc_data + n * len_batch; - float* ptr_bbox_batch = bbox_data + n * len_batch; - -#pragma omp parallel for - for (int i = 0; i < cnt; ++i) { - int idx = i * 16; - - const float* ptr_loc = ptr_loc_batch + idx; - const float* ptr_prior = prior_data + idx; - const float* ptr_var = variance + idx; - float* ptr_bbox = ptr_bbox_batch + idx; - - float32x4x4_t vprior = vld4q_f32(ptr_prior); - float32x4x4_t vloc = vld4q_f32(ptr_loc); - float32x4x4_t vvar = vld4q_f32(ptr_var); - float32x4_t vprior_width = vsubq_f32(vprior.val[2], vprior.val[0]); - float32x4_t vprior_height = vsubq_f32(vprior.val[3], vprior.val[1]); - float32x4_t vprior_cx = - vmulq_f32(vaddq_f32(vprior.val[0], vprior.val[2]), vhalf); - float32x4_t vprior_cy = - vmulq_f32(vaddq_f32(vprior.val[1], vprior.val[3]), vhalf); - - vloc.val[0] = vmulq_f32(vloc.val[0], vvar.val[0]); - vloc.val[1] = vmulq_f32(vloc.val[1], vvar.val[1]); - vloc.val[2] = vmulq_f32(vloc.val[2], vvar.val[2]); - vloc.val[3] = vmulq_f32(vloc.val[3], vvar.val[3]); - - float32x4_t vdec_bbx_cx = - vaddq_f32(vmulq_f32(vloc.val[0], vprior_width), vprior_cx); - float32x4_t vdec_bbx_cy = - vaddq_f32(vmulq_f32(vloc.val[1], vprior_height), vprior_cy); - float32x4_t vdec_bbx_w = exp_ps(vloc.val[2]); - float32x4_t vdec_bbx_h = exp_ps(vloc.val[3]); - vprior_width = vmulq_f32(vprior_width, vhalf); - vprior_height = vmulq_f32(vprior_height, vhalf); - vdec_bbx_w = vmulq_f32(vdec_bbx_w, vprior_width); - vdec_bbx_h = vmulq_f32(vdec_bbx_h, vprior_height); - - vloc.val[0] = vsubq_f32(vdec_bbx_cx, vdec_bbx_w); - vloc.val[1] = vsubq_f32(vdec_bbx_cy, vdec_bbx_h); - vloc.val[2] = vaddq_f32(vdec_bbx_cx, vdec_bbx_w); - vloc.val[3] = vaddq_f32(vdec_bbx_cy, vdec_bbx_h); - - vst4q_f32(ptr_bbox, vloc); - } - -#pragma omp parallel for - for (int i = cnt * 4; i < num_priors; i++) { - int idx = i * 4; - float p_xmin = prior_data[idx]; - float p_ymin = prior_data[idx + 1]; - float p_xmax = prior_data[idx + 2]; - float p_ymax = prior_data[idx + 3]; - float prior_width = p_xmax - p_xmin; - float prior_height = p_ymax - p_ymin; - float prior_center_x = (p_xmin + p_xmax) / 2.f; - float prior_center_y = (p_ymin + p_ymax) / 2.f; - - float xmin = ptr_loc_batch[idx]; - float ymin = ptr_loc_batch[idx + 1]; - float xmax = ptr_loc_batch[idx + 2]; - float ymax = ptr_loc_batch[idx + 3]; - - //! variance is encoded in target, we simply need to retore the offset - //! predictions. 
- float decode_bbox_center_x = - variance[idx] * xmin * prior_width + prior_center_x; - float decode_bbox_center_y = - variance[idx + 1] * ymin * prior_height + prior_center_y; - float decode_bbox_width = expf(variance[idx + 2] * xmax) * prior_width; - float decode_bbox_height = expf(variance[idx + 3] * ymax) * prior_height; - - ptr_bbox_batch[idx] = decode_bbox_center_x - decode_bbox_width / 2.f; - ptr_bbox_batch[idx + 1] = decode_bbox_center_y - decode_bbox_height / 2.f; - ptr_bbox_batch[idx + 2] = decode_bbox_center_x + decode_bbox_width / 2.f; - ptr_bbox_batch[idx + 3] = decode_bbox_center_y + decode_bbox_height / 2.f; - } - } -} - -template <> -void decode_bbox_corner_size_variance_kernel( - const int batch_num, - const float* loc_data, - const float* prior_data, - const float* variance, - const int num_priors, - const bool share_location, - const int num_loc_classes, - const int background_label_id, - float* bbox_data) { - if (!share_location) { - CHECK_EQ(share_location, true) - << "ERROR: decode boxes without share_location is unimplemented\n"; - return; - } - - int cnt = num_priors / 4; - //! vprior 0: xmin, 1: ymin, 2: xmax, 3: ymax - //! bbx - - int len_batch = num_priors * 4; - - for (int n = 0; n < batch_num; ++n) { - const float* ptr_loc_batch = loc_data + n * len_batch; - float* ptr_bbox_batch = bbox_data + n * len_batch; - -#pragma omp parallel for - for (int i = 0; i < cnt; ++i) { - int idx = i * 16; - - const float* ptr_loc = ptr_loc_batch + idx; - const float* ptr_prior = prior_data + idx; - const float* ptr_var = variance + idx; - float* ptr_bbox = ptr_bbox_batch + idx; - - float32x4x4_t vprior = vld4q_f32(ptr_prior); - float32x4x4_t vloc = vld4q_f32(ptr_loc); - - float32x4_t vprior_width = vsubq_f32(vprior.val[2], vprior.val[0]); - float32x4_t vprior_height = vsubq_f32(vprior.val[3], vprior.val[1]); - - float32x4x4_t vbbx; - vbbx.val[0] = vmulq_f32(vloc.val[0], vprior_width); - vbbx.val[1] = vmulq_f32(vloc.val[1], vprior_height); - vbbx.val[2] = vmulq_f32(vloc.val[2], vprior_width); - vbbx.val[3] = vmulq_f32(vloc.val[3], vprior_height); - - vbbx.val[0] = vaddq_f32(vprior.val[0], vbbx.val[0]); - vbbx.val[1] = vaddq_f32(vprior.val[1], vbbx.val[1]); - vbbx.val[2] = vaddq_f32(vprior.val[2], vbbx.val[2]); - vbbx.val[3] = vaddq_f32(vprior.val[3], vbbx.val[3]); - - vst4q_f32(ptr_bbox, vbbx); - } - -#pragma omp parallel for - for (int i = cnt * 4; i < num_priors; i++) { - int idx = i * 4; - float p_xmin = prior_data[idx]; - float p_ymin = prior_data[idx + 1]; - float p_xmax = prior_data[idx + 2]; - float p_ymax = prior_data[idx + 3]; - float prior_width = p_xmax - p_xmin; - float prior_height = p_ymax - p_ymin; - - ptr_bbox_batch[idx] = p_xmin + ptr_loc_batch[idx] * prior_width; - ptr_bbox_batch[idx + 1] = p_ymin + ptr_loc_batch[idx + 1] * prior_height; - ptr_bbox_batch[idx + 2] = p_xmax + ptr_loc_batch[idx + 2] * prior_width; - ptr_bbox_batch[idx + 3] = p_ymax + ptr_loc_batch[idx + 3] * prior_height; - } - } -} - -template <> -void decode_bbox_corner_size_no_variance_kernel( - const int batch_num, - const float* loc_data, - const float* prior_data, - const float* variance, - const int num_priors, - const bool share_location, - const int num_loc_classes, - const int background_label_id, - float* bbox_data) { - if (!share_location) { - CHECK_EQ(share_location, true) - << "ERROR: decode boxes without share_location is unimplemented\n"; - return; - } - - int cnt = num_priors / 4; - //! vprior 0: xmin, 1: ymin, 2: xmax, 3: ymax - //! 
bbx - - int len_batch = num_priors * 4; - - for (int n = 0; n < batch_num; ++n) { - const float* ptr_loc_batch = loc_data + n * len_batch; - float* ptr_bbox_batch = bbox_data + n * len_batch; - -#pragma omp parallel for - for (int i = 0; i < cnt; ++i) { - int idx = i * 16; - - const float* ptr_loc = ptr_loc_batch + idx; - const float* ptr_prior = prior_data + idx; - const float* ptr_var = variance + idx; - float* ptr_bbox = ptr_bbox_batch + idx; - - float32x4x4_t vprior = vld4q_f32(ptr_prior); - float32x4x4_t vloc = vld4q_f32(ptr_loc); - - float32x4_t vprior_width = vsubq_f32(vprior.val[2], vprior.val[0]); - float32x4_t vprior_height = vsubq_f32(vprior.val[3], vprior.val[1]); - - float32x4x4_t vbbx; - vbbx.val[0] = vmulq_f32(vloc.val[0], vprior_width); - vbbx.val[1] = vmulq_f32(vloc.val[1], vprior_height); - vbbx.val[2] = vmulq_f32(vloc.val[2], vprior_width); - vbbx.val[3] = vmulq_f32(vloc.val[3], vprior_height); - - vloc = vld4q_f32(ptr_var); - vbbx.val[0] = vmulq_f32(vbbx.val[0], vloc.val[0]); - vbbx.val[1] = vmulq_f32(vbbx.val[1], vloc.val[1]); - vbbx.val[2] = vmulq_f32(vbbx.val[2], vloc.val[2]); - vbbx.val[3] = vmulq_f32(vbbx.val[3], vloc.val[3]); - - vbbx.val[0] = vaddq_f32(vprior.val[0], vbbx.val[0]); - vbbx.val[1] = vaddq_f32(vprior.val[1], vbbx.val[1]); - vbbx.val[2] = vaddq_f32(vprior.val[2], vbbx.val[2]); - vbbx.val[3] = vaddq_f32(vprior.val[3], vbbx.val[3]); - - vst4q_f32(ptr_bbox, vbbx); - } -#pragma omp parallel for - for (int i = cnt * 4; i < num_priors; i++) { - int idx = i * 4; - float p_xmin = prior_data[idx]; - float p_ymin = prior_data[idx + 1]; - float p_xmax = prior_data[idx + 2]; - float p_ymax = prior_data[idx + 3]; - float prior_width = p_xmax - p_xmin; - float prior_height = p_ymax - p_ymin; - - ptr_bbox_batch[idx] = - p_xmin + ptr_loc_batch[idx] * variance[idx] * prior_width; - ptr_bbox_batch[idx + 1] = - p_ymin + ptr_loc_batch[idx + 1] * variance[idx + 1] * prior_height; - ptr_bbox_batch[idx + 2] = - p_xmax + ptr_loc_batch[idx + 2] * variance[idx + 2] * prior_width; - ptr_bbox_batch[idx + 3] = - p_ymax + ptr_loc_batch[idx + 3] * variance[idx + 3] * prior_height; - } - } -} - -template <> -void decode_bboxes(const int batch_num, - const float* loc_data, - const float* prior_data, - const std::string code_type, - const bool variance_encoded_in_target, - const int num_priors, - const bool share_location, - const int num_loc_classes, - const int background_label_id, - float* bbox_data) { - const float* variance_data = prior_data + 4 * num_priors; - if (code_type == "corner") { - if (variance_encoded_in_target) { - decode_bbox_corner_variance_kernel(batch_num, - loc_data, - prior_data, - variance_data, - num_priors, - share_location, - num_loc_classes, - background_label_id, - bbox_data); - } else { - decode_bbox_corner_no_variance_kernel(batch_num, - loc_data, - prior_data, - variance_data, - num_priors, - share_location, - num_loc_classes, - background_label_id, - bbox_data); - } - } else if (code_type == "center_size") { - if (variance_encoded_in_target) { - decode_bbox_center_variance_kernel(batch_num, - loc_data, - prior_data, - variance_data, - num_priors, - share_location, - num_loc_classes, - background_label_id, - bbox_data); - } else { - decode_bbox_center_no_variance_kernel(batch_num, - loc_data, - prior_data, - variance_data, - num_priors, - share_location, - num_loc_classes, - background_label_id, - bbox_data); - } - } else if (code_type == "corner_size") { - if (variance_encoded_in_target) { - decode_bbox_corner_size_variance_kernel(batch_num, - 
loc_data,
-                                              prior_data,
-                                              variance_data,
-                                              num_priors,
-                                              share_location,
-                                              num_loc_classes,
-                                              background_label_id,
-                                              bbox_data);
-    } else {
-      decode_bbox_corner_size_no_variance_kernel(batch_num,
-                                                 loc_data,
-                                                 prior_data,
-                                                 variance_data,
-                                                 num_priors,
-                                                 share_location,
-                                                 num_loc_classes,
-                                                 background_label_id,
-                                                 bbox_data);
-    }
-  }
-}
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/decode_bboxes.h b/lite/backends/arm/math/decode_bboxes.h
deleted file mode 100644
index f18bfe6420..0000000000
--- a/lite/backends/arm/math/decode_bboxes.h
+++ /dev/null
@@ -1,39 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-template <typename T>
-void decode_bboxes(const int batch_num,
-                   const T* loc_data,
-                   const T* prior_data,
-                   const std::string code_type,
-                   const bool variance_encoded_in_target,
-                   const int num_priors,
-                   const bool share_location,
-                   const int num_loc_classes,
-                   const int background_label_id,
-                   T* bbox_data);
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/dot_toolchain_support.h b/lite/backends/arm/math/dot_toolchain_support.h
deleted file mode 100644
index 8342ffee19..0000000000
--- a/lite/backends/arm/math/dot_toolchain_support.h
+++ /dev/null
@@ -1,196 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// This file is modified according to
-// https://github.com/ARM-software/ComputeLibrary
-// * Copyright (c) 2017-2018 ARM Limited.
-// *
-// * SPDX-License-Identifier: MIT
-// *
-// * Permission is hereby granted, free of charge, to any person obtaining a
-// copy
-// * of this software and associated documentation files (the "Software"), to
-// * deal in the Software without restriction, including without limitation the
-// * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
-// * sell copies of the Software, and to permit persons to whom the Software is
-// * furnished to do so, subject to the following conditions:
-// *
-// * The above copyright notice and this permission notice shall be included in
-// all
-// * copies or substantial portions of the Software.
-// * -// * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, -// * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE -// * SOFTWARE. - -#pragma once - -#define _DECLARE_SDOT_ELEMENT \ - ".altmacro\n" \ - ".macro sdot opd:req, opn:req, opm:req\n" \ - "local vd, vn, vm, h, l\n" \ - ".irp " \ - "reg,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25," \ - "26,27,28,29,30,31\n" \ - ".ifeqs \"\\opd\",\"v\\reg\\.4s\"\n" \ - ".set vd,\\reg\n" \ - ".endif\n" \ - ".ifeqs \"\\opn\",\"v\\reg\\.16b\"\n" \ - ".set vn,\\reg\n" \ - ".endif\n" \ - ".irp idx,0,1,2,3\n" \ - ".ifeqs \"\\opm\",\"v\\reg\\.4b[\\idx\\]\"\n" \ - ".set vm,\\reg\n" \ - ".set h,\\idx / 2\n" \ - ".set l,\\idx %% 2\n" \ - ".endif\n" \ - ".endr\n" \ - ".endr\n" \ - ".ifndef vd\n" \ - ".error \"Bad operand \\opd\"\n" \ - ".exitm\n" \ - ".endif\n" \ - ".ifndef vn\n" \ - ".error \"Bad operand \\opn\"\n" \ - ".exitm\n" \ - ".endif\n" \ - ".ifndef vm\n" \ - ".error \"Bad operand \\opm\"\n" \ - ".exitm\n" \ - ".endif\n" \ - ".ifndef h\n" \ - ".error \"Bad operand \\opm\"\n" \ - ".exitm\n" \ - ".endif\n" \ - ".ifndef l\n" \ - ".error \"Bad operand \\opm\"\n" \ - ".exitm\n" \ - ".endif\n" \ - ".int 0x4f80e000 | vd | (vn << 5) | (vm << 16) | (l << 21) | (h << 11)\n" \ - ".endm\n" - -#define _DECLARE_SDOT_VECTOR \ - ".altmacro\n" \ - ".macro sdot opd:req, opn:req, opm:req\n" \ - "local vd, vn, vm\n" \ - ".irp " \ - "reg,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25," \ - "26,27,28,29,30,31\n" \ - ".ifeqs \"\\opd\",\"v\\reg\\.4s\"\n" \ - ".set vd,\\reg\n" \ - ".endif\n" \ - ".ifeqs \"\\opn\",\"v\\reg\\.16b\"\n" \ - ".set vn,\\reg\n" \ - ".endif\n" \ - ".ifeqs \"\\opm\",\"v\\reg\\.16b\"\n" \ - ".set vm,\\reg\n" \ - ".endif\n" \ - ".endr\n" \ - ".endr\n" \ - ".ifndef vd\n" \ - ".error \"Bad operand \\opd\"\n" \ - ".exitm\n" \ - ".endif\n" \ - ".ifndef vn\n" \ - ".error \"Bad operand \\opn\"\n" \ - ".exitm\n" \ - ".endif\n" \ - ".ifndef vm\n" \ - ".error \"Bad operand \\opm\"\n" \ - ".exitm\n" \ - ".endif\n" \ - ".int 0x4e809400 | vd | (vn << 5) | (vm << 16)\n" \ - ".endm\n" - -#define _DECLARE_SDOT_VECTOR_2s \ - ".altmacro\n" \ - ".macro sdot opd:req, opn:req, opm:req\n" \ - "local vd, vn, vm\n" \ - ".irp " \ - "reg,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25," \ - "26,27,28,29,30,31\n" \ - ".ifeqs \"\\opd\",\"v\\reg\\.2s\"\n" \ - ".set vd,\\reg\n" \ - ".endif\n" \ - ".ifeqs \"\\opn\",\"v\\reg\\.8b\"\n" \ - ".set vn,\\reg\n" \ - ".endif\n" \ - ".ifeqs \"\\opm\",\"v\\reg\\.8b\"\n" \ - ".set vm,\\reg\n" \ - ".endif\n" \ - ".endr\n" \ - ".endr\n" \ - ".ifndef vd\n" \ - ".error \"Bad operand \\opd\"\n" \ - ".exitm\n" \ - ".endif\n" \ - ".ifndef vn\n" \ - ".error \"Bad operand \\opn\"\n" \ - ".exitm\n" \ - ".endif\n" \ - ".ifndef vm\n" \ - ".error \"Bad operand \\opm\"\n" \ - ".exitm\n" \ - ".endif\n" \ - ".int 0x0e809400 | vd | (vn << 5) | (vm << 16)\n" \ - ".endm\n" - -#define _DECLARE_SDOT_ELEMENT_2s \ - ".altmacro\n" \ - ".macro sdot opd:req, opn:req, opm:req\n" \ - "local vd, vn, vm, h, l\n" \ - ".irp " \ - "reg,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25," \ - "26,27,28,29,30,31\n" \ - 
".ifeqs \"\\opd\",\"v\\reg\\.2s\"\n" \ - ".set vd,\\reg\n" \ - ".endif\n" \ - ".ifeqs \"\\opn\",\"v\\reg\\.8b\"\n" \ - ".set vn,\\reg\n" \ - ".endif\n" \ - ".irp idx,0,1,2,3\n" \ - ".ifeqs \"\\opm\",\"v\\reg\\.4b[\\idx\\]\"\n" \ - ".set vm,\\reg\n" \ - ".set h,\\idx / 2\n" \ - ".set l,\\idx %% 2\n" \ - ".endif\n" \ - ".endr\n" \ - ".endr\n" \ - ".ifndef vd\n" \ - ".error \"Bad operand \\opd\"\n" \ - ".exitm\n" \ - ".endif\n" \ - ".ifndef vn\n" \ - ".error \"Bad operand \\opn\"\n" \ - ".exitm\n" \ - ".endif\n" \ - ".ifndef vm\n" \ - ".error \"Bad operand \\opm\"\n" \ - ".exitm\n" \ - ".endif\n" \ - ".ifndef h\n" \ - ".error \"Bad operand \\opm\"\n" \ - ".exitm\n" \ - ".endif\n" \ - ".ifndef l\n" \ - ".error \"Bad operand \\opm\"\n" \ - ".exitm\n" \ - ".endif\n" \ - ".int 0x0f80e000 | vd | (vn << 5) | (vm << 16) | (l << 21) | (h << 11)\n" \ - ".endm\n" diff --git a/lite/backends/arm/math/dropout.cc b/lite/backends/arm/math/dropout.cc deleted file mode 100644 index 406c850ef5..0000000000 --- a/lite/backends/arm/math/dropout.cc +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/arm/math/dropout.h" -#include "lite/backends/arm/math/funcs.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template <> -void dropout_down(const float* din, float* dout, int num, float prob) { - const float scale = 1.0f - prob; - int cnt = num >> 4; - int remain = num % 16; - float32x4_t vscale = vdupq_n_f32(scale); -#pragma omp parallel for - for (int i = 0; i < cnt; i++) { - const float* din_ptr = din + (i << 4); - float* dout_ptr = dout + (i << 4); - - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); - float32x4_t din2 = vld1q_f32(din_ptr + 8); - float32x4_t din3 = vld1q_f32(din_ptr + 12); - - float32x4_t vmul0 = vmulq_f32(din0, vscale); - float32x4_t vmul1 = vmulq_f32(din1, vscale); - float32x4_t vmul2 = vmulq_f32(din2, vscale); - float32x4_t vmul3 = vmulq_f32(din3, vscale); - - vst1q_f32(dout_ptr, vmul0); - vst1q_f32(dout_ptr + 4, vmul1); - vst1q_f32(dout_ptr + 8, vmul2); - vst1q_f32(dout_ptr + 12, vmul3); - } - if (remain > 0) { - const float* din_ptr = din + (cnt << 4); - float* dout_ptr = dout + (cnt << 4); - for (int i = 0; i < remain; i++) { - *dout_ptr = *din_ptr * scale; - dout_ptr++; - din_ptr++; - } - } -} - -template <> -void dropout_up(const float* din, float* dout, int num) { - int cnt = num >> 4; - int remain = num % 16; -#pragma omp parallel for - for (int i = 0; i < cnt; i++) { - const float* din_ptr = din + (i << 4); - float* dout_ptr = dout + (i << 4); - - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); - float32x4_t din2 = vld1q_f32(din_ptr + 8); - float32x4_t din3 = vld1q_f32(din_ptr + 12); - - vst1q_f32(dout_ptr, din0); - vst1q_f32(dout_ptr + 4, din1); - vst1q_f32(dout_ptr + 8, din2); - vst1q_f32(dout_ptr + 12, din3); - } - if (remain > 0) { - const float* din_ptr = 
din + (cnt << 4); - float* dout_ptr = dout + (cnt << 4); - for (int i = 0; i < remain; i++) { - *dout_ptr = *din_ptr; - dout_ptr++; - din_ptr++; - } - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/dropout.h b/lite/backends/arm/math/dropout.h deleted file mode 100644 index df2be016de..0000000000 --- a/lite/backends/arm/math/dropout.h +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template -void dropout_down(const T* din, T* dout, int num, float prob); - -template -void dropout_up(const T* din, T* dout, int num); - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/elementwise.cc b/lite/backends/arm/math/elementwise.cc deleted file mode 100644 index a4c61f9a9d..0000000000 --- a/lite/backends/arm/math/elementwise.cc +++ /dev/null @@ -1,1290 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
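Aside: every kernel in the deleted elementwise.cc below uses the same blocking scheme: process 16 floats per iteration as four float32x4_t NEON vectors, then fall back to scalars for the tail (the broadcast variants add intermediate 8- and 4-wide steps). A minimal sketch of that pattern, not part of the patch, with an illustrative function name:

#include <arm_neon.h>

// Same blocking as the kernels below: 16 floats per step, scalar tail.
static void elementwise_add_sketch(const float* x, const float* y,
                                   float* out, int num) {
  int cnt = num >> 4;     // number of full 16-element blocks
  int remain = num % 16;  // leftover elements
  for (int i = 0; i < cnt; ++i) {
    const float* xp = x + (i << 4);
    const float* yp = y + (i << 4);
    float* op = out + (i << 4);
    // Four 4-lane vector adds cover one 16-element block.
    for (int k = 0; k < 16; k += 4) {
      vst1q_f32(op + k, vaddq_f32(vld1q_f32(xp + k), vld1q_f32(yp + k)));
    }
  }
  // Scalar tail for the last num % 16 elements.
  for (int i = num - remain; i < num; ++i) {
    out[i] = x[i] + y[i];
  }
}

The fused-ReLU variants in the file apply vmaxq_f32 against a zero vector right after the arithmetic, which is why each op appears twice below.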
- -#include "lite/backends/arm/math/elementwise.h" -#include -#include "lite/backends/arm/math/funcs.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template <> -void elementwise_add(const float* dinx, - const float* diny, - float* dout, - int num) { - int cnt = num >> 4; - int remain = num % 16; -#pragma omp parallel for - for (int i = 0; i < cnt; i++) { - const float* dinx_ptr = dinx + (i << 4); - const float* diny_ptr = diny + (i << 4); - float* dout_ptr = dout + (i << 4); - - float32x4_t dinx0 = vld1q_f32(dinx_ptr); - float32x4_t dinx1 = vld1q_f32(dinx_ptr + 4); - float32x4_t dinx2 = vld1q_f32(dinx_ptr + 8); - float32x4_t dinx3 = vld1q_f32(dinx_ptr + 12); - - float32x4_t diny0 = vld1q_f32(diny_ptr); - float32x4_t diny1 = vld1q_f32(diny_ptr + 4); - float32x4_t diny2 = vld1q_f32(diny_ptr + 8); - float32x4_t diny3 = vld1q_f32(diny_ptr + 12); - - dinx0 = vaddq_f32(dinx0, diny0); - dinx1 = vaddq_f32(dinx1, diny1); - dinx2 = vaddq_f32(dinx2, diny2); - dinx3 = vaddq_f32(dinx3, diny3); - - vst1q_f32(dout_ptr, dinx0); - vst1q_f32(dout_ptr + 4, dinx1); - vst1q_f32(dout_ptr + 8, dinx2); - vst1q_f32(dout_ptr + 12, dinx3); - } - if (remain > 0) { - const float* dinx_ptr = dinx + (cnt << 4); - const float* diny_ptr = diny + (cnt << 4); - float* dout_ptr = dout + (cnt << 4); - for (int i = 0; i < remain; i++) { - *dout_ptr = *dinx_ptr + *diny_ptr; - dout_ptr++; - dinx_ptr++; - diny_ptr++; - } - } -} - -template <> -void elementwise_add_relu(const float* dinx, - const float* diny, - float* dout, - int num) { - int cnt = num >> 4; - int remain = num % 16; - float32x4_t vzero = vdupq_n_f32(0.f); -#pragma omp parallel for - for (int i = 0; i < cnt; i++) { - const float* dinx_ptr = dinx + (i << 4); - const float* diny_ptr = diny + (i << 4); - float* dout_ptr = dout + (i << 4); - - float32x4_t dinx0 = vld1q_f32(dinx_ptr); - float32x4_t dinx1 = vld1q_f32(dinx_ptr + 4); - float32x4_t dinx2 = vld1q_f32(dinx_ptr + 8); - float32x4_t dinx3 = vld1q_f32(dinx_ptr + 12); - - float32x4_t diny0 = vld1q_f32(diny_ptr); - float32x4_t diny1 = vld1q_f32(diny_ptr + 4); - float32x4_t diny2 = vld1q_f32(diny_ptr + 8); - float32x4_t diny3 = vld1q_f32(diny_ptr + 12); - - dinx0 = vaddq_f32(dinx0, diny0); - dinx1 = vaddq_f32(dinx1, diny1); - dinx2 = vaddq_f32(dinx2, diny2); - dinx3 = vaddq_f32(dinx3, diny3); - - // relu - dinx0 = vmaxq_f32(dinx0, vzero); - dinx1 = vmaxq_f32(dinx1, vzero); - dinx2 = vmaxq_f32(dinx2, vzero); - dinx3 = vmaxq_f32(dinx3, vzero); - - vst1q_f32(dout_ptr, dinx0); - vst1q_f32(dout_ptr + 4, dinx1); - vst1q_f32(dout_ptr + 8, dinx2); - vst1q_f32(dout_ptr + 12, dinx3); - } - if (remain > 0) { - const float* dinx_ptr = dinx + (cnt << 4); - const float* diny_ptr = diny + (cnt << 4); - float* dout_ptr = dout + (cnt << 4); - for (int i = 0; i < remain; i++) { - float tmp = *dinx_ptr + *diny_ptr; - *dout_ptr = tmp > 0.f ? 
tmp : 0.f; - dout_ptr++; - dinx_ptr++; - diny_ptr++; - } - } -} - -template <> -void elementwise_add_broadcast(const float* dinx, - const float* diny, - float* dout, - int batch, - int channels, - int num) { -#pragma omp parallel for collapse(2) - for (int i = 0; i < batch; ++i) { - for (int j = 0; j < channels; ++j) { - int offset = (i * channels + j) * num; - const float* din_ptr = dinx + offset; - const float diny_data = diny[j]; - float* dout_ptr = dout + offset; - - int cnt = num >> 4; - int remain = num % 16; - float32x4_t rb = vdupq_n_f32(diny_data); - for (int k = 0; k < cnt; ++k) { - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); - float32x4_t din2 = vld1q_f32(din_ptr + 8); - float32x4_t din3 = vld1q_f32(din_ptr + 12); - - din0 = vaddq_f32(din0, rb); - din1 = vaddq_f32(din1, rb); - din2 = vaddq_f32(din2, rb); - din3 = vaddq_f32(din3, rb); - - vst1q_f32(dout_ptr, din0); - vst1q_f32(dout_ptr + 4, din1); - vst1q_f32(dout_ptr + 8, din2); - vst1q_f32(dout_ptr + 12, din3); - din_ptr += 16; - dout_ptr += 16; - } - if (remain >= 8) { - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); - din0 = vaddq_f32(din0, rb); - din1 = vaddq_f32(din1, rb); - vst1q_f32(dout_ptr, din0); - vst1q_f32(dout_ptr + 4, din1); - din_ptr += 8; - dout_ptr += 8; - remain -= 8; - } - if (remain >= 4) { - float32x4_t din0 = vld1q_f32(din_ptr); - din0 = vaddq_f32(din0, rb); - vst1q_f32(dout_ptr, din0); - din_ptr += 4; - dout_ptr += 4; - remain -= 4; - } - if (remain > 0) { - for (int p = 0; p < remain; p++) { - *dout_ptr = *din_ptr + diny_data; - dout_ptr++; - din_ptr++; - } - } - } - } -} - -template <> -void elementwise_add_relu_broadcast(const float* dinx, - const float* diny, - float* dout, - int batch, - int channels, - int num) { - float32x4_t vzero = vdupq_n_f32(0.f); -#pragma omp parallel for collapse(2) - for (int i = 0; i < batch; ++i) { - for (int j = 0; j < channels; ++j) { - int offset = (i * channels + j) * num; - const float* din_ptr = dinx + offset; - const float diny_data = diny[j]; - float* dout_ptr = dout + offset; - - int cnt = num >> 4; - int remain = num % 16; - float32x4_t rb = vdupq_n_f32(diny_data); - for (int k = 0; k < cnt; ++k) { - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); - float32x4_t din2 = vld1q_f32(din_ptr + 8); - float32x4_t din3 = vld1q_f32(din_ptr + 12); - - din0 = vaddq_f32(din0, rb); - din1 = vaddq_f32(din1, rb); - din2 = vaddq_f32(din2, rb); - din3 = vaddq_f32(din3, rb); - - // relu - din0 = vmaxq_f32(din0, vzero); - din1 = vmaxq_f32(din1, vzero); - din2 = vmaxq_f32(din2, vzero); - din3 = vmaxq_f32(din3, vzero); - - vst1q_f32(dout_ptr, din0); - vst1q_f32(dout_ptr + 4, din1); - vst1q_f32(dout_ptr + 8, din2); - vst1q_f32(dout_ptr + 12, din3); - din_ptr += 16; - dout_ptr += 16; - } - if (remain >= 8) { - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); - din0 = vaddq_f32(din0, rb); - din1 = vaddq_f32(din1, rb); - // relu - din0 = vmaxq_f32(din0, vzero); - din1 = vmaxq_f32(din1, vzero); - vst1q_f32(dout_ptr, din0); - vst1q_f32(dout_ptr + 4, din1); - din_ptr += 8; - dout_ptr += 8; - remain -= 8; - } - if (remain >= 4) { - float32x4_t din0 = vld1q_f32(din_ptr); - din0 = vaddq_f32(din0, rb); - // relu - din0 = vmaxq_f32(din0, vzero); - vst1q_f32(dout_ptr, din0); - din_ptr += 4; - dout_ptr += 4; - remain -= 4; - } - if (remain > 0) { - for (int p = 0; p < remain; p++) { - float tmp = *din_ptr + diny_data; - *dout_ptr = tmp > 0.f ? 
tmp : 0.f; - dout_ptr++; - din_ptr++; - } - } - } - } -} - -template <> -void elementwise_sub(const float* dinx, - const float* diny, - float* dout, - int num) { - int cnt = num >> 4; - int remain = num % 16; -#pragma omp parallel for - for (int i = 0; i < cnt; i++) { - const float* dinx_ptr = dinx + (i << 4); - const float* diny_ptr = diny + (i << 4); - float* dout_ptr = dout + (i << 4); - - float32x4_t dinx0 = vld1q_f32(dinx_ptr); - float32x4_t dinx1 = vld1q_f32(dinx_ptr + 4); - float32x4_t dinx2 = vld1q_f32(dinx_ptr + 8); - float32x4_t dinx3 = vld1q_f32(dinx_ptr + 12); - - float32x4_t diny0 = vld1q_f32(diny_ptr); - float32x4_t diny1 = vld1q_f32(diny_ptr + 4); - float32x4_t diny2 = vld1q_f32(diny_ptr + 8); - float32x4_t diny3 = vld1q_f32(diny_ptr + 12); - - dinx0 = vsubq_f32(dinx0, diny0); - dinx1 = vsubq_f32(dinx1, diny1); - dinx2 = vsubq_f32(dinx2, diny2); - dinx3 = vsubq_f32(dinx3, diny3); - - vst1q_f32(dout_ptr, dinx0); - vst1q_f32(dout_ptr + 4, dinx1); - vst1q_f32(dout_ptr + 8, dinx2); - vst1q_f32(dout_ptr + 12, dinx3); - } - if (remain > 0) { - const float* dinx_ptr = dinx + (cnt << 4); - const float* diny_ptr = diny + (cnt << 4); - float* dout_ptr = dout + (cnt << 4); - for (int i = 0; i < remain; i++) { - *dout_ptr = *dinx_ptr - *diny_ptr; - dout_ptr++; - dinx_ptr++; - diny_ptr++; - } - } -} - -template <> -void elementwise_sub_relu(const float* dinx, - const float* diny, - float* dout, - int num) { - int cnt = num >> 4; - int remain = num % 16; - float32x4_t vzero = vdupq_n_f32(0.f); -#pragma omp parallel for - for (int i = 0; i < cnt; i++) { - const float* dinx_ptr = dinx + (i << 4); - const float* diny_ptr = diny + (i << 4); - float* dout_ptr = dout + (i << 4); - - float32x4_t dinx0 = vld1q_f32(dinx_ptr); - float32x4_t dinx1 = vld1q_f32(dinx_ptr + 4); - float32x4_t dinx2 = vld1q_f32(dinx_ptr + 8); - float32x4_t dinx3 = vld1q_f32(dinx_ptr + 12); - - float32x4_t diny0 = vld1q_f32(diny_ptr); - float32x4_t diny1 = vld1q_f32(diny_ptr + 4); - float32x4_t diny2 = vld1q_f32(diny_ptr + 8); - float32x4_t diny3 = vld1q_f32(diny_ptr + 12); - - dinx0 = vsubq_f32(dinx0, diny0); - dinx1 = vsubq_f32(dinx1, diny1); - dinx2 = vsubq_f32(dinx2, diny2); - dinx3 = vsubq_f32(dinx3, diny3); - - // relu - dinx0 = vmaxq_f32(dinx0, vzero); - dinx1 = vmaxq_f32(dinx1, vzero); - dinx2 = vmaxq_f32(dinx2, vzero); - dinx3 = vmaxq_f32(dinx3, vzero); - - vst1q_f32(dout_ptr, dinx0); - vst1q_f32(dout_ptr + 4, dinx1); - vst1q_f32(dout_ptr + 8, dinx2); - vst1q_f32(dout_ptr + 12, dinx3); - } - if (remain > 0) { - const float* dinx_ptr = dinx + (cnt << 4); - const float* diny_ptr = diny + (cnt << 4); - float* dout_ptr = dout + (cnt << 4); - for (int i = 0; i < remain; i++) { - float tmp = *dinx_ptr - *diny_ptr; - *dout_ptr = tmp > 0.f ? 
tmp : 0.f; - dout_ptr++; - dinx_ptr++; - diny_ptr++; - } - } -} - -template <> -void elementwise_sub_broadcast(const float* dinx, - const float* diny, - float* dout, - int batch, - int channels, - int num) { -#pragma omp parallel for collapse(2) - for (int i = 0; i < batch; ++i) { - for (int j = 0; j < channels; ++j) { - int offset = (i * channels + j) * num; - const float* din_ptr = dinx + offset; - const float diny_data = diny[j]; - float* dout_ptr = dout + offset; - - int cnt = num >> 4; - int remain = num % 16; - float32x4_t rb = vdupq_n_f32(diny_data); - for (int k = 0; k < cnt; ++k) { - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); - float32x4_t din2 = vld1q_f32(din_ptr + 8); - float32x4_t din3 = vld1q_f32(din_ptr + 12); - - din0 = vsubq_f32(din0, rb); - din1 = vsubq_f32(din1, rb); - din2 = vsubq_f32(din2, rb); - din3 = vsubq_f32(din3, rb); - - vst1q_f32(dout_ptr, din0); - vst1q_f32(dout_ptr + 4, din1); - vst1q_f32(dout_ptr + 8, din2); - vst1q_f32(dout_ptr + 12, din3); - din_ptr += 16; - dout_ptr += 16; - } - if (remain >= 8) { - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); - din0 = vsubq_f32(din0, rb); - din1 = vsubq_f32(din1, rb); - vst1q_f32(dout_ptr, din0); - vst1q_f32(dout_ptr + 4, din1); - din_ptr += 8; - dout_ptr += 8; - remain -= 8; - } - if (remain >= 4) { - float32x4_t din0 = vld1q_f32(din_ptr); - din0 = vsubq_f32(din0, rb); - vst1q_f32(dout_ptr, din0); - din_ptr += 4; - dout_ptr += 4; - remain -= 4; - } - if (remain > 0) { - for (int p = 0; p < remain; p++) { - *dout_ptr = *din_ptr - diny_data; - dout_ptr++; - din_ptr++; - } - } - } - } -} - -template <> -void elementwise_sub_relu_broadcast(const float* dinx, - const float* diny, - float* dout, - int batch, - int channels, - int num) { - float32x4_t vzero = vdupq_n_f32(0.f); -#pragma omp parallel for collapse(2) - for (int i = 0; i < batch; ++i) { - for (int j = 0; j < channels; ++j) { - int offset = (i * channels + j) * num; - const float* din_ptr = dinx + offset; - const float diny_data = diny[j]; - float* dout_ptr = dout + offset; - - int cnt = num >> 4; - int remain = num % 16; - float32x4_t rb = vdupq_n_f32(diny_data); - for (int k = 0; k < cnt; ++k) { - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); - float32x4_t din2 = vld1q_f32(din_ptr + 8); - float32x4_t din3 = vld1q_f32(din_ptr + 12); - - din0 = vsubq_f32(din0, rb); - din1 = vsubq_f32(din1, rb); - din2 = vsubq_f32(din2, rb); - din3 = vsubq_f32(din3, rb); - - // relu - din0 = vmaxq_f32(din0, vzero); - din1 = vmaxq_f32(din1, vzero); - din2 = vmaxq_f32(din2, vzero); - din3 = vmaxq_f32(din3, vzero); - - vst1q_f32(dout_ptr, din0); - vst1q_f32(dout_ptr + 4, din1); - vst1q_f32(dout_ptr + 8, din2); - vst1q_f32(dout_ptr + 12, din3); - din_ptr += 16; - dout_ptr += 16; - } - if (remain >= 8) { - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); - din0 = vsubq_f32(din0, rb); - din1 = vsubq_f32(din1, rb); - // relu - din0 = vmaxq_f32(din0, vzero); - din1 = vmaxq_f32(din1, vzero); - vst1q_f32(dout_ptr, din0); - vst1q_f32(dout_ptr + 4, din1); - din_ptr += 8; - dout_ptr += 8; - remain -= 8; - } - if (remain >= 4) { - float32x4_t din0 = vld1q_f32(din_ptr); - din0 = vsubq_f32(din0, rb); - // relu - din0 = vmaxq_f32(din0, vzero); - vst1q_f32(dout_ptr, din0); - din_ptr += 4; - dout_ptr += 4; - remain -= 4; - } - if (remain > 0) { - for (int p = 0; p < remain; p++) { - float tmp = *din_ptr - diny_data; - *dout_ptr = tmp > 0.f ? 
tmp : 0.f; - dout_ptr++; - din_ptr++; - } - } - } - } -} - -template <> -void elementwise_mul(const float* dinx, - const float* diny, - float* dout, - int num) { - int cnt = num >> 4; - int remain = num % 16; -#pragma omp parallel for - for (int i = 0; i < cnt; ++i) { - const float* dinx_ptr = dinx + (i << 4); - const float* diny_ptr = diny + (i << 4); - float* dout_ptr = dout + (i << 4); - - float32x4_t dinx0 = vld1q_f32(dinx_ptr); - float32x4_t dinx1 = vld1q_f32(dinx_ptr + 4); - float32x4_t dinx2 = vld1q_f32(dinx_ptr + 8); - float32x4_t dinx3 = vld1q_f32(dinx_ptr + 12); - - float32x4_t diny0 = vld1q_f32(diny_ptr); - float32x4_t diny1 = vld1q_f32(diny_ptr + 4); - float32x4_t diny2 = vld1q_f32(diny_ptr + 8); - float32x4_t diny3 = vld1q_f32(diny_ptr + 12); - - dinx0 = vmulq_f32(dinx0, diny0); - dinx1 = vmulq_f32(dinx1, diny1); - dinx2 = vmulq_f32(dinx2, diny2); - dinx3 = vmulq_f32(dinx3, diny3); - - vst1q_f32(dout_ptr, dinx0); - vst1q_f32(dout_ptr + 4, dinx1); - vst1q_f32(dout_ptr + 8, dinx2); - vst1q_f32(dout_ptr + 12, dinx3); - } - if (remain > 0) { - const float* dinx_ptr = dinx + (cnt << 4); - const float* diny_ptr = diny + (cnt << 4); - float* dout_ptr = dout + (cnt << 4); - for (int i = 0; i < remain; i++) { - *dout_ptr = *dinx_ptr * *diny_ptr; - dout_ptr++; - dinx_ptr++; - diny_ptr++; - } - } -} - -template <> -void elementwise_mul_relu(const float* dinx, - const float* diny, - float* dout, - int num) { - int cnt = num >> 4; - int remain = num % 16; - float32x4_t vzero = vdupq_n_f32(0.f); -#pragma omp parallel for - for (int i = 0; i < cnt; ++i) { - const float* dinx_ptr = dinx + (i << 4); - const float* diny_ptr = diny + (i << 4); - float* dout_ptr = dout + (i << 4); - - float32x4_t dinx0 = vld1q_f32(dinx_ptr); - float32x4_t dinx1 = vld1q_f32(dinx_ptr + 4); - float32x4_t dinx2 = vld1q_f32(dinx_ptr + 8); - float32x4_t dinx3 = vld1q_f32(dinx_ptr + 12); - - float32x4_t diny0 = vld1q_f32(diny_ptr); - float32x4_t diny1 = vld1q_f32(diny_ptr + 4); - float32x4_t diny2 = vld1q_f32(diny_ptr + 8); - float32x4_t diny3 = vld1q_f32(diny_ptr + 12); - - dinx0 = vmulq_f32(dinx0, diny0); - dinx1 = vmulq_f32(dinx1, diny1); - dinx2 = vmulq_f32(dinx2, diny2); - dinx3 = vmulq_f32(dinx3, diny3); - - // relu - dinx0 = vmaxq_f32(dinx0, vzero); - dinx1 = vmaxq_f32(dinx1, vzero); - dinx2 = vmaxq_f32(dinx2, vzero); - dinx3 = vmaxq_f32(dinx3, vzero); - - vst1q_f32(dout_ptr, dinx0); - vst1q_f32(dout_ptr + 4, dinx1); - vst1q_f32(dout_ptr + 8, dinx2); - vst1q_f32(dout_ptr + 12, dinx3); - } - if (remain > 0) { - const float* dinx_ptr = dinx + (cnt << 4); - const float* diny_ptr = diny + (cnt << 4); - float* dout_ptr = dout + (cnt << 4); - for (int i = 0; i < remain; i++) { - float tmp = *dinx_ptr * *diny_ptr; - *dout_ptr = tmp > 0.f ? 
tmp : 0.f; - dout_ptr++; - dinx_ptr++; - diny_ptr++; - } - } -} - -template <> -void elementwise_mul_broadcast(const float* dinx, - const float* diny, - float* dout, - int batch, - int channels, - int num) { -#pragma omp parallel for collapse(2) - for (int i = 0; i < batch; ++i) { - for (int j = 0; j < channels; ++j) { - int offset = (i * channels + j) * num; - const float* din_ptr = dinx + offset; - const float diny_data = diny[j]; - float* dout_ptr = dout + offset; - - int cnt = num >> 4; - int remain = num % 16; - float32x4_t rb = vdupq_n_f32(diny_data); - for (int k = 0; k < cnt; ++k) { - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); - float32x4_t din2 = vld1q_f32(din_ptr + 8); - float32x4_t din3 = vld1q_f32(din_ptr + 12); - - din0 = vmulq_f32(din0, rb); - din1 = vmulq_f32(din1, rb); - din2 = vmulq_f32(din2, rb); - din3 = vmulq_f32(din3, rb); - - vst1q_f32(dout_ptr, din0); - vst1q_f32(dout_ptr + 4, din1); - vst1q_f32(dout_ptr + 8, din2); - vst1q_f32(dout_ptr + 12, din3); - - din_ptr += 16; - dout_ptr += 16; - } - if (remain >= 8) { - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); - din0 = vmulq_f32(din0, rb); - din1 = vmulq_f32(din1, rb); - vst1q_f32(dout_ptr, din0); - vst1q_f32(dout_ptr + 4, din1); - din_ptr += 8; - dout_ptr += 8; - remain -= 8; - } - if (remain >= 4) { - float32x4_t din0 = vld1q_f32(din_ptr); - din0 = vmulq_f32(din0, rb); - vst1q_f32(dout_ptr, din0); - din_ptr += 4; - dout_ptr += 4; - remain -= 4; - } - if (remain > 0) { - for (int p = 0; p < remain; ++p) { - *dout_ptr = *din_ptr * diny_data; - dout_ptr++; - din_ptr++; - } - } - } - } -} - -template <> -void elementwise_mul_relu_broadcast(const float* dinx, - const float* diny, - float* dout, - int batch, - int channels, - int num) { - float32x4_t vzero = vdupq_n_f32(0.f); -#pragma omp parallel for collapse(2) - for (int i = 0; i < batch; ++i) { - for (int j = 0; j < channels; ++j) { - int offset = (i * channels + j) * num; - const float* din_ptr = dinx + offset; - const float diny_data = diny[j]; - float* dout_ptr = dout + offset; - - int cnt = num >> 4; - int remain = num % 16; - float32x4_t rb = vdupq_n_f32(diny_data); - for (int k = 0; k < cnt; ++k) { - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); - float32x4_t din2 = vld1q_f32(din_ptr + 8); - float32x4_t din3 = vld1q_f32(din_ptr + 12); - - din0 = vmulq_f32(din0, rb); - din1 = vmulq_f32(din1, rb); - din2 = vmulq_f32(din2, rb); - din3 = vmulq_f32(din3, rb); - - // relu - din0 = vmaxq_f32(din0, vzero); - din1 = vmaxq_f32(din1, vzero); - din2 = vmaxq_f32(din2, vzero); - din3 = vmaxq_f32(din3, vzero); - - vst1q_f32(dout_ptr, din0); - vst1q_f32(dout_ptr + 4, din1); - vst1q_f32(dout_ptr + 8, din2); - vst1q_f32(dout_ptr + 12, din3); - din_ptr += 16; - dout_ptr += 16; - } - if (remain >= 8) { - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); - din0 = vmulq_f32(din0, rb); - din1 = vmulq_f32(din1, rb); - // relu - din0 = vmaxq_f32(din0, vzero); - din1 = vmaxq_f32(din1, vzero); - vst1q_f32(dout_ptr, din0); - vst1q_f32(dout_ptr + 4, din1); - din_ptr += 8; - dout_ptr += 8; - remain -= 8; - } - if (remain >= 4) { - float32x4_t din0 = vld1q_f32(din_ptr); - din0 = vmulq_f32(din0, rb); - // relu - din0 = vmaxq_f32(din0, vzero); - vst1q_f32(dout_ptr, din0); - din_ptr += 4; - dout_ptr += 4; - remain -= 4; - } - if (remain > 0) { - for (int p = 0; p < remain; ++p) { - float tmp = *din_ptr * diny_data; - *dout_ptr = tmp > 0.f ? 
tmp : 0.f; - dout_ptr++; - din_ptr++; - } - } - } - } -} - -template <> -void elementwise_max(const float* dinx, - const float* diny, - float* dout, - int num) { - int cnt = num >> 4; - int remain = num % 16; -#pragma omp parallel for - for (int i = 0; i < cnt; ++i) { - const float* dinx_ptr = dinx + (i << 4); - const float* diny_ptr = diny + (i << 4); - float* dout_ptr = dout + (i << 4); - - float32x4_t dinx0 = vld1q_f32(dinx_ptr); - float32x4_t dinx1 = vld1q_f32(dinx_ptr + 4); - float32x4_t dinx2 = vld1q_f32(dinx_ptr + 8); - float32x4_t dinx3 = vld1q_f32(dinx_ptr + 12); - - float32x4_t diny0 = vld1q_f32(diny_ptr); - float32x4_t diny1 = vld1q_f32(diny_ptr + 4); - float32x4_t diny2 = vld1q_f32(diny_ptr + 8); - float32x4_t diny3 = vld1q_f32(diny_ptr + 12); - - dinx0 = vmaxq_f32(dinx0, diny0); - dinx1 = vmaxq_f32(dinx1, diny1); - dinx2 = vmaxq_f32(dinx2, diny2); - dinx3 = vmaxq_f32(dinx3, diny3); - - vst1q_f32(dout_ptr, dinx0); - vst1q_f32(dout_ptr + 4, dinx1); - vst1q_f32(dout_ptr + 8, dinx2); - vst1q_f32(dout_ptr + 12, dinx3); - } - if (remain > 0) { - const float* dinx_ptr = dinx + (cnt << 4); - const float* diny_ptr = diny + (cnt << 4); - float* dout_ptr = dout + (cnt << 4); - for (int i = 0; i < remain; ++i) { - *(dout_ptr++) = std::max(*(dinx_ptr++), *(diny_ptr++)); - } - } -} - -template <> -void elementwise_max_relu(const float* dinx, - const float* diny, - float* dout, - int num) { - int cnt = num >> 4; - int remain = num % 16; - float32x4_t vzero = vdupq_n_f32(0.f); -#pragma omp parallel for - for (int i = 0; i < cnt; ++i) { - const float* dinx_ptr = dinx + (i << 4); - const float* diny_ptr = diny + (i << 4); - float* dout_ptr = dout + (i << 4); - - float32x4_t dinx0 = vld1q_f32(dinx_ptr); - float32x4_t dinx1 = vld1q_f32(dinx_ptr + 4); - float32x4_t dinx2 = vld1q_f32(dinx_ptr + 8); - float32x4_t dinx3 = vld1q_f32(dinx_ptr + 12); - - float32x4_t diny0 = vld1q_f32(diny_ptr); - float32x4_t diny1 = vld1q_f32(diny_ptr + 4); - float32x4_t diny2 = vld1q_f32(diny_ptr + 8); - float32x4_t diny3 = vld1q_f32(diny_ptr + 12); - - dinx0 = vmaxq_f32(dinx0, diny0); - dinx1 = vmaxq_f32(dinx1, diny1); - dinx2 = vmaxq_f32(dinx2, diny2); - dinx3 = vmaxq_f32(dinx3, diny3); - - // relu - dinx0 = vmaxq_f32(dinx0, vzero); - dinx1 = vmaxq_f32(dinx1, vzero); - dinx2 = vmaxq_f32(dinx2, vzero); - dinx3 = vmaxq_f32(dinx3, vzero); - - vst1q_f32(dout_ptr, dinx0); - vst1q_f32(dout_ptr + 4, dinx1); - vst1q_f32(dout_ptr + 8, dinx2); - vst1q_f32(dout_ptr + 12, dinx3); - } - if (remain > 0) { - const float* dinx_ptr = dinx + (cnt << 4); - const float* diny_ptr = diny + (cnt << 4); - float* dout_ptr = dout + (cnt << 4); - for (int i = 0; i < remain; ++i) { - float tmp = std::max(*(dinx_ptr++), *(diny_ptr++)); - *(dout_ptr++) = tmp > 0.f ? 
tmp : 0.f; - } - } -} - -template <> -void elementwise_max_broadcast(const float* dinx, - const float* diny, - float* dout, - int batch, - int channels, - int num) { -#pragma omp parallel for collapse(2) - for (int i = 0; i < batch; ++i) { - for (int j = 0; j < channels; ++j) { - int offset = (i * channels + j) * num; - const float* din_ptr = dinx + offset; - const float diny_data = diny[j]; - float* dout_ptr = dout + offset; - - int cnt = num >> 4; - int remain = num % 16; - float32x4_t rb = vdupq_n_f32(diny_data); - for (int k = 0; k < cnt; ++k) { - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); - float32x4_t din2 = vld1q_f32(din_ptr + 8); - float32x4_t din3 = vld1q_f32(din_ptr + 12); - - din0 = vmaxq_f32(din0, rb); - din1 = vmaxq_f32(din1, rb); - din2 = vmaxq_f32(din2, rb); - din3 = vmaxq_f32(din3, rb); - - vst1q_f32(dout_ptr, din0); - vst1q_f32(dout_ptr + 4, din1); - vst1q_f32(dout_ptr + 8, din2); - vst1q_f32(dout_ptr + 12, din3); - - din_ptr += 16; - dout_ptr += 16; - } - if (remain >= 8) { - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); - din0 = vmaxq_f32(din0, rb); - din1 = vmaxq_f32(din1, rb); - vst1q_f32(dout_ptr, din0); - vst1q_f32(dout_ptr + 4, din1); - din_ptr += 8; - dout_ptr += 8; - remain -= 8; - } - if (remain >= 4) { - float32x4_t din0 = vld1q_f32(din_ptr); - din0 = vmaxq_f32(din0, rb); - vst1q_f32(dout_ptr, din0); - din_ptr += 4; - dout_ptr += 4; - remain -= 4; - } - if (remain > 0) { - for (int p = 0; p < remain; ++p) { - *dout_ptr = std::max(*din_ptr, diny_data); - dout_ptr++; - din_ptr++; - } - } - } - } -} - -template <> -void elementwise_max_relu_broadcast(const float* dinx, - const float* diny, - float* dout, - int batch, - int channels, - int num) { - float32x4_t vzero = vdupq_n_f32(0.f); -#pragma omp parallel for collapse(2) - for (int i = 0; i < batch; ++i) { - for (int j = 0; j < channels; ++j) { - int offset = (i * channels + j) * num; - const float* din_ptr = dinx + offset; - const float diny_data = diny[j]; - float* dout_ptr = dout + offset; - - int cnt = num >> 4; - int remain = num % 16; - float32x4_t rb = vdupq_n_f32(diny_data); - for (int k = 0; k < cnt; ++k) { - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); - float32x4_t din2 = vld1q_f32(din_ptr + 8); - float32x4_t din3 = vld1q_f32(din_ptr + 12); - - din0 = vmaxq_f32(din0, rb); - din1 = vmaxq_f32(din1, rb); - din2 = vmaxq_f32(din2, rb); - din3 = vmaxq_f32(din3, rb); - - // relu - din0 = vmaxq_f32(din0, vzero); - din1 = vmaxq_f32(din1, vzero); - din2 = vmaxq_f32(din2, vzero); - din3 = vmaxq_f32(din3, vzero); - - vst1q_f32(dout_ptr, din0); - vst1q_f32(dout_ptr + 4, din1); - vst1q_f32(dout_ptr + 8, din2); - vst1q_f32(dout_ptr + 12, din3); - din_ptr += 16; - dout_ptr += 16; - } - if (remain >= 8) { - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); - din0 = vmaxq_f32(din0, rb); - din1 = vmaxq_f32(din1, rb); - // relu - din0 = vmaxq_f32(din0, vzero); - din1 = vmaxq_f32(din1, vzero); - vst1q_f32(dout_ptr, din0); - vst1q_f32(dout_ptr + 4, din1); - din_ptr += 8; - dout_ptr += 8; - remain -= 8; - } - if (remain >= 4) { - float32x4_t din0 = vld1q_f32(din_ptr); - din0 = vmaxq_f32(din0, rb); - // relu - din0 = vmaxq_f32(din0, vzero); - vst1q_f32(dout_ptr, din0); - din_ptr += 4; - dout_ptr += 4; - remain -= 4; - } - if (remain > 0) { - for (int p = 0; p < remain; ++p) { - float tmp = std::max(*din_ptr, diny_data); - *dout_ptr = tmp > 0.f ? 
tmp : 0.f; - dout_ptr++; - din_ptr++; - } - } - } - } -} - -template <> -void elementwise_div(const float* dinx, - const float* diny, - float* dout, - int num) { - int cnt = num >> 4; - int remain = num % 16; -#pragma omp parallel for - for (int i = 0; i < cnt; i++) { - const float* dinx_ptr = dinx + (i << 4); - const float* diny_ptr = diny + (i << 4); - float* dout_ptr = dout + (i << 4); - - float32x4_t dinx0 = vld1q_f32(dinx_ptr); - float32x4_t dinx1 = vld1q_f32(dinx_ptr + 4); - float32x4_t dinx2 = vld1q_f32(dinx_ptr + 8); - float32x4_t dinx3 = vld1q_f32(dinx_ptr + 12); - - float32x4_t diny0 = vld1q_f32(diny_ptr); - float32x4_t diny1 = vld1q_f32(diny_ptr + 4); - float32x4_t diny2 = vld1q_f32(diny_ptr + 8); - float32x4_t diny3 = vld1q_f32(diny_ptr + 12); - -#ifdef __aarch64__ - dinx0 = vdivq_f32(dinx0, diny0); - dinx1 = vdivq_f32(dinx1, diny1); - dinx2 = vdivq_f32(dinx2, diny2); - dinx3 = vdivq_f32(dinx3, diny3); -#else - dinx0 = div_ps(dinx0, diny0); - dinx1 = div_ps(dinx1, diny1); - dinx2 = div_ps(dinx2, diny2); - dinx3 = div_ps(dinx3, diny3); -#endif - vst1q_f32(dout_ptr, dinx0); - vst1q_f32(dout_ptr + 4, dinx1); - vst1q_f32(dout_ptr + 8, dinx2); - vst1q_f32(dout_ptr + 12, dinx3); - } - if (remain > 0) { - const float* dinx_ptr = dinx + (cnt << 4); - const float* diny_ptr = diny + (cnt << 4); - float* dout_ptr = dout + (cnt << 4); - for (int i = 0; i < remain; i++) { - *dout_ptr = *dinx_ptr / *diny_ptr; - dout_ptr++; - dinx_ptr++; - diny_ptr++; - } - } -} - -template <> -void elementwise_div_broadcast(const float* dinx, - const float* diny, - float* dout, - int batch, - int channels, - int num) { -#pragma omp parallel for collapse(2) - for (int i = 0; i < batch; ++i) { - for (int j = 0; j < channels; ++j) { - int offset = (i * channels + j) * num; - const float* din_ptr = dinx + offset; - const float diny_data = diny[j]; - float* dout_ptr = dout + offset; - - int cnt = num >> 4; - int remain = num % 16; - float32x4_t rb = vdupq_n_f32(diny_data); - for (int k = 0; k < cnt; ++k) { - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); - float32x4_t din2 = vld1q_f32(din_ptr + 8); - float32x4_t din3 = vld1q_f32(din_ptr + 12); - -#ifdef __aarch64__ - din0 = vdivq_f32(din0, rb); - din1 = vdivq_f32(din1, rb); - din2 = vdivq_f32(din2, rb); - din3 = vdivq_f32(din3, rb); -#else - din0 = div_ps(din0, rb); - din1 = div_ps(din1, rb); - din2 = div_ps(din2, rb); - din3 = div_ps(din3, rb); -#endif - - vst1q_f32(dout_ptr, din0); - vst1q_f32(dout_ptr + 4, din1); - vst1q_f32(dout_ptr + 8, din2); - vst1q_f32(dout_ptr + 12, din3); - din_ptr += 16; - dout_ptr += 16; - } - if (remain >= 8) { - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); -#ifdef __aarch64__ - din0 = vdivq_f32(din0, rb); - din1 = vdivq_f32(din1, rb); -#else - din0 = div_ps(din0, rb); - din1 = div_ps(din1, rb); -#endif - vst1q_f32(dout_ptr, din0); - vst1q_f32(dout_ptr + 4, din1); - din_ptr += 8; - dout_ptr += 8; - remain -= 8; - } - if (remain >= 4) { - float32x4_t din0 = vld1q_f32(din_ptr); -#ifdef __aarch64__ - din0 = vdivq_f32(din0, rb); -#else - din0 = div_ps(din0, rb); -#endif - vst1q_f32(dout_ptr, din0); - din_ptr += 4; - dout_ptr += 4; - remain -= 4; - } - if (remain > 0) { - for (int p = 0; p < remain; p++) { - *dout_ptr = *din_ptr / diny_data; - dout_ptr++; - din_ptr++; - } - } - } - } -} - -template <> -void elementwise_div_relu(const float* dinx, - const float* diny, - float* dout, - int num) { - int cnt = num >> 4; - int remain = num % 16; - float32x4_t vzero = 
vdupq_n_f32(0.f); -#pragma omp parallel for - for (int i = 0; i < cnt; ++i) { - const float* dinx_ptr = dinx + (i << 4); - const float* diny_ptr = diny + (i << 4); - float* dout_ptr = dout + (i << 4); - - float32x4_t dinx0 = vld1q_f32(dinx_ptr); - float32x4_t dinx1 = vld1q_f32(dinx_ptr + 4); - float32x4_t dinx2 = vld1q_f32(dinx_ptr + 8); - float32x4_t dinx3 = vld1q_f32(dinx_ptr + 12); - - float32x4_t diny0 = vld1q_f32(diny_ptr); - float32x4_t diny1 = vld1q_f32(diny_ptr + 4); - float32x4_t diny2 = vld1q_f32(diny_ptr + 8); - float32x4_t diny3 = vld1q_f32(diny_ptr + 12); - -#ifdef __aarch64__ - dinx0 = vdivq_f32(dinx0, diny0); - dinx1 = vdivq_f32(dinx1, diny1); - dinx2 = vdivq_f32(dinx2, diny2); - dinx3 = vdivq_f32(dinx3, diny3); -#else - dinx0 = div_ps(dinx0, diny0); - dinx1 = div_ps(dinx1, diny1); - dinx2 = div_ps(dinx2, diny2); - dinx3 = div_ps(dinx3, diny3); -#endif - // relu - dinx0 = vmaxq_f32(dinx0, vzero); - dinx1 = vmaxq_f32(dinx1, vzero); - dinx2 = vmaxq_f32(dinx2, vzero); - dinx3 = vmaxq_f32(dinx3, vzero); - - vst1q_f32(dout_ptr, dinx0); - vst1q_f32(dout_ptr + 4, dinx1); - vst1q_f32(dout_ptr + 8, dinx2); - vst1q_f32(dout_ptr + 12, dinx3); - } - if (remain > 0) { - const float* dinx_ptr = dinx + (cnt << 4); - const float* diny_ptr = diny + (cnt << 4); - float* dout_ptr = dout + (cnt << 4); - for (int i = 0; i < remain; ++i) { - float tmp = *dinx_ptr / *diny_ptr; - *(dout_ptr++) = tmp > 0.f ? tmp : 0.f; - dinx_ptr++; - diny_ptr++; - } - } -} - -template <> -void elementwise_div_relu_broadcast(const float* dinx, - const float* diny, - float* dout, - int batch, - int channels, - int num) { - float32x4_t vzero = vdupq_n_f32(0.f); -#pragma omp parallel for collapse(2) - for (int i = 0; i < batch; ++i) { - for (int j = 0; j < channels; ++j) { - int offset = (i * channels + j) * num; - const float* din_ptr = dinx + offset; - const float diny_data = diny[j]; - float* dout_ptr = dout + offset; - - int cnt = num >> 4; - int remain = num % 16; - float32x4_t rb = vdupq_n_f32(diny_data); - for (int k = 0; k < cnt; ++k) { - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); - float32x4_t din2 = vld1q_f32(din_ptr + 8); - float32x4_t din3 = vld1q_f32(din_ptr + 12); - -#ifdef __aarch64__ - din0 = vdivq_f32(din0, rb); - din1 = vdivq_f32(din1, rb); - din2 = vdivq_f32(din2, rb); - din3 = vdivq_f32(din3, rb); -#else - din0 = div_ps(din0, rb); - din1 = div_ps(din1, rb); - din2 = div_ps(din2, rb); - din3 = div_ps(din3, rb); -#endif - // relu - din0 = vmaxq_f32(din0, vzero); - din1 = vmaxq_f32(din1, vzero); - din2 = vmaxq_f32(din2, vzero); - din3 = vmaxq_f32(din3, vzero); - - vst1q_f32(dout_ptr, din0); - vst1q_f32(dout_ptr + 4, din1); - vst1q_f32(dout_ptr + 8, din2); - vst1q_f32(dout_ptr + 12, din3); - din_ptr += 16; - dout_ptr += 16; - } - if (remain >= 8) { - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); -#ifdef __aarch64__ - din0 = vdivq_f32(din0, rb); - din1 = vdivq_f32(din1, rb); -#else - din0 = div_ps(din0, rb); - din1 = div_ps(din1, rb); -#endif - // relu - din0 = vmaxq_f32(din0, vzero); - din1 = vmaxq_f32(din1, vzero); - vst1q_f32(dout_ptr, din0); - vst1q_f32(dout_ptr + 4, din1); - din_ptr += 8; - dout_ptr += 8; - remain -= 8; - } - if (remain >= 4) { - float32x4_t din0 = vld1q_f32(din_ptr); -#ifdef __aarch64__ - din0 = vdivq_f32(din0, rb); -#else - din0 = div_ps(din0, rb); -#endif - // relu - din0 = vmaxq_f32(din0, vzero); - vst1q_f32(dout_ptr, din0); - din_ptr += 4; - dout_ptr += 4; - remain -= 4; - } - if (remain > 0) { - for 
(int p = 0; p < remain; p++) {
-          float tmp = *din_ptr / diny_data;
-          *dout_ptr = tmp > 0.f ? tmp : 0.f;
-          dout_ptr++;
-          din_ptr++;
-        }
-      }
-    }
-  }
-}
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/elementwise.h b/lite/backends/arm/math/elementwise.h
deleted file mode 100644
index f8273a5bb3..0000000000
--- a/lite/backends/arm/math/elementwise.h
+++ /dev/null
@@ -1,95 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-template <typename T>
-void elementwise_add(const T* dinx, const T* diny, T* dout, int num);
-
-template <typename T>
-void elementwise_add_relu(const T* dinx, const T* diny, T* dout, int num);
-
-template <typename T>
-void elementwise_add_broadcast(
-    const T* dinx, const T* diny, T* dout, int batch, int channels, int num);
-
-template <typename T>
-void elementwise_add_relu_broadcast(
-    const T* dinx, const T* diny, T* dout, int batch, int channels, int num);
-
-template <typename T>
-void elementwise_sub(const T* dinx, const T* diny, T* dout, int num);
-
-template <typename T>
-void elementwise_sub_relu(const T* dinx, const T* diny, T* dout, int num);
-
-template <typename T>
-void elementwise_sub_broadcast(
-    const T* dinx, const T* diny, T* dout, int batch, int channels, int num);
-
-template <typename T>
-void elementwise_sub_relu_broadcast(
-    const T* dinx, const T* diny, T* dout, int batch, int channels, int num);
-
-template <typename T>
-void elementwise_mul(const T* dinx, const T* diny, T* dout, int num);
-
-template <typename T>
-void elementwise_mul_relu(const T* dinx, const T* diny, T* dout, int num);
-
-template <typename T>
-void elementwise_mul_broadcast(
-    const T* dinx, const T* diny, T* dout, int batch, int channels, int num);
-
-template <typename T>
-void elementwise_mul_relu_broadcast(
-    const T* dinx, const T* diny, T* dout, int batch, int channels, int num);
-
-template <typename T>
-void elementwise_max(const T* dinx, const T* diny, T* dout, int num);
-
-template <typename T>
-void elementwise_max_relu(const T* dinx, const T* diny, T* dout, int num);
-
-template <typename T>
-void elementwise_max_broadcast(
-    const T* dinx, const T* diny, T* dout, int batch, int channels, int num);
-
-template <typename T>
-void elementwise_max_relu_broadcast(
-    const T* dinx, const T* diny, T* dout, int batch, int channels, int num);
-
-template <typename T>
-void elementwise_div(const T* dinx, const T* diny, T* dout, int num);
-
-template <typename T>
-void elementwise_div_broadcast(
-    const T* dinx, const T* diny, T* dout, int batch, int channels, int num);
-
-template <typename T>
-void elementwise_div_relu(const T* dinx, const T* diny, T* dout, int num);
-
-template <typename T>
-void elementwise_div_relu_broadcast(
-    const T* dinx, const T* diny, T* dout, int batch, int channels, int num);
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/fill_bias_relu.cc b/lite/backends/arm/math/fill_bias_relu.cc
+++ /dev/null @@ -1,122 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/arm/math/fill_bias_relu.h" -#include -#include "lite/backends/arm/math/funcs.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template <> -void fill_bias_relu(float* tensor, - const float* bias, - int channel, - int channel_size, - bool flag_bias, - bool flag_relu) { - float* data = tensor; - if (flag_relu) { - for (int j = 0; j < channel; ++j) { - float bias_data = flag_bias ? bias[j] : 0.f; - float32x4_t vbias = vdupq_n_f32(bias_data); - float32x4_t vzero = vdupq_n_f32(0.f); - int i = 0; - for (; i < channel_size - 3; i += 4) { - float32x4_t vdata = vld1q_f32(&data[i]); - vdata = vaddq_f32(vdata, vbias); - float32x4_t vmax = vmaxq_f32(vdata, vzero); - vst1q_f32(data + i, vmax); - } - for (; i < channel_size; i++) { - data[i] += bias_data; - data[i] = data[i] > 0 ? data[i] : 0.f; - } - data += channel_size; - } - } else { - for (int j = 0; j < channel; ++j) { - float bias_data = flag_bias ? bias[j] : 0.f; - float32x4_t vbias = vdupq_n_f32(bias_data); - int i = 0; - for (; i < channel_size - 3; i += 4) { - float32x4_t vdata = vld1q_f32(&data[i]); - vdata = vaddq_f32(vdata, vbias); - vst1q_f32(data + i, vdata); - } - for (; i < channel_size; i++) { - data[i] += bias_data; - } - data += channel_size; - } - } -} - -template <> -void fill_bias_relu(int* tensor, - const int* bias, - int channel, - int channel_size, - bool flag_bias, - bool flag_relu) { - int* data = tensor; - if (flag_relu) { - for (int j = 0; j < channel; ++j) { - int bias_data = flag_bias ? bias[j] : 0; - int32x4_t vbias = vdupq_n_s32(bias_data); - int32x4_t vzero = vdupq_n_s32(0); - int i = 0; - for (; i < channel_size - 7; i += 8) { - int32x4_t vdata1 = vld1q_s32(data + i); - int32x4_t vdata2 = vld1q_s32(data + i + 4); - vdata1 = vaddq_s32(vdata1, vbias); - vdata2 = vaddq_s32(vdata2, vbias); - int32x4_t vmax1 = vmaxq_s32(vdata1, vzero); - int32x4_t vmax2 = vmaxq_s32(vdata2, vzero); - vst1q_s32(data + i, vmax1); - vst1q_s32(data + i + 4, vmax2); - } - for (; i < channel_size; i++) { - data[i] += bias_data; - data[i] = data[i] > 0 ? data[i] : 0; - } - data += channel_size; - } - } else { - for (int j = 0; j < channel; ++j) { - int bias_data = flag_bias ? 
bias[j] : 0;
-      int32x4_t vbias = vdupq_n_s32(bias_data);
-      int i = 0;
-      for (; i < channel_size - 7; i += 8) {
-        int32x4_t vdata1 = vld1q_s32(data + i);
-        int32x4_t vdata2 = vld1q_s32(data + i + 4);
-        vdata1 = vaddq_s32(vdata1, vbias);
-        vdata2 = vaddq_s32(vdata2, vbias);
-        vst1q_s32(data + i, vdata1);
-        vst1q_s32(data + i + 4, vdata2);
-      }
-      for (; i < channel_size; i++) {
-        data[i] += bias_data;
-      }
-      data += channel_size;
-    }
-  }
-}
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/fill_bias_relu.h b/lite/backends/arm/math/fill_bias_relu.h
deleted file mode 100644
index 254d6d43be..0000000000
--- a/lite/backends/arm/math/fill_bias_relu.h
+++ /dev/null
@@ -1,44 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include
-#include "lite/core/op_lite.h"
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-/**
- * \brief NEON implementation that adds a per-channel bias and applies ReLU
- * @param tensor        output buffer of channel * channel_size elements
- * @param bias          one value per channel, read only when flag_bias is set
- * @param channel       number of channels
- * @param channel_size  number of elements per channel
- */
-template <typename Dtype>
-void fill_bias_relu(Dtype* tensor,
-                    const Dtype* bias,
-                    int channel,
-                    int channel_size,
-                    bool flag_bias,
-                    bool flag_relu);
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/funcs.cc b/lite/backends/arm/math/funcs.cc
deleted file mode 100644
index e4425ade2e..0000000000
--- a/lite/backends/arm/math/funcs.cc
+++ /dev/null
@@ -1,153 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
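For reference, a minimal usage sketch of the fill_bias_relu template declared in the header above; the caller name, buffer names, and layout comments are illustrative, not taken from this patch:

  #include "lite/backends/arm/math/fill_bias_relu.h"

  // Hypothetical caller: fuse the bias-add + ReLU epilogue into a conv
  // output laid out as `channel` feature maps of `channel_size` floats each.
  void apply_conv_epilogue(float* out, const float* bias,
                           int channel, int channel_size) {
    paddle::lite::arm::math::fill_bias_relu<float>(
        out, bias, channel, channel_size,
        /*flag_bias=*/true, /*flag_relu=*/true);
  }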
- -#include "lite/backends/arm/math/funcs.h" -#include - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template <> -void fill_bias_fc(float *out, const float *bias, int num, int channel) { - int cnt = channel >> 4; - int remain = channel & 15; - - for (int j = 0; j < num; ++j) { - const float *ptr_bias = bias; - float *ptr_out = out + j * channel; - - float32x4_t vout1; - float32x4_t vout2; - float32x4_t vout3; - float32x4_t vout4; - - for (int i = 0; i < cnt; ++i) { - float32x4_t vin1 = vld1q_f32(ptr_out); - float32x4_t vb1 = vld1q_f32(ptr_bias); - - float32x4_t vin2 = vld1q_f32(ptr_out + 4); - float32x4_t vb2 = vld1q_f32(ptr_bias + 4); - - float32x4_t vin3 = vld1q_f32(ptr_out + 8); - float32x4_t vb3 = vld1q_f32(ptr_bias + 8); - - float32x4_t vin4 = vld1q_f32(ptr_out + 12); - float32x4_t vb4 = vld1q_f32(ptr_bias + 12); - - vout1 = vaddq_f32(vin1, vb1); - vout2 = vaddq_f32(vin2, vb2); - vout3 = vaddq_f32(vin3, vb3); - vout4 = vaddq_f32(vin4, vb4); - - vst1q_f32(ptr_out, vout1); - vst1q_f32(ptr_out + 4, vout2); - vst1q_f32(ptr_out + 8, vout3); - vst1q_f32(ptr_out + 12, vout4); - - ptr_out += 16; - ptr_bias += 16; - } -#if 0 - if (cnt > 0) { - asm( - "1: \n" - "vld1.32 {d0-d1}, [%[ptr_out]] @ load data\n" - "vld1.32 {d2-d3}, [%[ptr_bias]]! @ load data\n" - "vadd.f32 q2, q0, q1 @ add bias\n" - "vst1.32 {d4-d5}, [%[ptr_out]]! @ store result\n" - "subs %[cnt], #1 @ loop count -1\n" - "bne 1b @ jump to main loop\n" - :[ptr_out] "+r"(ptr_out), [ptr_bias] "+r"(ptr_bias), \ - [cnt] "+r"(cnt) - : - :"q0", "q1", "q2" - ); - } -#endif - for (int i = 0; i < remain; ++i) { - *(ptr_out++) += *(ptr_bias++); - } - } -} - -template <> -void fill_bias_fc(int *out, const int *bias, int num, int channel) { - int cnt = channel >> 4; - int remain = channel & 15; - - for (int j = 0; j < num; ++j) { - const int *ptr_bias = bias; - int *ptr_out = out + j * channel; - - int32x4_t vout1; - int32x4_t vout2; - int32x4_t vout3; - int32x4_t vout4; - - for (int i = 0; i < cnt; ++i) { - int32x4_t vin1 = vld1q_s32(ptr_out); - int32x4_t vb1 = vld1q_s32(ptr_bias); - - int32x4_t vin2 = vld1q_s32(ptr_out + 4); - int32x4_t vb2 = vld1q_s32(ptr_bias + 4); - - int32x4_t vin3 = vld1q_s32(ptr_out + 8); - int32x4_t vb3 = vld1q_s32(ptr_bias + 8); - - int32x4_t vin4 = vld1q_s32(ptr_out + 12); - int32x4_t vb4 = vld1q_s32(ptr_bias + 12); - - vout1 = vaddq_s32(vin1, vb1); - vout2 = vaddq_s32(vin2, vb2); - vout3 = vaddq_s32(vin3, vb3); - vout4 = vaddq_s32(vin4, vb4); - - vst1q_s32(ptr_out, vout1); - vst1q_s32(ptr_out + 4, vout2); - vst1q_s32(ptr_out + 8, vout3); - vst1q_s32(ptr_out + 12, vout4); - - ptr_out += 16; - ptr_bias += 16; - } - -#if 0 - if (cnt > 0) { - asm( - "1: \n" - "vld1.32 {d0-d1}, [%[ptr_out]] @ load data\n" - "vld1.32 {d2-d3}, [%[ptr_bias]]! @ load data\n" - "vadd.s32 q2, q0, q1 @ add bias\n" - "vst1.32 {d4-d5}, [%[ptr_out]]! @ store result\n" - "subs %[cnt], #1 @ loop count -1\n" - "bne 1b @ jump to main loop\n" - :[ptr_out] "+r"(ptr_out), [ptr_bias] "+r"(ptr_bias), \ - [cnt] "+r"(cnt) - : - :"q0", "q1", "q2" - ); - } -#endif - for (int i = 0; i < remain; ++i) { - *(ptr_out++) += *(ptr_bias++); - } - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/funcs.h b/lite/backends/arm/math/funcs.h deleted file mode 100644 index 9438a997b6..0000000000 --- a/lite/backends/arm/math/funcs.h +++ /dev/null @@ -1,427 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include - -#include "lite/backends/arm/math/activation.h" -#include "lite/backends/arm/math/affine_channel.h" -#include "lite/backends/arm/math/anchor_generator.h" -#include "lite/backends/arm/math/argmax.h" -#include "lite/backends/arm/math/axpy.h" -#include "lite/backends/arm/math/beam_search.h" -#include "lite/backends/arm/math/box_coder.h" -#include "lite/backends/arm/math/col_im_transform.h" -#include "lite/backends/arm/math/concat.h" -#include "lite/backends/arm/math/conv_depthwise.h" -#include "lite/backends/arm/math/conv_direct.h" -#include "lite/backends/arm/math/conv_gemmlike.h" -#include "lite/backends/arm/math/conv_winograd.h" -#include "lite/backends/arm/math/decode_bboxes.h" -#include "lite/backends/arm/math/dropout.h" -#include "lite/backends/arm/math/elementwise.h" -#include "lite/backends/arm/math/fill_bias_relu.h" -#include "lite/backends/arm/math/im2sequence.h" -#include "lite/backends/arm/math/increment.h" -#include "lite/backends/arm/math/interpolate.h" -#include "lite/backends/arm/math/lrn.h" -#include "lite/backends/arm/math/negative.h" -#include "lite/backends/arm/math/norm.h" -#include "lite/backends/arm/math/packed_sgemm.h" -#include "lite/backends/arm/math/pad2d.h" -#include "lite/backends/arm/math/pooling.h" -#include "lite/backends/arm/math/power.h" -#include "lite/backends/arm/math/prior_box.h" -#include "lite/backends/arm/math/reduce_max.h" -#include "lite/backends/arm/math/reduce_mean.h" -#include "lite/backends/arm/math/scale.h" -#include "lite/backends/arm/math/sequence_expand.h" -#include "lite/backends/arm/math/sequence_pool.h" -#include "lite/backends/arm/math/sequence_softmax.h" -#include "lite/backends/arm/math/sgemm.h" -#include "lite/backends/arm/math/sgemv.h" -#include "lite/backends/arm/math/shuffle_channel.h" -#include "lite/backends/arm/math/slice.h" -#include "lite/backends/arm/math/softmax.h" -#include "lite/backends/arm/math/split.h" -#include "lite/backends/arm/math/stack.h" -#include "lite/backends/arm/math/topk.h" -#include "lite/backends/arm/math/yolo_box.h" -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -#define c_inv_mant_mask ~0x7f800000u -#define c_cephes_SQRTHF 0.707106781186547524 -#define c_cephes_log_p0 7.0376836292E-2 -#define c_cephes_log_p1 -1.1514610310E-1 -#define c_cephes_log_p2 1.1676998740E-1 -#define c_cephes_log_p3 -1.2420140846E-1 -#define c_cephes_log_p4 +1.4249322787E-1 -#define c_cephes_log_p5 -1.6668057665E-1 -#define c_cephes_log_p6 +2.0000714765E-1 -#define c_cephes_log_p7 -2.4999993993E-1 -#define c_cephes_log_p8 +3.3333331174E-1 -#define c_cephes_log_q1 -2.12194440e-4 -#define c_cephes_log_q2 0.693359375 - -// natural logarithm computed for 4 simultaneous float -// return NaN for x <= 0 -inline float32x4_t log_ps(float32x4_t x) { - float32x4_t one = vdupq_n_f32(1); - - x = vmaxq_f32(x, vdupq_n_f32(0)); // force flush to zero on denormal values - uint32x4_t invalid_mask = vcleq_f32(x, 
vdupq_n_f32(0)); - - int32x4_t ux = vreinterpretq_s32_f32(x); - - int32x4_t emm0 = vshrq_n_s32(ux, 23); - - // keep only the fractional part - ux = vandq_s32(ux, vdupq_n_s32(c_inv_mant_mask)); - ux = vorrq_s32(ux, vreinterpretq_s32_f32(vdupq_n_f32(0.5f))); - x = vreinterpretq_f32_s32(ux); - - emm0 = vsubq_s32(emm0, vdupq_n_s32(0x7f)); - float32x4_t e = vcvtq_f32_s32(emm0); - - e = vaddq_f32(e, one); - - // part2: - // if( x < SQRTHF ) { - // e -= 1; - // x = x + x - 1.0; - // } else { - // x = x - 1.0; - // } - // - uint32x4_t mask = vcltq_f32(x, vdupq_n_f32(c_cephes_SQRTHF)); - float32x4_t tmp = - vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(x), mask)); - x = vsubq_f32(x, one); - e = vsubq_f32( - e, vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(one), mask))); - x = vaddq_f32(x, tmp); - - float32x4_t z = vmulq_f32(x, x); - - float32x4_t y = vdupq_n_f32(c_cephes_log_p0); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p1)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p2)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p3)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p4)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p5)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p6)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p7)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p8)); - y = vmulq_f32(y, x); - - y = vmulq_f32(y, z); - - tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q1)); - y = vaddq_f32(y, tmp); - - tmp = vmulq_f32(z, vdupq_n_f32(0.5f)); - y = vsubq_f32(y, tmp); - - tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q2)); - x = vaddq_f32(x, y); - x = vaddq_f32(x, tmp); - x = vreinterpretq_f32_u32(vorrq_u32( - vreinterpretq_u32_f32(x), invalid_mask)); // negative arg will be NAN - return x; -} - -#define c_exp_hi 88.3762626647949f -#define c_exp_lo -88.3762626647949f - -#define c_cephes_LOG2EF 1.44269504088896341 -#define c_cephes_exp_C1 0.693359375 -#define c_cephes_exp_C2 -2.12194440e-4 - -#define c_cephes_exp_p0 1.9875691500E-4 -#define c_cephes_exp_p1 1.3981999507E-3 -#define c_cephes_exp_p2 8.3334519073E-3 -#define c_cephes_exp_p3 4.1665795894E-2 -#define c_cephes_exp_p4 1.6666665459E-1 -#define c_cephes_exp_p5 5.0000001201E-1 - -// exp() computed for 4 float at once -inline float32x4_t exp_ps(float32x4_t x) { - float32x4_t tmp, fx; - - float32x4_t one = vdupq_n_f32(1); - x = vminq_f32(x, vdupq_n_f32(c_exp_hi)); - x = vmaxq_f32(x, vdupq_n_f32(c_exp_lo)); - - // express exp(x) as exp(g + n*log(2)) - fx = vmlaq_f32(vdupq_n_f32(0.5f), x, vdupq_n_f32(c_cephes_LOG2EF)); - - // perform a floorf - tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx)); - - // if greater, substract 1 - uint32x4_t mask = vcgtq_f32(tmp, fx); - mask = vandq_u32(mask, vreinterpretq_u32_f32(one)); - - fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask)); - - tmp = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C1)); - float32x4_t z = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C2)); - x = vsubq_f32(x, tmp); - x = vsubq_f32(x, z); - - static const float cephes_exp_p[6] = {c_cephes_exp_p0, - c_cephes_exp_p1, - c_cephes_exp_p2, - c_cephes_exp_p3, - c_cephes_exp_p4, - c_cephes_exp_p5}; - float32x4_t y = vld1q_dup_f32(cephes_exp_p + 0); - float32x4_t c1 = vld1q_dup_f32(cephes_exp_p + 1); - float32x4_t c2 = vld1q_dup_f32(cephes_exp_p + 2); - float32x4_t c3 = vld1q_dup_f32(cephes_exp_p + 3); - float32x4_t c4 = vld1q_dup_f32(cephes_exp_p + 4); - float32x4_t c5 = 
vld1q_dup_f32(cephes_exp_p + 5); - - y = vmulq_f32(y, x); - z = vmulq_f32(x, x); - - y = vaddq_f32(y, c1); - y = vmulq_f32(y, x); - y = vaddq_f32(y, c2); - y = vmulq_f32(y, x); - y = vaddq_f32(y, c3); - y = vmulq_f32(y, x); - y = vaddq_f32(y, c4); - y = vmulq_f32(y, x); - y = vaddq_f32(y, c5); - - y = vmulq_f32(y, z); - y = vaddq_f32(y, x); - y = vaddq_f32(y, one); - - // build 2^n - int32x4_t mm; - mm = vcvtq_s32_f32(fx); - mm = vaddq_s32(mm, vdupq_n_s32(0x7f)); - mm = vshlq_n_s32(mm, 23); - float32x4_t pow2n = vreinterpretq_f32_s32(mm); - - y = vmulq_f32(y, pow2n); - return y; -} - -#define c_minus_cephes_DP1 -0.78515625 -#define c_minus_cephes_DP2 -2.4187564849853515625e-4 -#define c_minus_cephes_DP3 -3.77489497744594108e-8 -#define c_sincof_p0 -1.9515295891E-4 -#define c_sincof_p1 8.3321608736E-3 -#define c_sincof_p2 -1.6666654611E-1 -#define c_coscof_p0 2.443315711809948E-005 -#define c_coscof_p1 -1.388731625493765E-003 -#define c_coscof_p2 4.166664568298827E-002 -#define c_cephes_FOPI 1.27323954473516 // 4 / M_PI - -// evaluation of 4 sines & cosines at once. -// -// The code is the exact rewriting of the cephes sinf function. -// Precision is excellent as long as x < 8192 (I did not bother to -// take into account the special handling they have for greater values -// -- it does not return garbage for arguments over 8192, though, but -// the extra precision is missing). -// -// Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the -// surprising but correct result. -// -// Note also that when you compute sin(x), cos(x) is available at -// almost no extra price so both sin_ps and cos_ps make use of -// sincos_ps.. -// -inline void sincos_ps(float32x4_t x, float32x4_t *ysin, float32x4_t *ycos) { - // any x - float32x4_t xmm1, xmm2, xmm3, y; - - uint32x4_t emm2; - - uint32x4_t sign_mask_sin, sign_mask_cos; - sign_mask_sin = vcltq_f32(x, vdupq_n_f32(0)); - x = vabsq_f32(x); - - // scale by 4/Pi - y = vmulq_f32(x, vdupq_n_f32(c_cephes_FOPI)); - - // store the integer part of y in mm0 - emm2 = vcvtq_u32_f32(y); - // j=(j+1) & (~1) (see the cephes sources) - emm2 = vaddq_u32(emm2, vdupq_n_u32(1)); - emm2 = vandq_u32(emm2, vdupq_n_u32(~1)); - y = vcvtq_f32_u32(emm2); - - // get the polynom selection mask - // there is one polynom for 0 <= x <= Pi/4 - // and another one for Pi/4 -void fill_bias_fc(T *tensor, const T *bias, int num, int channel); - -template -inline float32x4_t vactive_f32(const float32x4_t &x) { - return x; -} - -template <> -inline float32x4_t vactive_f32( - const float32x4_t &x) { - float32x4_t __zero = vdupq_n_f32(0.f); - return vmaxq_f32(x, __zero); -} - -template <> -inline float32x4_t vactive_f32( - const float32x4_t &x) { - float32x4_t __zero = vdupq_n_f32(0.f); - float32x4_t __six = vdupq_n_f32(6.f); - return vminq_f32(vmaxq_f32(x, __zero), __six); -} - -template <> -inline float32x4_t vactive_f32( - const float32x4_t &x) { - float32x4_t __one = vdupq_n_f32(1.f); - float32x4_t __x = vnegq_f32(x); - __x = exp_ps(__x); - __x = vaddq_f32(__x, __one); - float32x4_t __out = vrecpeq_f32(__x); - return vmulq_f32(vrecpsq_f32(__x, __out), __out); -} - -template <> -inline float32x4_t vactive_f32( - const float32x4_t &x) { - float32x4_t __one = vdupq_n_f32(1.f); - float32x4_t __x = vmulq_n_f32(x, -2.f); - __x = exp_ps(__x); - __x = vaddq_f32(__x, __one); - float32x4_t __out = vrecpeq_f32(__x); - __out = vmulq_f32(vrecpsq_f32(__x, __out), __out); - __out = vmulq_n_f32(__out, 2.f); - return vsubq_f32(__out, __one); -} - -template -inline float active_f32(const float 
&x) { - return x; -} - -template <> -inline float active_f32(const float &x) { - return std::max(x, 0.f); -} - -template <> -inline float active_f32(const float &x) { - return std::min(std::max(x, 0.f), 6.f); -} - -template <> -inline float active_f32(const float &x) { - return 1.f / (1.f + exp(-x)); -} - -template <> -inline float active_f32(const float &x) { - return 2.f / (1.f + exp(-2.f * x)) - 1.f; -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/gemm_prepacked_int8.cc b/lite/backends/arm/math/gemm_prepacked_int8.cc deleted file mode 100644 index 9efae11157..0000000000 --- a/lite/backends/arm/math/gemm_prepacked_int8.cc +++ /dev/null @@ -1,3942 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/arm/math/gemm_prepacked_int8.h" -#include -#include "lite/backends/arm/math/dot_toolchain_support.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void prepackA_m4k2x2_int8(int8_t* out, - const int8_t* in, - int ldin, - int m0, - int mmax, - int k0, - int kmax); - -void prepackA_m4k2x2_trans_int8(int8_t* out, - const int8_t* in, - int ldin, - int m0, - int mmax, - int k0, - int kmax); - -void packb_int8(int8_t* out, - const int8_t* in, - int ldin, - int k0, - int kmax, - int n0, - int nmax, - const int8_t* zerobuf); - -void packb_trans_int8(int8_t* out, - const int8_t* in, - int ldin, - int k0, - int kmax, - int n0, - int nmax, - const int8_t* zerobuf); - -#ifdef WITH_ARM_DOTPROD -void prepackA_m8k4_int8(int8_t* out, - const int8_t* in, - int ldin, - int m0, - int mmax, - int k0, - int kmax); - -void prepackA_m8k4_trans_int8(int8_t* out, - const int8_t* in, - int ldin, - int m0, - int mmax, - int k0, - int kmax); - -void packb_sdot_int8(int8_t* out, - const int8_t* in, - int ldin, - int k0, - int kmax, - int n0, - int nmax); - -void packb_sdot_trans_int8(int8_t* out, - const int8_t* in, - int ldin, - int k0, - int kmax, - int n0, - int nmax); -#endif - -void prepackA_int8(void* out, - const void* in, - int ldin, - int m0, - int mmax, - int k0, - int kmax, - bool is_trans, - ARMContext* ctx) { -#if defined(__aarch64__) && defined(WITH_ARM_DOTPROD) - if (is_trans) { - if (ctx->has_dot()) { - prepackA_m8k4_trans_int8(static_cast(out), - static_cast(in), - ldin, - m0, - mmax, - k0, - kmax); - } else { - prepackA_m4k2x2_trans_int8(static_cast(out), - static_cast(in), - ldin, - m0, - mmax, - k0, - kmax); - } - } else { - if (ctx->has_dot()) { - prepackA_m8k4_int8(static_cast(out), - static_cast(in), - ldin, - m0, - mmax, - k0, - kmax); - } else { - prepackA_m4k2x2_int8(static_cast(out), - static_cast(in), - ldin, - m0, - mmax, - k0, - kmax); - } - } -#else - if (is_trans) { - prepackA_m4k2x2_trans_int8(static_cast(out), - static_cast(in), - ldin, - m0, - mmax, - k0, - kmax); - } else { - prepackA_m4k2x2_int8(static_cast(out), - static_cast(in), - ldin, - m0, - mmax, - k0, - kmax); - } 
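  // The dispatch above mirrors the kernel choice: on armv8 builds where
  // ctx->has_dot() reports the sdot extension, A is packed in the m8k4
  // layout consumed by the dot-product GEMM kernels; otherwise, and on all
  // armv7 builds via the #else branch, A is packed m4k2x2 for the
  // smull/smlal2 kernels, so the packed layout always matches the GEMM
  // kernel that will read it.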
-#endif -} - -void prepackA_int8(TensorLite* tout, - const TensorLite& tin, - int m, - int k, - int group, - bool is_trans, - ARMContext* ctx) { - int hblock = get_hblock_int8(ctx); - int m_roundup = ROUNDUP(m, hblock); - // round up to 128 bits - int kup = ROUNDUP(k, KBLOCK_INT8); - int group_size_round_up = ((m_roundup * kup + 15) / 16) * 16; - - if (tout->numel() < group_size_round_up * group) { - tout->Resize({1, 1, 1, group_size_round_up * group}); - } - int lda = k; - if (is_trans) { - lda = m; - } - for (int g = 0; g < group; ++g) { - const char* weights_group = tin.data() + g * m * k; - char* weights_trans_ptr = - tout->mutable_data() + g * group_size_round_up; - prepackA_int8( - weights_trans_ptr, weights_group, lda, 0, m, 0, k, is_trans, ctx); - } -} - -template -inline void gemm_int8_kernel(const int8_t* a_ptr, - const int8_t*& b_ptr, // NOLINT - const int32_t* bias, - Dtype*& c_ptr0, // NOLINT - Dtype*& c_ptr1, // NOLINT - Dtype*& c_ptr2, // NOLINT - Dtype*& c_ptr3, // NOLINT - const float* scale, - bool is_relu, - int k, - int rem); -#ifdef __aarch64__ -#define GEMM_INT8_KERNEL \ - "ld1 {v0.16b}, [%[a_ptr]],#16\n" /* load a to q0, q1 */ \ - "ld1 {v4.16b, v5.16b}, [%[b_ptr]],#32\n" /* load b to q4, q5 */ \ - "ld1 {v6.16b, v7.16b}, [%[b_ptr]],#32\n" /* load b to q6, q7 */ \ - "ldr q8, [%[bias]]\n" /* load bias */ \ - "ext v9.16b, v8.16b, v8.16b, #4\n" /* shift left 1s */ \ - "ext v10.16b, v8.16b, v8.16b, #8\n" /* shift left 2s */ \ - "ext v11.16b, v8.16b, v8.16b, #12\n" /* shift left 3s */ \ - "and v16.16b, v8.16b, v8.16b\n" /* set bias0 to out00 */ \ - "and v17.16b, v9.16b, v9.16b\n" /* set bias0 to out01 */ \ - "prfm pldl1keep, [%[a_ptr], #64]\n" /* preload a*/ \ - "and v18.16b, v10.16b, v10.16b\n" /* set bias0 to out02 */ \ - "and v19.16b, v11.16b, v11.16b\n" /* set bias0 to out03 */ \ - "prfm pldl1keep, [%[b_ptr], #64]\n" /* preload b*/ \ - "and v20.16b, v8.16b, v8.16b\n" /* set bias0 to out10 */ \ - "and v21.16b, v9.16b, v9.16b\n" /* set bias0 to out11 */ \ - "prfm pldl1keep, [%[a_ptr], #128]\n" /* preload a*/ \ - "and v22.16b, v10.16b, v10.16b\n" /* set bias0 to out12 */ \ - "and v23.16b, v11.16b, v11.16b\n" /* set bias0 to out13 */ \ - "prfm pldl1keep, [%[b_ptr], #128]\n" /* preload b*/ \ - "and v24.16b, v8.16b, v8.16b\n" /* set bias0 to out20 */ \ - "and v25.16b, v9.16b, v9.16b\n" /* set bias0 to out21 */ \ - "prfm pldl1keep, [%[a_ptr], #192]\n" /* preload a*/ \ - "and v26.16b, v10.16b, v10.16b\n" /* set bias0 to out22 */ \ - "and v27.16b, v11.16b, v11.16b\n" /* set bias0 to out23 */ \ - "prfm pldl1keep, [%[b_ptr], #192]\n" /* preload b*/ \ - "and v28.16b, v8.16b, v8.16b\n" /* set bias0 to out30 */ \ - "and v29.16b, v9.16b, v9.16b\n" /* set bias0 to out31 */ \ - "prfm pldl1keep, [%[b_ptr], #256]\n" /* preload b*/ \ - "and v30.16b, v10.16b, v10.16b\n" /* set bias0 to out32 */ \ - "and v31.16b, v11.16b, v11.16b\n" /* set bias0 to out33 */ \ - "ext v1.16b, v0.16b, v0.16b, #2\n" /* shift left 2bytes */ \ - "ins v1.h[3], v0.h[0]\n" /* insert element */ \ - "ins v1.h[7], v0.h[4]\n" /* insert element */ \ - "rev64 v2.4s, v0.4s\n" /* get low: 22,33,00,11; hi: 66,77,44,55 */ \ - "rev64 v3.4s, v1.4s\n" /* get low: 33,00,11,22; hi: 77,44,55,66 */ \ - "prfm pldl1keep, [%[b_ptr], #320]\n" /* preload a*/ \ - "prfm pldl1keep, [%[b_ptr], #384]\n" /* preload b*/ \ - "cbz %w[k], 3f\n" /* if k = 0, jump to remains */ /* 1st b0, b1 */ \ - "smull v8.8h, v0.8b, v4.8b\n" /* a0 * b0 = c00 */ \ - "smull v12.8h, v0.8b, v5.8b\n" /* a0 * b1 = c01 */ \ - "smull v9.8h, v1.8b, v4.8b\n" /* a1 * b0 = 
c10 */ \ - "smull v13.8h, v1.8b, v5.8b\n" /* a1 * b1 = c11 */ \ - "smull v10.8h, v2.8b, v4.8b\n" /* a2 * b0 = c20 */ \ - "smull v14.8h, v2.8b, v5.8b\n" /* a2 * b1 = c21 */ \ - "smull v11.8h, v3.8b, v4.8b\n" /* a3 * b0 = c30 */ \ - "smull v15.8h, v3.8b, v5.8b\n" /* a3 * b1 = c31 */ \ - "subs %w[k], %w[k], #1\n" /* loop count -1 */ /* 2nd b0, b1 */ \ - "smlal2 v8.8h, v0.16b, v4.16b\n" /* a0 * b0 = c00 */ \ - "smlal2 v12.8h, v0.16b, v5.16b\n" /* a0 * b1 = c01 */ \ - "smlal2 v9.8h, v1.16b, v4.16b\n" /* a1 * b0 = c10 */ \ - "smlal2 v13.8h, v1.16b, v5.16b\n" /* a1 * b1 = c11 */ \ - "smlal2 v10.8h, v2.16b, v4.16b\n" /* a2 * b0 = c20 */ \ - "smlal2 v14.8h, v2.16b, v5.16b\n" /* a2 * b1 = c21 */ \ - "smlal2 v11.8h, v3.16b, v4.16b\n" /* a3 * b0 = c30 */ \ - "smlal2 v15.8h, v3.16b, v5.16b\n" /* a3 * b1 = c31 */ \ - "beq 8f\n" /* skip main loop */ /* main loop*/ \ - "0:\n" /* main loop */ \ - "ld1 {v4.16b, v5.16b}, [%[b_ptr]],#32\n" /* load b to q4, q5 */ \ - "sadalp v16.4s, v8.8h\n" /* pairwise accumulate to int32, out00 */ \ - "smull v8.8h, v0.8b, v6.8b\n" /* a0 * b2 = c02 */ \ - "sadalp v20.4s, v12.8h\n" /* pairwise accumulate to int32, out01 */ \ - "smull v12.8h, v0.8b, v7.8b\n" /* a0 * b3 = c03 */ \ - "sadalp v17.4s, v9.8h\n" /* pairwise accumulate to int32, out10 */ \ - "smull v9.8h, v1.8b, v6.8b\n" /* a1 * b2 = c12 */ \ - "sadalp v21.4s, v13.8h\n" /* pairwise accumulate to int32, out11 */ \ - "smull v13.8h, v1.8b, v7.8b\n" /* a1 * b3 = c13 */ \ - "sadalp v18.4s, v10.8h\n" /* pairwise accumulate to int32, out20 */ \ - "smull v10.8h, v2.8b, v6.8b\n" /* a2 * b2 = c22 */ \ - "sadalp v22.4s, v14.8h\n" /* pairwise accumulate to int32, out21 */ \ - "smull v14.8h, v2.8b, v7.8b\n" /* a2 * b3 = c23 */ \ - "sadalp v19.4s, v11.8h\n" /* pairwise accumulate to int32, out30 */ \ - "smlal2 v8.8h, v0.16b, v6.16b\n" /* a0 * b2 = c02 */ \ - "smlal2 v12.8h, v0.16b, v7.16b\n" /* a0 * b3 = c03 */ \ - "ld1 {v0.16b}, [%[a_ptr]],#16\n" /* load a to q0, q1 */ \ - "smull v11.8h, v3.8b, v6.8b\n" /* a3 * b2 = c32 */ \ - "sadalp v23.4s, v15.8h\n" /* pairwise accumulate to int32, out31 */ \ - "smull v15.8h, v3.8b, v7.8b\n" /* a3 * b3 = c33 */ /* 2nd b2, b3 */ \ - "smlal2 v9.8h, v1.16b, v6.16b\n" /* a1 * b2 = c12 */ \ - "smlal2 v13.8h, v1.16b, v7.16b\n" /* a1 * b3 = c13 */ \ - "smlal2 v10.8h, v2.16b, v6.16b\n" /* a2 * b2 = c22 */ \ - "ext v1.16b, v0.16b, v0.16b, #2\n" /* shift left 2bytes*/ \ - "ins v1.h[3], v0.h[0]\n" /* insert element */ \ - "ins v1.h[7], v0.h[4]\n" /* insert element */ \ - "smlal2 v14.8h, v2.16b, v7.16b\n" /* a2 * b3 = c23 */ \ - "smlal2 v11.8h, v3.16b, v6.16b\n" /* a3 * b2 = c32 */ \ - "smlal2 v15.8h, v3.16b, v7.16b\n" /* a3 * b3 = c33 */ /* pre-process a */ \ - "rev64 v2.4s, v0.4s\n" /* get low: 22,33,00,11; hi: 66,77,44,55 */ \ - "rev64 v3.4s, v1.4s\n" /* get low: 33,00,11,22; hi: 77,44,55,66 */ \ - "ld1 {v6.16b, v7.16b}, [%[b_ptr]],#32\n" /* load b to q6, q7 */ \ - "sadalp v24.4s, v8.8h\n" /* pairwise accumulate to int32, out02 */ \ - "smull v8.8h, v0.8b, v4.8b\n" /* a0 * b0 = c00 */ \ - "sadalp v28.4s, v12.8h\n" /* pairwise accumulate to int32, out03 */ \ - "smull v12.8h, v0.8b, v5.8b\n" /* a0 * b1 = c01 */ \ - "sadalp v25.4s, v9.8h\n" /* pairwise accumulate to int32, out12 */ \ - "smull v9.8h, v1.8b, v4.8b\n" /* a1 * b0 = c00 */ \ - "sadalp v29.4s, v13.8h\n" /* pairwise accumulate to int32, out13 */ \ - "smull v13.8h, v1.8b, v5.8b\n" /* a1 * b1 = c01 */ \ - "sadalp v26.4s, v10.8h\n" /* pairwise accumulate to int32, out22 */ \ - "smull v10.8h, v2.8b, v4.8b\n" /* a2 * b0 = c00 */ \ - "sadalp v30.4s, 
v14.8h\n" /* pairwise accumulate to int32, out23 */ \ - "smull v14.8h, v2.8b, v5.8b\n" /* a2 * b1 = c01 */ \ - "sadalp v27.4s, v11.8h\n" /* pairwise accumulate to int32, out32 */ \ - "smull v11.8h, v3.8b, v4.8b\n" /* a3 * b0 = c00 */ \ - "sadalp v31.4s, v15.8h\n" /* pairwise accumulate to int32, out33 */ \ - "smull v15.8h, v3.8b, v5.8b\n" /* a3 * b1 = c01 */ \ - "subs %w[k], %w[k], #1\n" /* loop count -1 */ /* 2nd b0, b1 */ \ - "smlal2 v8.8h, v0.16b, v4.16b\n" /* a0 * b0 = c00 */ \ - "smlal2 v12.8h, v0.16b, v5.16b\n" /* a0 * b1 = c01 */ \ - "smlal2 v9.8h, v1.16b, v4.16b\n" /* a1 * b0 = c10 */ \ - "smlal2 v13.8h, v1.16b, v5.16b\n" /* a1 * b1 = c11 */ \ - "smlal2 v10.8h, v2.16b, v4.16b\n" /* a2 * b0 = c20 */ \ - "smlal2 v14.8h, v2.16b, v5.16b\n" /* a2 * b1 = c21 */ \ - "smlal2 v11.8h, v3.16b, v4.16b\n" /* a3 * b0 = c30 */ \ - "smlal2 v15.8h, v3.16b, v5.16b\n" /* a3 * b1 = c31 */ \ - "bgt 0b\n" /* jump to main loop */ \ - "8:\n" /* finish main loop */ /* 1st b2, b3 */ \ - "sadalp v16.4s, v8.8h\n" /* pairwise accumulate to int32, out00 */ \ - "smull v8.8h, v0.8b, v6.8b\n" /* a0 * b0 = c02 */ \ - "sadalp v20.4s, v12.8h\n" /* pairwise accumulate to int32, out01 */ \ - "smull v12.8h, v0.8b, v7.8b\n" /* a0 * b1 = c03 */ \ - "sadalp v17.4s, v9.8h\n" /* pairwise accumulate to int32, out10 */ \ - "smull v9.8h, v1.8b, v6.8b\n" /* a1 * b0 = c12 */ \ - "sadalp v21.4s, v13.8h\n" /* pairwise accumulate to int32, out11 */ \ - "smull v13.8h, v1.8b, v7.8b\n" /* a1 * b1 = c13 */ \ - "sadalp v18.4s, v10.8h\n" /* pairwise accumulate to int32, out20 */ \ - "smull v10.8h, v2.8b, v6.8b\n" /* a2 * b0 = c22 */ \ - "sadalp v22.4s, v14.8h\n" /* pairwise accumulate to int32, out21 */ \ - "smull v14.8h, v2.8b, v7.8b\n" /* a2 * b1 = c23 */ \ - "sadalp v19.4s, v11.8h\n" /* pairwise accumulate to int32, out30 */ \ - "smull v11.8h, v3.8b, v6.8b\n" /* a3 * b0 = c32 */ \ - "sadalp v23.4s, v15.8h\n" /* pairwise accumulate to int32, out31 */ \ - "smull v15.8h, v3.8b, v7.8b\n" /* a3 * b1 = c33 */ /* 2nd b2, b3 */ \ - "smlal2 v8.8h, v0.16b, v6.16b\n" /* a0 * b0 = c02 */ \ - "smlal2 v12.8h, v0.16b, v7.16b\n" /* a0 * b1 = c03 */ \ - "smlal2 v9.8h, v1.16b, v6.16b\n" /* a1 * b0 = c12 */ \ - "smlal2 v13.8h, v1.16b, v7.16b\n" /* a1 * b1 = c23 */ \ - "smlal2 v10.8h, v2.16b, v6.16b\n" /* a2 * b0 = c13 */ \ - "smlal2 v14.8h, v2.16b, v7.16b\n" /* a2 * b1 = c32 */ \ - "smlal2 v11.8h, v3.16b, v6.16b\n" /* a3 * b0 = c22 */ \ - "smlal2 v15.8h, v3.16b, v7.16b\n" /* a3 * b1 = c33 */ \ - "cbz %w[rem], 5f\n" /* skip remain */ \ - "ld1 {v0.8b}, [%[a_ptr]]\n" /* load a to q0, final */ \ - "ld1 {v4.16b, v5.16b}, [%[b_ptr]],#32\n" /* load b to q4, q5 */ \ - "ld1 {v6.16b, v7.16b}, [%[b_ptr]],#32\n" /* load b to q6, q7 */ \ - "5:\n" /* no remain */ \ - "sadalp v24.4s, v8.8h\n" /* pairwise accumulate to int32, out02 */ \ - "sadalp v28.4s, v12.8h\n" /* pairwise accumulate to int32, out03 */ \ - "sadalp v25.4s, v9.8h\n" /* pairwise accumulate to int32, out12 */ \ - "sadalp v29.4s, v13.8h\n" /* pairwise accumulate to int32, out13 */ \ - "sadalp v26.4s, v10.8h\n" /* pairwise accumulate to int32, out22 */ \ - "sadalp v30.4s, v14.8h\n" /* pairwise accumulate to int32, out23 */ \ - "sadalp v27.4s, v11.8h\n" /* pairwise accumulate to int32, out32 */ \ - "sadalp v31.4s, v15.8h\n" /* pairwise accumulate to int32, out33 */ \ - "3: \n" /* process remains */ \ - "cbz %w[rem], 7f\n" /* skip remain */ /* process remain k */ \ - "4: \n" /* remain = 1, 2 */ \ - "ext v1.8b, v0.8b, v0.8b, #2\n" /* shift left 2bytes */ \ - "ext v2.8b, v0.8b, v0.8b, #4\n" /* shift left 
4bytes */ \
- "ext v3.8b, v0.8b, v0.8b, #6\n" /* shift left 6bytes */ /* 1st b0, b1 */ \
- "smull v8.8h, v0.8b, v4.8b\n" /* a0 * b0 = c00 */ \
- "smull v12.8h, v0.8b, v5.8b\n" /* a0 * b1 = c01 */ \
- "smull v9.8h, v1.8b, v4.8b\n" /* a1 * b0 = c10 */ \
- "smull v13.8h, v1.8b, v5.8b\n" /* a1 * b1 = c11 */ \
- "smull v10.8h, v2.8b, v4.8b\n" /* a2 * b0 = c20 */ \
- "smull v14.8h, v2.8b, v5.8b\n" /* a2 * b1 = c21 */ \
- "smull v11.8h, v3.8b, v4.8b\n" /* a3 * b0 = c30 */ \
- "smull v15.8h, v3.8b, v5.8b\n" /* a3 * b1 = c31 */ /* 1st b2, b3 */ \
- "sadalp v16.4s, v8.8h\n" /* pairwise accumulate to int32, out00 */ \
- "smull v8.8h, v0.8b, v6.8b\n" /* a0 * b2 = c02 */ \
- "sadalp v20.4s, v12.8h\n" /* pairwise accumulate to int32, out01 */ \
- "smull v12.8h, v0.8b, v7.8b\n" /* a0 * b3 = c03 */ \
- "sadalp v17.4s, v9.8h\n" /* pairwise accumulate to int32, out10 */ \
- "smull v9.8h, v1.8b, v6.8b\n" /* a1 * b2 = c12 */ \
- "sadalp v21.4s, v13.8h\n" /* pairwise accumulate to int32, out11 */ \
- "smull v13.8h, v1.8b, v7.8b\n" /* a1 * b3 = c13 */ \
- "sadalp v18.4s, v10.8h\n" /* pairwise accumulate to int32, out20 */ \
- "smull v10.8h, v2.8b, v6.8b\n" /* a2 * b2 = c22 */ \
- "sadalp v22.4s, v14.8h\n" /* pairwise accumulate to int32, out21 */ \
- "smull v14.8h, v2.8b, v7.8b\n" /* a2 * b3 = c23 */ \
- "sadalp v19.4s, v11.8h\n" /* pairwise accumulate to int32, out30 */ \
- "smull v11.8h, v3.8b, v6.8b\n" /* a3 * b2 = c32 */ \
- "sadalp v23.4s, v15.8h\n" /* pairwise accumulate to int32, out31 */ \
- "smull v15.8h, v3.8b, v7.8b\n" /* a3 * b3 = c33 */ \
- "sadalp v24.4s, v8.8h\n" /* pairwise accumulate to int32, out02 */ \
- "sadalp v28.4s, v12.8h\n" /* pairwise accumulate to int32, out03 */ \
- "sadalp v25.4s, v9.8h\n" /* pairwise accumulate to int32, out12 */ \
- "sadalp v29.4s, v13.8h\n" /* pairwise accumulate to int32, out13 */ \
- "sadalp v26.4s, v10.8h\n" /* pairwise accumulate to int32, out22 */ \
- "sadalp v30.4s, v14.8h\n" /* pairwise accumulate to int32, out23 */ \
- "sadalp v27.4s, v11.8h\n" /* pairwise accumulate to int32, out32 */ \
- "sadalp v31.4s, v15.8h\n" /* pairwise accumulate to int32, out33 */ \
- "7: \n" /* do relu */ \
- "cbz %w[is_relu], 9f\n" /* not relu, jump to unpack */ \
- "movi v0.4s, #0\n" /* for relu */ \
- "smax v16.4s, v16.4s, v0.4s\n" /* relu */ \
- "smax v17.4s, v17.4s, v0.4s\n" /* relu */ \
- "smax v18.4s, v18.4s, v0.4s\n" /* relu */ \
- "smax v19.4s, v19.4s, v0.4s\n" /* relu */ \
- "smax v20.4s, v20.4s, v0.4s\n" /* relu */ \
- "smax v21.4s, v21.4s, v0.4s\n" /* relu */ \
- "smax v22.4s, v22.4s, v0.4s\n" /* relu */ \
- "smax v23.4s, v23.4s, v0.4s\n" /* relu */ \
- "smax v24.4s, v24.4s, v0.4s\n" /* relu */ \
- "smax v25.4s, v25.4s, v0.4s\n" /* relu */ \
- "smax v26.4s, v26.4s, v0.4s\n" /* relu */ \
- "smax v27.4s, v27.4s, v0.4s\n" /* relu */ \
- "smax v28.4s, v28.4s, v0.4s\n" /* relu */ \
- "smax v29.4s, v29.4s, v0.4s\n" /* relu */ \
- "smax v30.4s, v30.4s, v0.4s\n" /* relu */ \
- "smax v31.4s, v31.4s, v0.4s\n" /* relu */ /* unpack the result */ \
- "9:\n" /* unpack */ /* trans 1 */ \
- "trn1 v0.4s, v16.4s, v17.4s\n" /* get a0,b0, a2,b2 */ \
- "trn2 v1.4s, v16.4s, v17.4s\n" /* get a1,b1, a3,b3 */ \
- "trn1 v2.4s, v18.4s, v19.4s\n" /* get c0,d0, c2,d2 */ \
- "trn2 v3.4s, v18.4s, v19.4s\n" /* get c1,d1, c3,d3 */ \
- "trn1 v4.4s, v20.4s, v21.4s\n" \
- "trn2 v5.4s, v20.4s, v21.4s\n" \
- "trn1 v6.4s, v22.4s, v23.4s\n" \
- "trn2 v7.4s, v22.4s, v23.4s\n" \
- "trn1 v8.4s, v24.4s, v25.4s\n" \
- "trn2 v9.4s, v24.4s, v25.4s\n" \
- "trn1 v10.4s, v26.4s, v27.4s\n" \
- "trn2 
v11.4s, v26.4s, v27.4s\n" \ - "trn1 v12.4s, v28.4s, v29.4s\n" \ - "trn2 v13.4s, v28.4s, v29.4s\n" \ - "trn1 v14.4s, v30.4s, v31.4s\n" \ - "trn2 v15.4s, v30.4s, v31.4s\n" /* trans 2 */ \ - "trn1 v16.2d, v0.2d, v2.2d\n" /* get a0,b0, c0,d0 */ \ - "trn2 v18.2d, v0.2d, v2.2d\n" /* get a2,b2, c2,d2 */ \ - "trn1 v17.2d, v1.2d, v3.2d\n" /* get a1,b1, c1,d1 */ \ - "trn2 v19.2d, v1.2d, v3.2d\n" /* get a3,b3, c3,d3 */ \ - "trn1 v20.2d, v4.2d, v6.2d\n" \ - "trn2 v22.2d, v4.2d, v6.2d\n" \ - "trn1 v21.2d, v5.2d, v7.2d\n" \ - "trn2 v23.2d, v5.2d, v7.2d\n" \ - "trn1 v24.2d, v8.2d, v10.2d\n" \ - "trn2 v26.2d, v8.2d, v10.2d\n" \ - "trn1 v25.2d, v9.2d, v11.2d\n" \ - "trn2 v27.2d, v9.2d, v11.2d\n" \ - "trn1 v28.2d, v12.2d, v14.2d\n" \ - "trn2 v30.2d, v12.2d, v14.2d\n" \ - "trn1 v29.2d, v13.2d, v15.2d\n" \ - "trn2 v31.2d, v13.2d, v15.2d\n" /* shift */ \ - "ext v17.16b, v17.16b, v17.16b, #12\n" /* circular shift left 1 */ \ - "ext v18.16b, v18.16b, v18.16b, #8\n" /* circular shift left 2 */ \ - "ext v19.16b, v19.16b, v19.16b, #4\n" /* circular shift left 3 */ \ - "ext v21.16b, v21.16b, v21.16b, #12\n" /* circular shift left 1 */ \ - "ext v22.16b, v22.16b, v22.16b, #8\n" /* circular shift left 2 */ \ - "ext v23.16b, v23.16b, v23.16b, #4\n" /* circular shift left 3 */ \ - "ext v25.16b, v25.16b, v25.16b, #12\n" /* circular shift left 1 */ \ - "ext v26.16b, v26.16b, v26.16b, #8\n" /* circular shift left 2 */ \ - "ext v27.16b, v27.16b, v27.16b, #4\n" /* circular shift left 3 */ \ - "ext v29.16b, v29.16b, v29.16b, #12\n" /* circular shift left 1 */ \ - "ext v30.16b, v30.16b, v30.16b, #8\n" /* circular shift left 2 */ \ - "ext v31.16b, v31.16b, v31.16b, #4\n" /* circular shift left 3 */ \ - "trn1 v0.4s, v16.4s, v17.4s\n" /* get a0,b0, a2,b2 */ \ - "trn2 v1.4s, v16.4s, v17.4s\n" /* get a1,b1, a3,b3 */ \ - "trn1 v2.4s, v18.4s, v19.4s\n" /* get c0,d0, c2,c2 */ \ - "trn2 v3.4s, v18.4s, v19.4s\n" /* get c1,d1, c3,d3 */ \ - "trn1 v4.4s, v20.4s, v21.4s\n" \ - "trn2 v5.4s, v20.4s, v21.4s\n" \ - "trn1 v6.4s, v22.4s, v23.4s\n" \ - "trn2 v7.4s, v22.4s, v23.4s\n" \ - "trn1 v8.4s, v24.4s, v25.4s\n" \ - "trn2 v9.4s, v24.4s, v25.4s\n" \ - "trn1 v10.4s, v26.4s, v27.4s\n" \ - "trn2 v11.4s, v26.4s, v27.4s\n" \ - "trn1 v12.4s, v28.4s, v29.4s\n" \ - "trn2 v13.4s, v28.4s, v29.4s\n" \ - "trn1 v14.4s, v30.4s, v31.4s\n" \ - "trn2 v15.4s, v30.4s, v31.4s\n" /* trans 2 */ \ - "trn1 v16.2d, v0.2d, v2.2d\n" /* get a0,b0, c0,d0 */ \ - "trn2 v24.2d, v0.2d, v2.2d\n" /* get a2,b2, c2,d2 */ \ - "trn1 v20.2d, v1.2d, v3.2d\n" /* get a1,b1, c1,d1 */ \ - "trn2 v28.2d, v1.2d, v3.2d\n" /* get a3,b3, c3,d3 */ \ - "trn1 v17.2d, v4.2d, v6.2d\n" \ - "trn2 v25.2d, v4.2d, v6.2d\n" \ - "trn1 v21.2d, v5.2d, v7.2d\n" \ - "trn2 v29.2d, v5.2d, v7.2d\n" \ - "trn1 v18.2d, v8.2d, v10.2d\n" \ - "trn2 v26.2d, v8.2d, v10.2d\n" \ - "trn1 v22.2d, v9.2d, v11.2d\n" \ - "trn2 v30.2d, v9.2d, v11.2d\n" \ - "trn1 v19.2d, v12.2d, v14.2d\n" \ - "trn2 v27.2d, v12.2d, v14.2d\n" \ - "trn1 v23.2d, v13.2d, v15.2d\n" \ - "trn2 v31.2d, v13.2d, v15.2d\n" - -// clang-format off -#define GEMM_INT8_INT32_OUT \ - /* store */ \ - "st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%[c_ptr0]], #64\n" \ - "st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [%[c_ptr1]], #64\n" \ - "st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [%[c_ptr2]], #64\n" \ - "st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [%[c_ptr3]], #64\n" -// clang-format on - -#define GEMM_INT8_FP32_OUT \ - /* store */ \ - "ldr q15, [%[scale]]\n" /* load scale */ \ - "scvtf v0.4s , v16.4s\n" /* 00, convert to fp32 */ \ - "scvtf v1.4s , v17.4s\n" /* 01, convert to fp32 */ \ - 
"scvtf v2.4s , v18.4s\n" /* 02, convert to fp32 */ \ - "scvtf v3.4s , v19.4s\n" /* 03, convert to fp32 */ \ - "scvtf v4.4s , v20.4s\n" /* 10, convert to fp32 */ \ - "scvtf v5.4s , v21.4s\n" /* 11, convert to fp32 */ \ - "scvtf v6.4s , v22.4s\n" /* 12, convert to fp32 */ \ - "scvtf v7.4s , v23.4s\n" /* 13, convert to fp32 */ \ - "fmul v16.4s, v0.4s, v15.s[0]\n" /* 00, mul scale to get final result */ \ - "fmul v17.4s, v1.4s, v15.s[0]\n" /* 01, mul scale to get final result */ \ - "fmul v18.4s, v2.4s, v15.s[0]\n" /* 02, mul scale to get final result */ \ - "fmul v19.4s, v3.4s, v15.s[0]\n" /* 03, mul scale to get final result */ \ - "fmul v20.4s, v4.4s, v15.s[1]\n" /* 10, mul scale to get final result */ \ - "fmul v21.4s, v5.4s, v15.s[1]\n" /* 11, mul scale to get final result */ \ - "fmul v22.4s, v6.4s, v15.s[1]\n" /* 12, mul scale to get final result */ \ - "fmul v23.4s, v7.4s, v15.s[1]\n" /* 13, mul scale to get final result */ \ - "scvtf v0.4s , v24.4s\n" /* 20, convert to fp32 */ \ - "scvtf v1.4s , v25.4s\n" /* 21, convert to fp32 */ \ - "stp q16, q17, [%[c_ptr0]], #32\n" /* write r0, 0,1 */ \ - "scvtf v2.4s , v26.4s\n" /* 22, convert to fp32 */ \ - "scvtf v3.4s , v27.4s\n" /* 23, convert to fp32 */ \ - "stp q18, q19, [%[c_ptr0]], #32\n" /* write r0, 2,3 */ \ - "scvtf v4.4s , v28.4s\n" /* 30, convert to fp32 */ \ - "scvtf v5.4s , v29.4s\n" /* 31, convert to fp32 */ \ - "stp q20, q21, [%[c_ptr1]], #32\n" /* write r1, 0,1 */ \ - "scvtf v6.4s , v30.4s\n" /* 32, convert to fp32 */ \ - "scvtf v7.4s , v31.4s\n" /* 33, convert to fp32 */ \ - "stp q22, q23, [%[c_ptr1]], #32\n" /* write r1, 2,3 */ \ - "fmul v24.4s, v0.4s, v15.s[2]\n" /* 20, mul scale to get final result */ \ - "fmul v25.4s, v1.4s, v15.s[2]\n" /* 21, mul scale to get final result */ \ - "fmul v26.4s, v2.4s, v15.s[2]\n" /* 22, mul scale to get final result */ \ - "fmul v27.4s, v3.4s, v15.s[2]\n" /* 23, mul scale to get final result */ \ - "fmul v28.4s, v4.4s, v15.s[3]\n" /* 30, mul scale to get final result */ \ - "fmul v29.4s, v5.4s, v15.s[3]\n" /* 31, mul scale to get final result */ \ - "stp q24, q25, [%[c_ptr2]], #32\n" /* write r2, 2,3 */ \ - "fmul v30.4s, v6.4s, v15.s[3]\n" /* 32, mul scale to get final result */ \ - "stp q26, q27, [%[c_ptr2]], #32\n" /* write r2, 2,3 */ \ - "fmul v31.4s, v7.4s, v15.s[3]\n" /* 33, mul scale to get final result */ \ - "stp q28, q29, [%[c_ptr3]], #32\n" /* write r3, 2,3 */ \ - "stp q30, q31, [%[c_ptr3]], #32\n" /* write r3, 2,3 */ - -#define GEMM_INT8_INT8_OUT \ - /* store */ \ - "ldr q15, [%[scale]]\n" /* load scale */ \ - "scvtf v0.4s , v16.4s\n" /* 00, convert to fp32 */ \ - "scvtf v1.4s , v17.4s\n" /* 01, convert to fp32 */ \ - "scvtf v2.4s , v18.4s\n" /* 02, convert to fp32 */ \ - "scvtf v3.4s , v19.4s\n" /* 03, convert to fp32 */ \ - "scvtf v4.4s , v20.4s\n" /* 10, convert to fp32 */ \ - "scvtf v5.4s , v21.4s\n" /* 11, convert to fp32 */ \ - "scvtf v6.4s , v22.4s\n" /* 12, convert to fp32 */ \ - "scvtf v7.4s , v23.4s\n" /* 13, convert to fp32 */ \ - "fmul v16.4s, v0.4s, v15.s[0]\n" /* 00, mul scale to get final result */ \ - "fmul v17.4s, v1.4s, v15.s[0]\n" /* 01, mul scale to get final result */ \ - "fmul v18.4s, v2.4s, v15.s[0]\n" /* 02, mul scale to get final result */ \ - "fmul v19.4s, v3.4s, v15.s[0]\n" /* 03, mul scale to get final result */ \ - "fmul v20.4s, v4.4s, v15.s[1]\n" /* 20, mul scale to get final result */ \ - "fmul v21.4s, v5.4s, v15.s[1]\n" /* 21, mul scale to get final result */ \ - "fmul v22.4s, v6.4s, v15.s[1]\n" /* 22, mul scale to get final result */ \ - "fmul 
v23.4s, v7.4s, v15.s[1]\n" /* 23, mul scale to get final result */ \ - "scvtf v0.4s , v24.4s\n" /* 20, convert to fp32 */ \ - "scvtf v1.4s , v25.4s\n" /* 21, convert to fp32 */ \ - "scvtf v2.4s , v26.4s\n" /* 22, convert to fp32 */ \ - "scvtf v3.4s , v27.4s\n" /* 23, convert to fp32 */ \ - "scvtf v4.4s , v28.4s\n" /* 30, convert to fp32 */ \ - "scvtf v5.4s , v29.4s\n" /* 31, convert to fp32 */ \ - "scvtf v6.4s , v30.4s\n" /* 32, convert to fp32 */ \ - "scvtf v7.4s , v31.4s\n" /* 33, convert to fp32 */ \ - "fmul v24.4s, v0.4s, v15.s[2]\n" /* 20, mul scale to get final result */ \ - "fmul v25.4s, v1.4s, v15.s[2]\n" /* 21, mul scale to get final result */ \ - "fmul v26.4s, v2.4s, v15.s[2]\n" /* 22, mul scale to get final result */ \ - "fmul v27.4s, v3.4s, v15.s[2]\n" /* 23, mul scale to get final result */ \ - "fmul v28.4s, v4.4s, v15.s[3]\n" /* 30, mul scale to get final result */ \ - "fmul v29.4s, v5.4s, v15.s[3]\n" /* 31, mul scale to get final result */ \ - "fmul v30.4s, v6.4s, v15.s[3]\n" /* 32, mul scale to get final result */ \ - "fmul v31.4s, v7.4s, v15.s[3]\n" /* 33, mul scale to get final result */ \ - "fcvtas v0.4s, v16.4s\n" /* 00, cvt to int */ \ - "fcvtas v1.4s, v17.4s\n" /* 01, cvt to int */ \ - "fcvtas v2.4s, v18.4s\n" /* 02, cvt to int */ \ - "fcvtas v3.4s, v19.4s\n" /* 03, cvt to int */ \ - "fcvtas v4.4s, v20.4s\n" /* 10, cvt to int */ \ - "fcvtas v5.4s, v21.4s\n" /* 11, cvt to int */ \ - "fcvtas v6.4s, v22.4s\n" /* 12, cvt to int */ \ - "fcvtas v7.4s, v23.4s\n" /* 13, cvt to int */ \ - "sqxtn v16.4h, v0.4s\n" /* 00, cvt int32 to int16 */ \ - "fcvtas v8.4s, v24.4s\n" /* 20, cvt to int */ \ - "sqxtn2 v16.8h, v1.4s\n" /* 01, cvt int32 to int16 */ \ - "fcvtas v9.4s, v25.4s\n" /* 21, cvt to int */ \ - "sqxtn v17.4h, v2.4s\n" /* 02, cvt int32 to int16 */ \ - "fcvtas v10.4s, v26.4s\n" /* 22, cvt to int */ \ - "sqxtn2 v17.8h, v3.4s\n" /* 03, cvt int32 to int16 */ \ - "fcvtas v11.4s, v27.4s\n" /* 23, cvt to int */ \ - "sqxtn v18.4h, v4.4s\n" /* 10, cvt int32 to int16 */ \ - "fcvtas v12.4s, v28.4s\n" /* 30, cvt to int */ \ - "sqxtn2 v18.8h, v5.4s\n" /* 11, cvt int32 to int16 */ \ - "fcvtas v13.4s, v29.4s\n" /* 31, cvt to int */ \ - "sqxtn v19.4h, v6.4s\n" /* 12, cvt int32 to int16 */ \ - "fcvtas v14.4s, v30.4s\n" /* 32, cvt to int */ \ - "sqxtn2 v19.8h, v7.4s\n" /* 13, cvt int32 to int16 */ \ - "fcvtas v15.4s, v31.4s\n" /* 33, cvt to int */ \ - "sqxtn v0.8b, v16.8h\n" /* 00, 01, cvt int16 to int8 */ \ - "sqxtn2 v0.16b, v17.8h\n" /* 02, 03, cvt int16 to int8 */ \ - "sqxtn v1.8b, v18.8h\n" /* 10, 11, cvt int16 to int8 */ \ - "sqxtn2 v1.16b, v19.8h\n" /* 12, 13, cvt int16 to int8 */ \ - "sqxtn v20.4h, v8.4s\n" /* 20, cvt int32 to int16 */ \ - "sqxtn2 v20.8h, v9.4s\n" /* 21, cvt int32 to int16 */ \ - "sqxtn v21.4h, v10.4s\n" /* 22, cvt int32 to int16 */ \ - "sqxtn2 v21.8h, v11.4s\n" /* 23, cvt int32 to int16 */ \ - "sqxtn v22.4h, v12.4s\n" /* 30, cvt int32 to int16 */ \ - "sqxtn2 v22.8h, v13.4s\n" /* 31, cvt int32 to int16 */ \ - "sqxtn v23.4h, v14.4s\n" /* 32, cvt int32 to int16 */ \ - "sqxtn2 v23.8h, v15.4s\n" /* 33, cvt int32 to int16 */ \ - "sqxtn v2.8b, v20.8h\n" /* 20, 21, cvt int16 to int8 */ \ - "sqxtn2 v2.16b, v21.8h\n" /* 22, 23, cvt int16 to int8 */ \ - "sqxtn v3.8b, v22.8h\n" /* 30, 31, cvt int16 to int8 */ \ - "sqxtn2 v3.16b, v23.8h\n" /* 32, 33, cvt int16 to int8 */ \ - "str q0, [%[c_ptr0]], #16\n" /* write r0 */ \ - "str q1, [%[c_ptr1]], #16\n" /* write r1 */ \ - "str q2, [%[c_ptr2]], #16\n" /* write r2 */ \ - "str q3, [%[c_ptr3]], #16\n" /* write r3 */ - -template <> 
-inline void gemm_int8_kernel(const int8_t* a_ptr, - const int8_t*& b_ptr, // NOLINT - const int32_t* bias, - int32_t*& c_ptr0, // NOLINT - int32_t*& c_ptr1, // NOLINT - int32_t*& c_ptr2, // NOLINT - int32_t*& c_ptr3, // NOLINT - const float* scale, // NOLINT - bool is_relu, // NOLINT - int k, - int rem) { - asm volatile(GEMM_INT8_KERNEL GEMM_INT8_INT32_OUT - : [a_ptr] "+r"(a_ptr), - [b_ptr] "+r"(b_ptr), - [c_ptr0] "+r"(c_ptr0), - [c_ptr1] "+r"(c_ptr1), - [c_ptr2] "+r"(c_ptr2), - [c_ptr3] "+r"(c_ptr3), - [k] "+r"(k) - : [is_relu] "r"(is_relu), [bias] "r"(bias), [rem] "r"(rem) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "v26", - "v27", - "v28", - "v29", - "v30", - "v31", - "cc"); -} -template <> -inline void gemm_int8_kernel(const int8_t* a_ptr, - const int8_t*& b_ptr, // NOLINT - const int32_t* bias, - float*& c_ptr0, // NOLINT - float*& c_ptr1, // NOLINT - float*& c_ptr2, // NOLINT - float*& c_ptr3, // NOLINT - const float* scale, - bool is_relu, - int k, - int rem) { - asm volatile(GEMM_INT8_KERNEL GEMM_INT8_FP32_OUT - : [a_ptr] "+r"(a_ptr), - [b_ptr] "+r"(b_ptr), - [c_ptr0] "+r"(c_ptr0), - [c_ptr1] "+r"(c_ptr1), - [c_ptr2] "+r"(c_ptr2), - [c_ptr3] "+r"(c_ptr3), - [k] "+r"(k) - : [is_relu] "r"(is_relu), - [bias] "r"(bias), - [rem] "r"(rem), - [scale] "r"(scale) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "v26", - "v27", - "v28", - "v29", - "v30", - "v31", - "cc"); -} - -template <> -inline void gemm_int8_kernel(const int8_t* a_ptr, - const int8_t*& b_ptr, // NOLINT - const int32_t* bias, - int8_t*& c_ptr0, // NOLINT - int8_t*& c_ptr1, // NOLINT - int8_t*& c_ptr2, // NOLINT - int8_t*& c_ptr3, // NOLINT - const float* scale, - bool is_relu, - int k, - int rem) { - asm volatile(GEMM_INT8_KERNEL GEMM_INT8_INT8_OUT - : [a_ptr] "+r"(a_ptr), - [b_ptr] "+r"(b_ptr), - [c_ptr0] "+r"(c_ptr0), - [c_ptr1] "+r"(c_ptr1), - [c_ptr2] "+r"(c_ptr2), - [c_ptr3] "+r"(c_ptr3), - [k] "+r"(k) - : [is_relu] "r"(is_relu), - [bias] "r"(bias), - [rem] "r"(rem), - [scale] "r"(scale) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "v26", - "v27", - "v28", - "v29", - "v30", - "v31", - "cc"); -} - -#ifdef WITH_ARM_DOTPROD -template -inline void sgemm_sdot_int8_kernel(const int8_t* a_ptr, - const int8_t*& b_ptr, // NOLINT - const int32_t* bias, - Dtype*& c_ptr0, // NOLINT - Dtype*& c_ptr1, // NOLINT - Dtype*& c_ptr2, // NOLINT - Dtype*& c_ptr3, // NOLINT - Dtype*& c_ptr4, // NOLINT - Dtype*& c_ptr5, // NOLINT - Dtype*& c_ptr6, // NOLINT - Dtype*& c_ptr7, // NOLINT - const float32_t* scale, - bool is_relu, - int k, - int rem); - -#define GEMM_SDOT_INT8_KERNEL \ - "ldp q2, q3, [%[bias_ptr]]\n" /* load bias to q2, q3*/ \ - "ldp q0, q1, [%[a_ptr]], #32\n" /* load a00,a01 to q0, q1*/ \ - "ldp q4, q5, [%[b_ptr]], #32\n" /* load b0, b1 to q4, q5*/ \ - "dup v8.4s, v2.s[0]\n" /* out0 = 0 */ \ - "dup v9.4s, v2.s[0]\n" /* out1 = 0*/ \ - "dup v10.4s, v2.s[0]\n" /* out2 = 0*/ \ - "dup v11.4s, v2.s[1]\n" /* out3 = 0*/ \ - "dup v12.4s, v2.s[1]\n" /* out4 = 0*/ \ - "prfm pldl1keep, [%[b_ptr], 
#64]\n" /* preload b*/ \ - "dup v13.4s, v2.s[1]\n" /* out5 = 0*/ \ - "prfm pldl1keep, [%[a_ptr], #64]\n" /* preload a*/ \ - "dup v14.4s, v2.s[2]\n" /* out6 = 0*/ \ - "prfm pldl1keep, [%[b_ptr], #128]\n" /* preload b*/ \ - "dup v15.4s, v2.s[2]\n" /* out7 = 0*/ \ - "prfm pldl1keep, [%[a_ptr], #128]\n" /* preload a*/ \ - "dup v16.4s, v2.s[2]\n" /* out8 = 0*/ \ - "prfm pldl1keep, [%[b_ptr], #192]\n" /* preload b*/ \ - "dup v17.4s, v2.s[3]\n" /* out9 = 0*/ \ - "prfm pldl1keep, [%[b_ptr], #256]\n" /* preload b*/ \ - "dup v18.4s, v2.s[3]\n" /* out10 = 0*/ \ - "prfm pldl1keep, [%[a_ptr], #192]\n" /* preload a*/ \ - "dup v19.4s, v2.s[3]\n" /* out11 = 0*/ \ - "prfm pldl1keep, [%[b_ptr], #320]\n" /* preload b*/ \ - "dup v20.4s, v3.s[0]\n" /* out12 = 0*/ \ - "prfm pldl1keep, [%[a_ptr], #256]\n" /* preload a*/ \ - "dup v21.4s, v3.s[0]\n" /* out13 = 0*/ \ - "prfm pldl1keep, [%[b_ptr], #384]\n" /* preload b*/ \ - "dup v22.4s, v3.s[0]\n" /* out14 = 0*/ \ - "dup v23.4s, v3.s[1]\n" /* out15 = 0*/ \ - "dup v24.4s, v3.s[1]\n" /* out16 = 0*/ \ - "dup v25.4s, v3.s[1]\n" /* out17 = 0*/ \ - "dup v26.4s, v3.s[2]\n" /* out18 = 0*/ \ - "dup v27.4s, v3.s[2]\n" /* out19 = 0*/ \ - "dup v28.4s, v3.s[2]\n" /* out20 = 0*/ \ - "dup v29.4s, v3.s[3]\n" /* out21 = 0*/ \ - "dup v30.4s, v3.s[3]\n" /* out22 = 0*/ \ - "dup v31.4s, v3.s[3]\n" /* out23 = 0*/ \ - "cbz %w[k], 2f\n" /* check loop count > 0 */ \ - "1:\n" /* main loop */ \ - "sdot v8.4s , v4.16b, v0.4b[0]\n" /* out0 = b0 * a00[0], b0 = q4 */ \ - "sdot v11.4s , v4.16b, v0.4b[1]\n" /* out1 = b0 * a00[1], b0 = q4 */ \ - "ldp q6, q7, [%[b_ptr]], #32\n" /* load b2, b0 to q6, q7 */ \ - "sdot v14.4s, v4.16b, v0.4b[2]\n" /* out2 = b0 * a00[2], b0 = q4 */ \ - "sdot v17.4s, v4.16b, v0.4b[3]\n" /* out3 = b0 * a00[3], b0 = q4 */ \ - "ldp q2, q3, [%[a_ptr]], #32\n" /* load a10, a11 to q3, q4 */ \ - "sdot v20.4s, v4.16b, v1.4b[0]\n" /* out4 = b0 * a01[0], b0 = q4 */ \ - "sdot v23.4s, v4.16b, v1.4b[1]\n" /* out5 = b0 * a01[1], b0 = q4 */ \ - "sdot v26.4s, v4.16b, v1.4b[2]\n" /* out6 = b0 * a01[2], b0 = q4 */ \ - "sdot v29.4s, v4.16b, v1.4b[3]\n" /* out7 = b0 * a01[3], b0 = q4 */ \ - "sdot v9.4s, v5.16b, v0.4b[0]\n" /* out8 = b1 * a00[0], b1 = q5 */ \ - "sdot v12.4s, v5.16b, v0.4b[1]\n" /* out9 = b1 * a00[1], b1 = q5 */ \ - "sdot v15.4s, v5.16b, v0.4b[2]\n" /* out10 = b1 * a00[2], b1 = q5*/ \ - "sdot v18.4s, v5.16b, v0.4b[3]\n" /* out11 = b1 * a00[3], b1 = q5*/ \ - "sdot v21.4s, v5.16b, v1.4b[0]\n" /* out12 = b1 * a01[0], b1 = q5*/ \ - "sdot v24.4s, v5.16b, v1.4b[1]\n" /* out13 = b1 * a01[1], b1 = q5*/ \ - "sdot v27.4s, v5.16b, v1.4b[2]\n" /* out14 = b1 * a01[2], b1 = q5*/ \ - "sdot v30.4s, v5.16b, v1.4b[3]\n" /* out15 = b1 * a01[3], b1 = q5*/ \ - "ldp q4, q5, [%[b_ptr]], #32\n" /* load b1, b2 to q4, q5 */ \ - "sdot v10.4s, v6.16b, v0.4b[0]\n" /* out16 = b2 * a00[0], b2 = q6*/ \ - "sdot v13.4s, v6.16b, v0.4b[1]\n" /* out17 = b2 * a00[1], b2 = q6*/ \ - "prfm pldl1keep, [%[b_ptr], #384]\n" \ - "sdot v16.4s, v6.16b, v0.4b[2]\n" /* out18 = b2 * a00[2], b2 = q6*/ \ - "sdot v19.4s, v6.16b, v0.4b[3]\n" /* out19 = b2 * a00[3], b2 = q6*/ \ - "sdot v22.4s, v6.16b, v1.4b[0]\n" /* out20 = b2 * a00[0], b2 = q6*/ \ - "sdot v25.4s, v6.16b, v1.4b[1]\n" /* out21 = b2 * a00[1], b2 = q6*/ \ - "sdot v28.4s, v6.16b, v1.4b[2]\n" /* out22 = b2 * a00[2], b2 = q6*/ \ - "sdot v31.4s, v6.16b, v1.4b[3]\n" /* out23 = b2 * a00[3], b2 = q6*/ \ - "ldp q0, q1, [%[a_ptr]], #32\n" /* load a00, a01 to q0, q1 */ \ - "sdot v8.4s , v7.16b, v2.4b[0]\n" /* out0 = b0 * a10[0], b0 = q7 */ \ - "sdot v11.4s , v7.16b, v2.4b[1]\n" 
/* out1 = b0 * a10[1], b0 = q7 */ \ - "sdot v14.4s, v7.16b, v2.4b[2]\n" /* out2 = b0 * a10[2], b0 = q7 */ \ - "prfm pldl1keep, [%[a_ptr], #256]\n" \ - "sdot v17.4s, v7.16b, v2.4b[3]\n" /* out3 = b0 * a10[3], b0 = q7 */ \ - "sdot v20.4s, v7.16b, v3.4b[0]\n" /* out4 = b0 * a11[0], b0 = q7 */ \ - "sdot v23.4s, v7.16b, v3.4b[1]\n" /* out5 = b0 * a11[1], b0 = q7 */ \ - "sdot v26.4s, v7.16b, v3.4b[2]\n" /* out6 = b0 * a11[2], b0 = q7 */ \ - "sdot v29.4s, v7.16b, v3.4b[3]\n" /* out7 = b0 * a11[3], b0 = q7 */ \ - "ldp q6, q7, [%[b_ptr]], #32\n" /* load b0, b1 to q6, q7 */ \ - "sdot v9.4s, v4.16b, v2.4b[0]\n" /* out8 = b0 * a10[0], b1 = q4 */ \ - "sdot v12.4s, v4.16b, v2.4b[1]\n" /* out9 = b0 * a10[1], b1 = q4 */ \ - "sdot v15.4s, v4.16b, v2.4b[2]\n" /* out10 = b1 * a10[2], b1 = q4*/ \ - "sdot v18.4s, v4.16b, v2.4b[3]\n" /* out11 = b1 * a10[3], b1 = q4*/ \ - "sdot v21.4s, v4.16b, v3.4b[0]\n" /* out12 = b1 * a10[0], b1 = q4*/ \ - "sdot v24.4s, v4.16b, v3.4b[1]\n" /* out13 = b1 * a10[1], b1 = q4*/ \ - "sdot v27.4s, v4.16b, v3.4b[2]\n" /* out14 = b1 * a10[2], b1 = q4*/ \ - "sdot v30.4s, v4.16b, v3.4b[3]\n" /* out15 = b1 * a10[3], b1 = q4*/ \ - "sdot v10.4s, v5.16b, v2.4b[0]\n" /* out16 = b2 * a10[0], b2 = q5*/ \ - "sdot v13.4s, v5.16b, v2.4b[1]\n" /* out17 = b2 * a10[0], b2 = q5*/ \ - "sdot v16.4s, v5.16b, v2.4b[2]\n" /* out18 = b2 * a10[0], b2 = q5*/ \ - "sdot v19.4s, v5.16b, v2.4b[3]\n" /* out19 = b2 * a10[0], b2 = q5*/ \ - "sdot v22.4s, v5.16b, v3.4b[0]\n" /* out20 = b2 * a10[0], b2 = q5*/ \ - "sdot v25.4s, v5.16b, v3.4b[1]\n" /* out21 = b2 * a10[0], b2 = q5*/ \ - "sdot v28.4s, v5.16b, v3.4b[2]\n" /* out22 = b2 * a10[0], b2 = q5*/ \ - "sdot v31.4s, v5.16b, v3.4b[3]\n" /* out23 = b2 * a10[0], b2 = q5*/ \ - "ldp q4, q5, [%[b_ptr]], #32\n" /* load b2, b0 to q4, q5 */ \ - "sdot v8.4s , v6.16b, v0.4b[0]\n" /* out0 = b0 * a00[0], b0 = q6 */ \ - "sdot v11.4s , v6.16b, v0.4b[1]\n" /* out1 = b0 * a00[1], b0 = q6 */ \ - "ldp q2, q3, [%[a_ptr]], #32\n" /* load a10, a11 to q3, q4*/ \ - "sdot v14.4s, v6.16b, v0.4b[2]\n" /* out2 = b0 * a00[2], b0 = q6*/ \ - "sdot v17.4s, v6.16b, v0.4b[3]\n" /* out3 = b0 * a00[3], b0 = q6*/ \ - "sdot v20.4s, v6.16b, v1.4b[0]\n" /* out4 = b0 * a01[0], b0 = q6*/ \ - "sdot v23.4s, v6.16b, v1.4b[1]\n" /* out5 = b0 * a01[1], b0 = q6*/ \ - "sdot v26.4s, v6.16b, v1.4b[2]\n" /* out6 = b0 * a01[2], b0 = q6*/ \ - "sdot v29.4s, v6.16b, v1.4b[3]\n" /* out7 = b0 * a01[3], b0 = q6*/ \ - "sdot v9.4s, v7.16b, v0.4b[0]\n" /* out8 = b1 * a00[0], b1 = q7*/ \ - "sdot v12.4s, v7.16b, v0.4b[1]\n" /* out9 = b1 * a00[1], b1 = q7*/ \ - "prfm pldl1keep, [%[b_ptr], #384]\n" \ - "sdot v15.4s, v7.16b, v0.4b[2]\n" /* out10 = b1 * a00[2], b1 = q7*/ \ - "sdot v18.4s, v7.16b, v0.4b[3]\n" /* out11 = b1 * a00[3], b1 = q7*/ \ - "sdot v21.4s, v7.16b, v1.4b[0]\n" /* out12 = b1 * a01[0], b1 = q7*/ \ - "sdot v24.4s, v7.16b, v1.4b[1]\n" /* out13 = b1 * a01[1], b1 = q7*/ \ - "sdot v27.4s, v7.16b, v1.4b[2]\n" /* out14 = b1 * a01[2], b1 = q7*/ \ - "sdot v30.4s, v7.16b, v1.4b[3]\n" /* out15 = b1 * a01[3], b1 = q7*/ \ - "ldp q6, q7, [%[b_ptr]], #32\n" /* load b1, b2 to q6, q7*/ \ - "sdot v10.4s, v4.16b, v0.4b[0]\n" /* out16 = b2 * a00[0], b2 = q4*/ \ - "sdot v13.4s, v4.16b, v0.4b[1]\n" /* out17 = b2 * a00[1], b2 = q4*/ \ - "sdot v16.4s, v4.16b, v0.4b[2]\n" /* out18 = b2 * a00[2], b2 = q4*/ \ - "sdot v19.4s, v4.16b, v0.4b[3]\n" /* out19 = b2 * a00[3], b2 = q4*/ \ - "sdot v22.4s, v4.16b, v1.4b[0]\n" /* out20 = b2 * a00[0], b2 = q4*/ \ - "sdot v25.4s, v4.16b, v1.4b[1]\n" /* out21 = b2 * a00[1], b2 = q4*/ \ - "sdot v28.4s, 
v4.16b, v1.4b[2]\n" /* out22 = b2 * a00[2], b2 = q4*/ \ - "sdot v31.4s, v4.16b, v1.4b[3]\n" /* out23 = b2 * a00[3], b2 = q4*/ \ - "ldp q0, q1, [%[a_ptr]], #32\n" /* load a00, a01 */ /* unrool 3*/ \ - "sdot v8.4s , v5.16b, v2.4b[0]\n" /* out0 = b0 * a10[0], b0 = q5*/ \ - "sdot v11.4s , v5.16b, v2.4b[1]\n" /* out1 = b0 * a10[1], b0 = q5*/ \ - "sdot v14.4s, v5.16b, v2.4b[2]\n" /* out2 = b0 * a10[2], b0 = q5*/ \ - "sdot v17.4s, v5.16b, v2.4b[3]\n" /* out3 = b0 * a10[3], b0 = q5*/ \ - "sdot v20.4s, v5.16b, v3.4b[0]\n" /* out4 = b0 * a11[0], b0 = q5*/ \ - "sdot v23.4s, v5.16b, v3.4b[1]\n" /* out5 = b0 * a11[1], b0 = q5*/ \ - "sdot v26.4s, v5.16b, v3.4b[2]\n" /* out6 = b0 * a11[2], b0 = q5*/ \ - "sdot v29.4s, v5.16b, v3.4b[3]\n" /* out7 = b0 * a11[3], b0 = q5*/ \ - "ldp q4, q5, [%[b_ptr]], #32\n" /* load b0, b1 to q4, q5*/ \ - "sdot v9.4s, v6.16b, v2.4b[0]\n" /* out8 = b0 * a10[0], b1 = q6*/ \ - "sdot v12.4s, v6.16b, v2.4b[1]\n" /* out9 = b0 * a10[1], b1 = q6*/ \ - "prfm pldl1keep, [%[a_ptr], #256]\n" \ - "sdot v15.4s, v6.16b, v2.4b[2]\n" /* out10 = b1 * a10[2], b1 = q6*/ \ - "sdot v18.4s, v6.16b, v2.4b[3]\n" /* out11 = b1 * a10[3], b1 = q6*/ \ - "sdot v21.4s, v6.16b, v3.4b[0]\n" /* out12 = b1 * a10[0], b1 = q6*/ \ - "sdot v24.4s, v6.16b, v3.4b[1]\n" /* out13 = b1 * a10[1], b1 = q6*/ \ - "sdot v27.4s, v6.16b, v3.4b[2]\n" /* out14 = b1 * a10[2], b1 = q6*/ \ - "prfm pldl1keep, [%[b_ptr], #384]\n" \ - "sdot v30.4s, v6.16b, v3.4b[3]\n" /* out15 = b1 * a10[3], b1 = q6*/ \ - "sdot v10.4s, v7.16b, v2.4b[0]\n" /* out16 = b2 * a10[0], b2 = q7*/ \ - "sdot v13.4s, v7.16b, v2.4b[1]\n" /* out17 = b2 * a10[0], b2 = q7*/ \ - "sdot v16.4s, v7.16b, v2.4b[2]\n" /* out18 = b2 * a10[0], b2 = q7*/ \ - "sdot v19.4s, v7.16b, v2.4b[3]\n" /* out19 = b2 * a10[0], b2 = q7*/ \ - "sdot v22.4s, v7.16b, v3.4b[0]\n" /* out20 = b2 * a10[0], b2 = q7*/ \ - "sdot v25.4s, v7.16b, v3.4b[1]\n" /* out21 = b2 * a10[0], b2 = q7*/ \ - "subs %w[k], %w[k], #1\n" /* loop count - 1*/ \ - "sdot v28.4s, v7.16b, v3.4b[2]\n" /* out22 = b2 * a10[0], b2 = q7*/ \ - "sdot v31.4s, v7.16b, v3.4b[3]\n" /* out23 = b2 * a10[0], b2 = q7*/ \ - "bne 1b\n" \ - "2:\n" /* process tail*/ \ - "subs %w[tail], %w[tail], #1\n" /* tail--*/ \ - "beq 3f\n" \ - "sdot v8.4s , v4.16b, v0.4b[0]\n" /* out0 = b0 * a00[0], b0 = q4*/ \ - "sdot v11.4s , v4.16b, v0.4b[1]\n" /* out1 = b0 * a00[1], b0 = q4*/ \ - "ldp q6, q7, [%[b_ptr]], #32\n" /* load b2, b0 to q6, q7*/ \ - "sdot v14.4s, v4.16b, v0.4b[2]\n" /* out2 = b0 * a00[2], b0 = q4*/ \ - "sdot v17.4s, v4.16b, v0.4b[3]\n" /* out3 = b0 * a00[3], b0 = q4*/ \ - "ldp q2, q3, [%[a_ptr]], #32\n" /* load a10, a11 to q2, q3*/ \ - "sdot v20.4s, v4.16b, v1.4b[0]\n" /* out4 = b0 * a01[0], b0 = q4*/ \ - "sdot v23.4s, v4.16b, v1.4b[1]\n" /* out5 = b0 * a01[1], b0 = q4*/ \ - "sdot v26.4s, v4.16b, v1.4b[2]\n" /* out6 = b0 * a01[2], b0 = q4*/ \ - "sdot v29.4s, v4.16b, v1.4b[3]\n" /* out7 = b0 * a01[3], b0 = q4*/ \ - "subs %w[tail], %w[tail], #1\n" /* tail--*/ \ - "sdot v9.4s, v5.16b, v0.4b[0]\n" /* out8 = b1 * a00[0], b1 = q5*/ \ - "sdot v12.4s, v5.16b, v0.4b[1]\n" /* out9 = b1 * a00[1], b1 = q5*/ \ - "sdot v15.4s, v5.16b, v0.4b[2]\n" /* out10 = b1 * a00[2], b1 = q5*/ \ - "sdot v18.4s, v5.16b, v0.4b[3]\n" /* out11 = b1 * a00[3], b1 = q5*/ \ - "sdot v21.4s, v5.16b, v1.4b[0]\n" /* out12 = b1 * a01[0], b1 = q5*/ \ - "sdot v24.4s, v5.16b, v1.4b[1]\n" /* out13 = b1 * a01[1], b1 = q5*/ \ - "sdot v27.4s, v5.16b, v1.4b[2]\n" /* out14 = b1 * a01[2], b1 = q5*/ \ - "sdot v30.4s, v5.16b, v1.4b[3]\n" /* out15 = b1 * a01[3], b1 = q5*/ \ - "ldp q4, q5, 
[%[b_ptr]], #32\n" /* load b1, b2 to q4, q5*/ \ - "sdot v10.4s, v6.16b, v0.4b[0]\n" /* out16 = b2 * a00[0], b2 = q6*/ \ - "sdot v13.4s, v6.16b, v0.4b[1]\n" /* out17 = b2 * a00[1], b2 = q6*/ \ - "sdot v16.4s, v6.16b, v0.4b[2]\n" /* out18 = b2 * a00[2], b2 = q6*/ \ - "sdot v19.4s, v6.16b, v0.4b[3]\n" /* out19 = b2 * a00[3], b2 = q6*/ \ - "sdot v22.4s, v6.16b, v1.4b[0]\n" /* out20 = b2 * a00[0], b2 = q6*/ \ - "sdot v25.4s, v6.16b, v1.4b[1]\n" /* out21 = b2 * a00[1], b2 = q6*/ \ - "sdot v28.4s, v6.16b, v1.4b[2]\n" /* out22 = b2 * a00[2], b2 = q6*/ \ - "sdot v31.4s, v6.16b, v1.4b[3]\n" /* out23 = b2 * a00[3], b2 = q6*/ \ - "beq 4f\n" /*jump to tail = 2*/ /* unrool 1, tail > 2*/ \ - "ldp q0, q1, [%[a_ptr]], #32\n" /* load a00, a01 to q0, q1*/ \ - "sdot v8.4s , v7.16b, v2.4b[0]\n" /* out0 = b0 * a10[0], b0 = q7*/ \ - "sdot v11.4s , v7.16b, v2.4b[1]\n" /* out1 = b0 * a10[1], b0 = q7*/ \ - "sdot v14.4s, v7.16b, v2.4b[2]\n" /* out2 = b0 * a10[2], b0 = q7*/ \ - "sdot v17.4s, v7.16b, v2.4b[3]\n" /* out3 = b0 * a10[3], b0 = q7*/ \ - "sdot v20.4s, v7.16b, v3.4b[0]\n" /* out4 = b0 * a11[0], b0 = q7*/ \ - "sdot v23.4s, v7.16b, v3.4b[1]\n" /* out5 = b0 * a11[1], b0 = q7*/ \ - "sdot v26.4s, v7.16b, v3.4b[2]\n" /* out6 = b0 * a11[2], b0 = q7*/ \ - "sdot v29.4s, v7.16b, v3.4b[3]\n" /* out7 = b0 * a11[3], b0 = q7*/ \ - "ldp q6, q7, [%[b_ptr]], #32\n" /* load b0, b1 to q6, q7*/ \ - "sdot v9.4s, v4.16b, v2.4b[0]\n" /* out8 = b0 * a10[0], b1 = q4*/ \ - "sdot v12.4s, v4.16b, v2.4b[1]\n" /* out9 = b0 * a10[1], b1 = q4*/ \ - "sdot v15.4s, v4.16b, v2.4b[2]\n" /* out10 = b1 * a10[2], b1 = q4*/ \ - "sdot v18.4s, v4.16b, v2.4b[3]\n" /* out11 = b1 * a10[3], b1 = q4*/ \ - "sdot v21.4s, v4.16b, v3.4b[0]\n" /* out12 = b1 * a10[0], b1 = q4*/ \ - "sdot v24.4s, v4.16b, v3.4b[1]\n" /* out13 = b1 * a10[1], b1 = q4*/ \ - "sdot v27.4s, v4.16b, v3.4b[2]\n" /* out14 = b1 * a10[2], b1 = q4*/ \ - "sdot v30.4s, v4.16b, v3.4b[3]\n" /* out15 = b1 * a10[3], b1 = q4*/ \ - "subs %w[tail], %w[tail], #1\n" /* tail--*/ \ - "sdot v10.4s, v5.16b, v2.4b[0]\n" /* out16 = b2 * a10[0], b2 = q5*/ \ - "sdot v13.4s, v5.16b, v2.4b[1]\n" /* out17 = b2 * a10[0], b2 = q5*/ \ - "sdot v16.4s, v5.16b, v2.4b[2]\n" /* out18 = b2 * a10[0], b2 = q5*/ \ - "sdot v19.4s, v5.16b, v2.4b[3]\n" /* out19 = b2 * a10[0], b2 = q5*/ \ - "sdot v22.4s, v5.16b, v3.4b[0]\n" /* out20 = b2 * a10[0], b2 = q5*/ \ - "sdot v25.4s, v5.16b, v3.4b[1]\n" /* out21 = b2 * a10[0], b2 = q5*/ \ - "sdot v28.4s, v5.16b, v3.4b[2]\n" /* out22 = b2 * a10[0], b2 = q5*/ \ - "sdot v31.4s, v5.16b, v3.4b[3]\n" /* out23 = b2 * a10[0], b2 = q5*/ \ - "beq 5f\n" /*jump to tail = 3*/ /* unrool 2, tail = 4*/ \ - "ldp q4, q5, [%[b_ptr]], #32\n" /* load b2, b0 to q4, q5*/ \ - "sdot v8.4s , v6.16b, v0.4b[0]\n" /* out0 = b0 * a00[0], b0 = q6*/ \ - "sdot v11.4s , v6.16b, v0.4b[1]\n" /* out1 = b0 * a00[1], b0 = q6*/ \ - "ldp q2, q3, [%[a_ptr]], #32\n" /* load a10, a11 to q3, q4*/ \ - "sdot v14.4s, v6.16b, v0.4b[2]\n" /* out2 = b0 * a00[2], b0 = q6*/ \ - "sdot v17.4s, v6.16b, v0.4b[3]\n" /* out3 = b0 * a00[3], b0 = q6*/ \ - "sdot v20.4s, v6.16b, v1.4b[0]\n" /* out4 = b0 * a01[0], b0 = q6*/ \ - "sdot v23.4s, v6.16b, v1.4b[1]\n" /* out5 = b0 * a01[1], b0 = q6*/ \ - "sdot v26.4s, v6.16b, v1.4b[2]\n" /* out6 = b0 * a01[2], b0 = q6*/ \ - "sdot v29.4s, v6.16b, v1.4b[3]\n" /* out7 = b0 * a01[3], b0 = q6*/ \ - "sdot v9.4s, v7.16b, v0.4b[0]\n" /* out8 = b1 * a00[0], b1 = q7*/ \ - "sdot v12.4s, v7.16b, v0.4b[1]\n" /* out9 = b1 * a00[1], b1 = q7*/ \ - "sdot v15.4s, v7.16b, v0.4b[2]\n" /* out10 = b1 * a00[2], b1 = q7*/ \ - 
"sdot v18.4s, v7.16b, v0.4b[3]\n" /* out11 = b1 * a00[3], b1 = q7*/ \ - "sdot v21.4s, v7.16b, v1.4b[0]\n" /* out12 = b1 * a01[0], b1 = q7*/ \ - "sdot v24.4s, v7.16b, v1.4b[1]\n" /* out13 = b1 * a01[1], b1 = q7*/ \ - "sdot v27.4s, v7.16b, v1.4b[2]\n" /* out14 = b1 * a01[2], b1 = q7*/ \ - "sdot v30.4s, v7.16b, v1.4b[3]\n" /* out15 = b1 * a01[3], b1 = q7*/ \ - "ldp q6, q7, [%[b_ptr]], #32\n" /* load b1, b2 to q6, q7*/ \ - "sdot v10.4s, v4.16b, v0.4b[0]\n" /* out16 = b2 * a00[0], b2 = q4*/ \ - "sdot v13.4s, v4.16b, v0.4b[1]\n" /* out17 = b2 * a00[1], b2 = q4*/ \ - "sdot v16.4s, v4.16b, v0.4b[2]\n" /* out18 = b2 * a00[2], b2 = q4*/ \ - "sdot v19.4s, v4.16b, v0.4b[3]\n" /* out19 = b2 * a00[3], b2 = q4*/ \ - "sdot v22.4s, v4.16b, v1.4b[0]\n" /* out20 = b2 * a00[0], b2 = q4*/ \ - "sdot v25.4s, v4.16b, v1.4b[1]\n" /* out21 = b2 * a00[1], b2 = q4*/ \ - "sdot v28.4s, v4.16b, v1.4b[2]\n" /* out22 = b2 * a00[2], b2 = q4*/ \ - "sdot v31.4s, v4.16b, v1.4b[3]\n" /* out23 = b2 * a00[3], b2 = q4*/ \ - "sdot v8.4s , v5.16b, v2.4b[0]\n" /* out0 = b0 * a10[0], b0 = q5*/ \ - "sdot v11.4s , v5.16b, v2.4b[1]\n" /* out1 = b0 * a10[1], b0 = q5*/ \ - "sdot v14.4s, v5.16b, v2.4b[2]\n" /* out2 = b0 * a10[2], b0 = q5*/ \ - "sdot v17.4s, v5.16b, v2.4b[3]\n" /* out3 = b0 * a10[3], b0 = q5*/ \ - "sdot v20.4s, v5.16b, v3.4b[0]\n" /* out4 = b0 * a11[0], b0 = q5*/ \ - "sdot v23.4s, v5.16b, v3.4b[1]\n" /* out5 = b0 * a11[1], b0 = q5*/ \ - "sdot v26.4s, v5.16b, v3.4b[2]\n" /* out6 = b0 * a11[2], b0 = q5*/ \ - "sdot v29.4s, v5.16b, v3.4b[3]\n" /* out7 = b0 * a11[3], b0 = q5*/ \ - "sdot v9.4s, v6.16b, v2.4b[0]\n" /* out8 = b0 * a10[0], b1 = q6*/ \ - "sdot v12.4s, v6.16b, v2.4b[1]\n" /* out9 = b1 * a10[1], b1 = q6*/ \ - "sdot v15.4s, v6.16b, v2.4b[2]\n" /* out10 = b1 * a10[2], b1 = q6*/ \ - "sdot v18.4s, v6.16b, v2.4b[3]\n" /* out11 = b1 * a10[3], b1 = q6*/ \ - "sdot v21.4s, v6.16b, v3.4b[0]\n" /* out12 = b1 * a10[0], b1 = q6*/ \ - "sdot v24.4s, v6.16b, v3.4b[1]\n" /* out13 = b1 * a10[1], b1 = q6*/ \ - "sdot v27.4s, v6.16b, v3.4b[2]\n" /* out14 = b1 * a10[2], b1 = q6*/ \ - "sdot v30.4s, v6.16b, v3.4b[3]\n" /* out15 = b1 * a10[3], b1 = q6*/ \ - "sdot v10.4s, v7.16b, v2.4b[0]\n" /* out16 = b2 * a10[0], b2 = q7*/ \ - "sdot v13.4s, v7.16b, v2.4b[1]\n" /* out17 = b2 * a10[0], b2 = q7*/ \ - "sdot v16.4s, v7.16b, v2.4b[2]\n" /* out18 = b2 * a10[0], b2 = q7*/ \ - "sdot v19.4s, v7.16b, v2.4b[3]\n" /* out19 = b2 * a10[0], b2 = q7*/ \ - "sdot v22.4s, v7.16b, v3.4b[0]\n" /* out20 = b2 * a10[0], b2 = q7*/ \ - "sdot v25.4s, v7.16b, v3.4b[1]\n" /* out21 = b2 * a10[0], b2 = q7*/ \ - "sdot v28.4s, v7.16b, v3.4b[2]\n" /* out22 = b2 * a10[0], b2 = q7*/ \ - "sdot v31.4s, v7.16b, v3.4b[3]\n" /* out23 = b2 * a10[0], b2 = q7*/ \ - "b 11f\n" /* tails==1 final tail*/ \ - "3: \n" /* tail=1*/ \ - "ldr q6, [%[b_ptr]], #16\n" /* load b2 to q6*/ \ - "sdot v8.4s , v4.16b, v0.4b[0]\n" /* out0 = b0 * a10[0], b0 = q5*/ \ - "sdot v11.4s , v4.16b, v0.4b[1]\n" /* out1 = b0 * a10[1], b0 = q5*/ \ - "sdot v14.4s, v4.16b, v0.4b[2]\n" /* out2 = b0 * a10[2], b0 = q5*/ \ - "sdot v17.4s, v4.16b, v0.4b[3]\n" /* out3 = b0 * a10[3], b0 = q5*/ \ - "sdot v20.4s, v4.16b, v1.4b[0]\n" /* out4 = b0 * a11[0], b0 = q5*/ \ - "sdot v23.4s, v4.16b, v1.4b[1]\n" /* out5 = b0 * a11[1], b0 = q5*/ \ - "sdot v26.4s, v4.16b, v1.4b[2]\n" /* out6 = b0 * a11[2], b0 = q5*/ \ - "sdot v29.4s, v4.16b, v1.4b[3]\n" /* out7 = b0 * a11[3], b0 = q5*/ \ - "sdot v9.4s, v5.16b, v0.4b[0]\n" /* out8 = b0 * a10[0], b1 = q6*/ \ - "sdot v12.4s, v5.16b, v0.4b[1]\n" /* out9 = b1 * a10[1], b1 = q6*/ \ - "sdot v15.4s, 
v5.16b, v0.4b[2]\n" /* out10 = b1 * a10[2], b1 = q6*/ \ - "sdot v18.4s, v5.16b, v0.4b[3]\n" /* out11 = b1 * a10[3], b1 = q6*/ \ - "sdot v21.4s, v5.16b, v1.4b[0]\n" /* out12 = b1 * a10[0], b1 = q6*/ \ - "sdot v24.4s, v5.16b, v1.4b[1]\n" /* out13 = b1 * a10[1], b1 = q6*/ \ - "sdot v27.4s, v5.16b, v1.4b[2]\n" /* out14 = b1 * a10[2], b1 = q6*/ \ - "sdot v30.4s, v5.16b, v1.4b[3]\n" /* out15 = b1 * a10[3], b1 = q6*/ \ - "sdot v10.4s, v6.16b, v0.4b[0]\n" /* out16 = b2 * a10[0], b2 = q7*/ \ - "sdot v13.4s, v6.16b, v0.4b[1]\n" /* out17 = b2 * a10[0], b2 = q7*/ \ - "sdot v16.4s, v6.16b, v0.4b[2]\n" /* out18 = b2 * a10[0], b2 = q7*/ \ - "sdot v19.4s, v6.16b, v0.4b[3]\n" /* out19 = b2 * a10[0], b2 = q7*/ \ - "sdot v22.4s, v6.16b, v1.4b[0]\n" /* out20 = b2 * a10[0], b2 = q7*/ \ - "sdot v25.4s, v6.16b, v1.4b[1]\n" /* out21 = b2 * a10[0], b2 = q7*/ \ - "sdot v28.4s, v6.16b, v1.4b[2]\n" /* out22 = b2 * a10[0], b2 = q7*/ \ - "sdot v31.4s, v6.16b, v1.4b[3]\n" /* out23 = b2 * a10[0], b2 = q7*/ \ - "b 11f\n" /* tails==2 final tail*/ \ - "4:\n" /* tail = 2*/ \ - "sdot v8.4s , v7.16b, v2.4b[0]\n" /* out0 = b0 * a10[0], b0 = q5*/ \ - "sdot v11.4s , v7.16b, v2.4b[1]\n" /* out1 = b0 * a10[1], b0 = q5*/ \ - "sdot v14.4s, v7.16b, v2.4b[2]\n" /* out2 = b0 * a10[2], b0 = q5*/ \ - "sdot v17.4s, v7.16b, v2.4b[3]\n" /* out3 = b0 * a10[3], b0 = q5*/ \ - "sdot v20.4s, v7.16b, v3.4b[0]\n" /* out4 = b0 * a11[0], b0 = q5*/ \ - "sdot v23.4s, v7.16b, v3.4b[1]\n" /* out5 = b0 * a11[1], b0 = q5*/ \ - "sdot v26.4s, v7.16b, v3.4b[2]\n" /* out6 = b0 * a11[2], b0 = q5*/ \ - "sdot v29.4s, v7.16b, v3.4b[3]\n" /* out7 = b0 * a11[3], b0 = q5*/ \ - "sdot v9.4s, v4.16b, v2.4b[0]\n" /* out8 = b0 * a10[0], b1 = q6*/ \ - "sdot v12.4s, v4.16b, v2.4b[1]\n" /* out9 = b1 * a10[1], b1 = q6*/ \ - "sdot v15.4s, v4.16b, v2.4b[2]\n" /* out10 = b1 * a10[2], b1 = q6*/ \ - "sdot v18.4s, v4.16b, v2.4b[3]\n" /* out11 = b1 * a10[3], b1 = q6*/ \ - "sdot v21.4s, v4.16b, v3.4b[0]\n" /* out12 = b1 * a10[0], b1 = q6*/ \ - "sdot v24.4s, v4.16b, v3.4b[1]\n" /* out13 = b1 * a10[1], b1 = q6*/ \ - "sdot v27.4s, v4.16b, v3.4b[2]\n" /* out14 = b1 * a10[2], b1 = q6*/ \ - "sdot v30.4s, v4.16b, v3.4b[3]\n" /* out15 = b1 * a10[3], b1 = q6*/ \ - "sdot v10.4s, v5.16b, v2.4b[0]\n" /* out16 = b2 * a10[0], b2 = q7*/ \ - "sdot v13.4s, v5.16b, v2.4b[1]\n" /* out17 = b2 * a10[0], b2 = q7*/ \ - "sdot v16.4s, v5.16b, v2.4b[2]\n" /* out18 = b2 * a10[0], b2 = q7*/ \ - "sdot v19.4s, v5.16b, v2.4b[3]\n" /* out19 = b2 * a10[0], b2 = q7*/ \ - "sdot v22.4s, v5.16b, v3.4b[0]\n" /* out20 = b2 * a10[0], b2 = q7*/ \ - "sdot v25.4s, v5.16b, v3.4b[1]\n" /* out21 = b2 * a10[0], b2 = q7*/ \ - "sdot v28.4s, v5.16b, v3.4b[2]\n" /* out22 = b2 * a10[0], b2 = q7*/ \ - "sdot v31.4s, v5.16b, v3.4b[3]\n" /* out23 = b2 * a10[0], b2 = q7*/ \ - "b 11f\n" /* tails==3 final tail*/ \ - "5:\n" /* tail = 3*/ \ - "ldr q4, [%[b_ptr]], #16\n" /* load b2, b0 to q4*/ \ - "sdot v8.4s , v6.16b, v0.4b[0]\n" /* out0 = b0 * a10[0], b0 = q5*/ \ - "sdot v11.4s , v6.16b, v0.4b[1]\n" /* out1 = b0 * a10[1], b0 = q5*/ \ - "sdot v14.4s, v6.16b, v0.4b[2]\n" /* out2 = b0 * a10[2], b0 = q5*/ \ - "sdot v17.4s, v6.16b, v0.4b[3]\n" /* out3 = b0 * a10[3], b0 = q5*/ \ - "sdot v20.4s, v6.16b, v1.4b[0]\n" /* out4 = b0 * a11[0], b0 = q5*/ \ - "sdot v23.4s, v6.16b, v1.4b[1]\n" /* out5 = b0 * a11[1], b0 = q5*/ \ - "sdot v26.4s, v6.16b, v1.4b[2]\n" /* out6 = b0 * a11[2], b0 = q5*/ \ - "sdot v29.4s, v6.16b, v1.4b[3]\n" /* out7 = b0 * a11[3], b0 = q5*/ \ - "sdot v9.4s, v7.16b, v0.4b[0]\n" /* out8 = b0 * a10[0], b1 = q6*/ \ - "sdot v12.4s, 
v7.16b, v0.4b[1]\n" /* out9 = b1 * a10[1], b1 = q6*/ \ - "sdot v15.4s, v7.16b, v0.4b[2]\n" /* out10 = b1 * a10[2], b1 = q6*/ \ - "sdot v18.4s, v7.16b, v0.4b[3]\n" /* out11 = b1 * a10[3], b1 = q6*/ \ - "sdot v21.4s, v7.16b, v1.4b[0]\n" /* out12 = b1 * a10[0], b1 = q6*/ \ - "sdot v24.4s, v7.16b, v1.4b[1]\n" /* out13 = b1 * a10[1], b1 = q6*/ \ - "sdot v27.4s, v7.16b, v1.4b[2]\n" /* out14 = b1 * a10[2], b1 = q6*/ \ - "sdot v30.4s, v7.16b, v1.4b[3]\n" /* out15 = b1 * a10[3], b1 = q6*/ \ - "sdot v10.4s, v4.16b, v0.4b[0]\n" /* out16 = b2 * a10[0], b2 = q7*/ \ - "sdot v13.4s, v4.16b, v0.4b[1]\n" /* out17 = b2 * a10[0], b2 = q7*/ \ - "sdot v16.4s, v4.16b, v0.4b[2]\n" /* out18 = b2 * a10[0], b2 = q7*/ \ - "sdot v19.4s, v4.16b, v0.4b[3]\n" /* out19 = b2 * a10[0], b2 = q7*/ \ - "sdot v22.4s, v4.16b, v1.4b[0]\n" /* out20 = b2 * a10[0], b2 = q7*/ \ - "sdot v25.4s, v4.16b, v1.4b[1]\n" /* out21 = b2 * a10[0], b2 = q7*/ \ - "sdot v28.4s, v4.16b, v1.4b[2]\n" /* out22 = b2 * a10[0], b2 = q7*/ \ - "sdot v31.4s, v4.16b, v1.4b[3]\n" /* out23 = b2 * a10[0], b2 = q7*/ \ - "11: \n" /* check if relu */ \ - "cbz %w[relu], 12f\n" /* skip relu */ \ - "movi v2.4s, #0\n" /* for relu*/ \ - "smax v8.4s, v8.4s, v2.4s\n" /* relu*/ \ - "smax v9.4s, v9.4s, v2.4s\n" /* relu*/ \ - "smax v10.4s, v10.4s, v2.4s\n" /* relu*/ \ - "smax v11.4s, v11.4s, v2.4s\n" /* relu*/ \ - "smax v12.4s, v12.4s, v2.4s\n" /* relu*/ \ - "smax v13.4s, v13.4s, v2.4s\n" /* relu*/ \ - "smax v14.4s, v14.4s, v2.4s\n" /* relu*/ \ - "smax v15.4s, v15.4s, v2.4s\n" /* relu*/ \ - "smax v16.4s,v16.4s,v2.4s\n" /* relu*/ \ - "smax v17.4s,v17.4s,v2.4s\n" /* relu*/ \ - "smax v18.4s, v18.4s, v2.4s\n" /* relu*/ \ - "smax v19.4s, v19.4s, v2.4s\n" /* relu*/ \ - "smax v20.4s, v20.4s, v2.4s\n" /* relu*/ \ - "smax v21.4s, v21.4s, v2.4s\n" /* relu*/ \ - "smax v22.4s, v22.4s, v2.4s\n" /* relu*/ \ - "smax v23.4s, v23.4s, v2.4s\n" /* relu*/ \ - "smax v24.4s, v24.4s, v2.4s\n" /* relu*/ \ - "smax v25.4s, v25.4s, v2.4s\n" /* relu*/ \ - "smax v26.4s, v26.4s, v2.4s\n" /* relu*/ \ - "smax v27.4s, v27.4s, v2.4s\n" /* relu*/ \ - "smax v28.4s, v28.4s, v2.4s\n" /* relu*/ \ - "smax v29.4s, v29.4s, v2.4s\n" /* relu*/ \ - "smax v30.4s, v30.4s, v2.4s\n" /* relu*/ \ - "smax v31.4s, v31.4s, v2.4s\n" /* relu*/ \ - "12: \n" - -#define GEMM_SDOT_INT32_OUT \ - "st1 {v8.4s, v9.4s, v10.4s},[%[c_ptr0]], #48\n" /* store r0 */ \ - "st1 {v11.4s, v12.4s, v13.4s},[%[c_ptr1]], #48\n" /* store r1 */ \ - "st1 {v14.4s, v15.4s, v16.4s},[%[c_ptr2]], #48\n" /* store r2 */ \ - "st1 {v17.4s, v18.4s, v19.4s},[%[c_ptr3]], #48\n" /* store r3 */ \ - "st1 {v20.4s, v21.4s, v22.4s},[%[c_ptr4]], #48\n" /* store r4 */ \ - "st1 {v23.4s, v24.4s, v25.4s},[%[c_ptr5]], #48\n" /* store r5 */ \ - "st1 {v26.4s, v27.4s, v28.4s},[%[c_ptr6]], #48\n" /* store r6 */ \ - "st1 {v29.4s, v30.4s, v31.4s},[%[c_ptr7]], #48\n" /* store r7 */ - -#define GEMM_SDOT_FP32_OUT \ - "ldp q0, q1, [%[scale]]\n" /* load scale */ \ - "scvtf v2.4s , v8.4s\n" /* 00, convert to fp32 */ \ - "scvtf v3.4s , v9.4s\n" /* 01, convert to fp32 */ \ - "scvtf v4.4s , v10.4s\n" /* 02, convert to fp32 */ \ - "scvtf v5.4s , v11.4s\n" /* 03, convert to fp32 */ \ - "scvtf v6.4s , v12.4s\n" /* 00, convert to fp32 */ \ - "scvtf v7.4s , v13.4s\n" /* 00, convert to fp32 */ \ - "fmul v8.4s, v2.4s, v0.s[0]\n" /* 00, mul scale to get final */ \ - "fmul v9.4s, v3.4s, v0.s[0]\n" /* 00, mul scale to get final */ \ - "fmul v10.4s, v4.4s, v0.s[0]\n" /* 00, mul scale to get final */ \ - "fmul v11.4s, v5.4s, v0.s[1]\n" /* 00, mul scale to get final */ \ - "fmul v12.4s, v6.4s, v0.s[1]\n" 
/* 00, mul scale to get final */ \ - "fmul v13.4s, v7.4s, v0.s[1]\n" /* 00, mul scale to get final */ \ - "scvtf v2.4s , v14.4s\n" /* 00, convert to fp32 */ \ - "scvtf v3.4s , v15.4s\n" /* 01, convert to fp32 */ \ - "scvtf v4.4s , v16.4s\n" /* 02, convert to fp32 */ \ - "scvtf v5.4s , v17.4s\n" /* 03, convert to fp32 */ \ - "scvtf v6.4s , v18.4s\n" /* 00, convert to fp32 */ \ - "scvtf v7.4s , v19.4s\n" /* 00, convert to fp32 */ \ - "st1 {v8.4s, v9.4s, v10.4s},[%[c_ptr0]], #48\n" /* store r0 */ \ - "st1 {v11.4s, v12.4s, v13.4s},[%[c_ptr1]], #48\n" /* store r1 */ \ - "fmul v14.4s, v2.4s, v0.s[2]\n" /* 00, mul scale to get final */ \ - "fmul v15.4s, v3.4s, v0.s[2]\n" /* 00, mul scale to get final */ \ - "fmul v16.4s, v4.4s, v0.s[2]\n" /* 00, mul scale to get final */ \ - "fmul v17.4s, v5.4s, v0.s[3]\n" /* 00, mul scale to get final */ \ - "fmul v18.4s, v6.4s, v0.s[3]\n" /* 00, mul scale to get final */ \ - "fmul v19.4s, v7.4s, v0.s[3]\n" /* 00, mul scale to get final */ \ - "scvtf v2.4s , v20.4s\n" /* 00, convert to fp32 */ \ - "scvtf v3.4s , v21.4s\n" /* 01, convert to fp32 */ \ - "scvtf v4.4s , v22.4s\n" /* 02, convert to fp32 */ \ - "scvtf v5.4s , v23.4s\n" /* 03, convert to fp32 */ \ - "scvtf v6.4s , v24.4s\n" /* 00, convert to fp32 */ \ - "scvtf v7.4s , v25.4s\n" /* 00, convert to fp32 */ \ - "st1 {v14.4s, v15.4s, v16.4s},[%[c_ptr2]], #48\n" /* store r2 */ \ - "st1 {v17.4s, v18.4s, v19.4s},[%[c_ptr3]], #48\n" /* store r3 */ \ - "fmul v20.4s, v2.4s, v1.s[0]\n" /* 00, mul scale to get final */ \ - "fmul v21.4s, v3.4s, v1.s[0]\n" /* 00, mul scale to get final */ \ - "fmul v22.4s, v4.4s, v1.s[0]\n" /* 00, mul scale to get final */ \ - "fmul v23.4s, v5.4s, v1.s[1]\n" /* 00, mul scale to get final */ \ - "fmul v24.4s, v6.4s, v1.s[1]\n" /* 00, mul scale to get final */ \ - "fmul v25.4s, v7.4s, v1.s[1]\n" /* 00, mul scale to get final */ \ - "scvtf v2.4s , v26.4s\n" /* 00, convert to fp32 */ \ - "scvtf v3.4s , v27.4s\n" /* 01, convert to fp32 */ \ - "scvtf v4.4s , v28.4s\n" /* 02, convert to fp32 */ \ - "scvtf v5.4s , v29.4s\n" /* 03, convert to fp32 */ \ - "scvtf v6.4s , v30.4s\n" /* 00, convert to fp32 */ \ - "scvtf v7.4s , v31.4s\n" /* 00, convert to fp32 */ \ - "st1 {v20.4s, v21.4s, v22.4s},[%[c_ptr4]], #48\n" /* store r4 */ \ - "st1 {v23.4s, v24.4s, v25.4s},[%[c_ptr5]], #48\n" /* store r5 */ \ - "fmul v26.4s, v2.4s, v1.s[2]\n" /* 00, mul scale to get final */ \ - "fmul v27.4s, v3.4s, v1.s[2]\n" /* 00, mul scale to get final */ \ - "fmul v28.4s, v4.4s, v1.s[2]\n" /* 00, mul scale to get final */ \ - "fmul v29.4s, v5.4s, v1.s[3]\n" /* 00, mul scale to get final */ \ - "fmul v30.4s, v6.4s, v1.s[3]\n" /* 00, mul scale to get final */ \ - "fmul v31.4s, v7.4s, v1.s[3]\n" /* 00, mul scale to get final */ \ - "st1 {v26.4s, v27.4s, v28.4s},[%[c_ptr6]], #48\n" /* store r6 */ \ - "st1 {v29.4s, v30.4s, v31.4s},[%[c_ptr7]], #48\n" /* store r7 */ - -#define GEMM_SDOT_INT8_OUT \ - "ldp q0, q1, [%[scale]]\n" /* load scale */ \ - "scvtf v2.4s , v8.4s\n" /* 00, convert to fp32 */ \ - "scvtf v3.4s , v9.4s\n" /* 01, convert to fp32 */ \ - "scvtf v4.4s , v10.4s\n" /* 02, convert to fp32 */ \ - "scvtf v5.4s , v11.4s\n" /* 03, convert to fp32 */ \ - "scvtf v6.4s , v12.4s\n" /* 00, convert to fp32 */ \ - "scvtf v7.4s , v13.4s\n" /* 00, convert to fp32 */ \ - "fmul v8.4s, v2.4s, v0.s[0]\n" /* 00, mul scale to get final*/ \ - "fmul v9.4s, v3.4s, v0.s[0]\n" /* 00, mul scale to get final*/ \ - "fmul v10.4s, v4.4s, v0.s[0]\n" /* 00, mul scale to get final*/ \ - "fmul v11.4s, v5.4s, v0.s[1]\n" /* 00, mul scale to get 
final*/ \ - "fmul v12.4s, v6.4s, v0.s[1]\n" /* 00, mul scale to get final*/ \ - "fmul v13.4s, v7.4s, v0.s[1]\n" /* 00, mul scale to get final*/ \ - "scvtf v2.4s , v14.4s\n" /* 00, convert to fp32 */ \ - "scvtf v3.4s , v15.4s\n" /* 01, convert to fp32 */ \ - "scvtf v4.4s , v16.4s\n" /* 02, convert to fp32 */ \ - "scvtf v5.4s , v17.4s\n" /* 03, convert to fp32 */ \ - "scvtf v6.4s , v18.4s\n" /* 00, convert to fp32 */ \ - "scvtf v7.4s , v19.4s\n" /* 00, convert to fp32 */ \ - "fmul v14.4s, v2.4s, v0.s[2]\n" /* 00, mul scale to get final*/ \ - "fmul v15.4s, v3.4s, v0.s[2]\n" /* 00, mul scale to get final*/ \ - "fmul v16.4s, v4.4s, v0.s[2]\n" /* 00, mul scale to get final*/ \ - "fmul v17.4s, v5.4s, v0.s[3]\n" /* 00, mul scale to get final*/ \ - "fmul v18.4s, v6.4s, v0.s[3]\n" /* 00, mul scale to get final*/ \ - "fmul v19.4s, v7.4s, v0.s[3]\n" /* 00, mul scale to get final*/ \ - "scvtf v2.4s , v20.4s\n" /* 00, convert to fp32 */ \ - "scvtf v3.4s , v21.4s\n" /* 01, convert to fp32 */ \ - "scvtf v4.4s , v22.4s\n" /* 02, convert to fp32 */ \ - "scvtf v5.4s , v23.4s\n" /* 03, convert to fp32 */ \ - "scvtf v6.4s , v24.4s\n" /* 00, convert to fp32 */ \ - "scvtf v7.4s , v25.4s\n" /* 00, convert to fp32 */ \ - "fmul v20.4s, v2.4s, v1.s[0]\n" /* 00, mul scale to get final*/ \ - "fmul v21.4s, v3.4s, v1.s[0]\n" /* 00, mul scale to get final*/ \ - "fmul v22.4s, v4.4s, v1.s[0]\n" /* 00, mul scale to get final*/ \ - "fmul v23.4s, v5.4s, v1.s[1]\n" /* 00, mul scale to get final*/ \ - "fmul v24.4s, v6.4s, v1.s[1]\n" /* 00, mul scale to get final*/ \ - "fmul v25.4s, v7.4s, v1.s[1]\n" /* 00, mul scale to get final*/ \ - "scvtf v2.4s , v26.4s\n" /* 00, convert to fp32 */ \ - "scvtf v3.4s , v27.4s\n" /* 01, convert to fp32 */ \ - "scvtf v4.4s , v28.4s\n" /* 02, convert to fp32 */ \ - "scvtf v5.4s , v29.4s\n" /* 03, convert to fp32 */ \ - "scvtf v6.4s , v30.4s\n" /* 00, convert to fp32 */ \ - "scvtf v7.4s , v31.4s\n" /* 00, convert to fp32 */ \ - "fmul v26.4s, v2.4s, v1.s[2]\n" /* 00, mul scale to get final*/ \ - "fmul v27.4s, v3.4s, v1.s[2]\n" /* 00, mul scale to get final*/ \ - "fmul v28.4s, v4.4s, v1.s[2]\n" /* 00, mul scale to get final*/ \ - "fmul v29.4s, v5.4s, v1.s[3]\n" /* 00, mul scale to get final*/ \ - "fmul v30.4s, v6.4s, v1.s[3]\n" /* 00, mul scale to get final*/ \ - "fmul v31.4s, v7.4s, v1.s[3]\n" /* 00, mul scale to get final*/ \ - "fcvtas v0.4s, v8.4s\n" /* 00, cvt to int */ \ - "fcvtas v1.4s, v9.4s\n" /* 00, cvt to int */ \ - "fcvtas v2.4s, v10.4s\n" /* 00, cvt to int */ \ - "fcvtas v3.4s, v11.4s\n" /* 00, cvt to int */ \ - "fcvtas v4.4s, v12.4s\n" /* 00, cvt to int */ \ - "fcvtas v5.4s, v13.4s\n" /* 00, cvt to int */ \ - "sqxtn v8.4h, v0.4s\n" /* 00, cvt int32 to int16 */ \ - "sqxtn2 v8.8h, v1.4s\n" /* 00, cvt int32 to int16 */ \ - "sqxtn v9.4h, v2.4s\n" /* 00, cvt int32 to int16 */ \ - "fcvtas v0.4s, v14.4s\n" /* 00, cvt to int */ \ - "fcvtas v1.4s, v15.4s\n" /* 00, cvt to int */ \ - "fcvtas v2.4s, v16.4s\n" /* 00, cvt to int */ \ - "sqxtn v11.4h, v3.4s\n" /* 00, cvt int32 to int16 */ \ - "sqxtn2 v11.8h, v4.4s\n" /* 00, cvt int32 to int16 */ \ - "sqxtn v12.4h, v5.4s\n" /* 00, cvt int32 to int16 */ \ - "fcvtas v3.4s, v17.4s\n" /* 00, cvt to int */ \ - "fcvtas v4.4s, v18.4s\n" /* 00, cvt to int */ \ - "fcvtas v5.4s, v19.4s\n" /* 00, cvt to int */ \ - "sqxtn v14.4h, v0.4s\n" /* 00, cvt int32 to int16 */ \ - "sqxtn2 v14.8h, v1.4s\n" /* 00, cvt int32 to int16 */ \ - "sqxtn v15.4h, v2.4s\n" /* 00, cvt int32 to int16 */ \ - "fcvtas v0.4s, v20.4s\n" /* 00, cvt to int */ \ - "fcvtas v1.4s, v21.4s\n" /* 00, 
cvt to int */ \ - "fcvtas v2.4s, v22.4s\n" /* 00, cvt to int */ \ - "sqxtn v17.4h, v3.4s\n" /* 00, cvt int32 to int16 */ \ - "sqxtn2 v17.8h, v4.4s\n" /* 00, cvt int32 to int16 */ \ - "sqxtn v18.4h, v5.4s\n" /* 00, cvt int32 to int16 */ \ - "fcvtas v3.4s, v23.4s\n" /* 00, cvt to int */ \ - "fcvtas v4.4s, v24.4s\n" /* 00, cvt to int */ \ - "fcvtas v5.4s, v25.4s\n" /* 00, cvt to int */ \ - "sqxtn v20.4h, v0.4s\n" /* 00, cvt int32 to int16 */ \ - "sqxtn2 v20.8h, v1.4s\n" /* 00, cvt int32 to int16 */ \ - "sqxtn v21.4h, v2.4s\n" /* 00, cvt int32 to int16 */ \ - "fcvtas v0.4s, v26.4s\n" /* 00, cvt to int */ \ - "fcvtas v1.4s, v27.4s\n" /* 00, cvt to int */ \ - "fcvtas v2.4s, v28.4s\n" /* 00, cvt to int */ \ - "sqxtn v23.4h, v3.4s\n" /* 00, cvt int32 to int16 */ \ - "sqxtn2 v23.8h, v4.4s\n" /* 00, cvt int32 to int16 */ \ - "sqxtn v24.4h, v5.4s\n" /* 00, cvt int32 to int16 */ \ - "fcvtas v3.4s, v29.4s\n" /* 00, cvt to int */ \ - "fcvtas v4.4s, v30.4s\n" /* 00, cvt to int */ \ - "fcvtas v5.4s, v31.4s\n" /* 00, cvt to int */ \ - "sqxtn v26.4h, v0.4s\n" /* 00, cvt int32 to int16 */ \ - "sqxtn2 v26.8h, v1.4s\n" /* 00, cvt int32 to int16 */ \ - "sqxtn v27.4h, v2.4s\n" /* 00, cvt int32 to int16 */ \ - "sqxtn v29.4h, v3.4s\n" /* 00, cvt int32 to int16 */ \ - "sqxtn2 v29.8h, v4.4s\n" /* 00, cvt int32 to int16 */ \ - "sqxtn v30.4h, v5.4s\n" /* 00, cvt int32 to int16 */ \ - "sqxtn v4.8b, v8.8h\n" /* 00, 01, cvt int16 to int8 */ \ - "sqxtn v0.8b, v9.8h\n" /* 00, 01, cvt int16 to int8 */ \ - "sqxtn v5.8b, v11.8h\n" /* 00, 01, cvt int16 to int8 */ \ - "sqxtn v1.8b, v12.8h\n" /* 00, 01, cvt int16 to int8 */ \ - "sqxtn v6.8b, v14.8h\n" /* 00, 01, cvt int16 to int8 */ \ - "sqxtn v2.8b, v15.8h\n" /* 00, 01, cvt int16 to int8 */ \ - "sqxtn v7.8b, v17.8h\n" /* 00, 01, cvt int16 to int8 */ \ - "sqxtn v3.8b, v18.8h\n" /* 00, 01, cvt int16 to int8 */ \ - "sqxtn v16.8b, v20.8h\n" /* 00, 01, cvt int16 to int8 */ \ - "sqxtn v15.8b, v21.8h\n" /* 00, 01, cvt int16 to int8 */ \ - "sqxtn v20.8b, v23.8h\n" /* 00, 01, cvt int16 to int8 */ \ - "sqxtn v17.8b, v24.8h\n" /* 00, 01, cvt int16 to int8 */ \ - "sqxtn v24.8b, v26.8h\n" /* 00, 01, cvt int16 to int8 */ \ - "sqxtn v18.8b, v27.8h\n" /* 00, 01, cvt int16 to int8 */ \ - "sqxtn v28.8b, v29.8h\n" /* 00, 01, cvt int16 to int8 */ \ - "sqxtn v19.8b, v30.8h\n" /* 00, 01, cvt int16 to int8 */ \ - "st1 {v4.8b},[%[c_ptr0]], #8\n" /* store r0 */ \ - "st1 {v5.8b},[%[c_ptr1]], #8\n" /* store r0 */ \ - "st1 {v6.8b},[%[c_ptr2]], #8\n" /* store r0 */ \ - "st1 {v7.8b},[%[c_ptr3]], #8\n" /* store r0 */ \ - "st1 {v16.8b},[%[c_ptr4]], #8\n" /* store r0 */ \ - "st1 {v20.8b},[%[c_ptr5]], #8\n" /* store r0 */ \ - "st1 {v24.8b},[%[c_ptr6]], #8\n" /* store r0 */ \ - "st1 {v28.8b},[%[c_ptr7]], #8\n" /* store r0 */ \ - "str s0,[%[c_ptr0]], #4\n" /* store r0 */ \ - "str s1,[%[c_ptr1]], #4\n" /* store r0 */ \ - "str s2,[%[c_ptr2]], #4\n" /* store r0 */ \ - "str s3,[%[c_ptr3]], #4\n" /* store r0 */ \ - "str s15,[%[c_ptr4]], #4\n" /* store r0 */ \ - "str s17,[%[c_ptr5]], #4\n" /* store r0 */ \ - "str s18,[%[c_ptr6]], #4\n" /* store r0 */ \ - "str s19,[%[c_ptr7]], #4\n" /* store r0 */ - -template <> -inline void sgemm_sdot_int8_kernel(const int8_t* a_ptr, - const int8_t*& b_ptr, // NOLINT - const int32_t* bias, - int32_t*& c_ptr0, // NOLINT - int32_t*& c_ptr1, // NOLINT - int32_t*& c_ptr2, // NOLINT - int32_t*& c_ptr3, // NOLINT - int32_t*& c_ptr4, // NOLINT - int32_t*& c_ptr5, // NOLINT - int32_t*& c_ptr6, // NOLINT - int32_t*& c_ptr7, // NOLINT - const float32_t* scale, - bool is_relu, - int k, - int 
tail) { - asm volatile(_DECLARE_SDOT_ELEMENT GEMM_SDOT_INT8_KERNEL GEMM_SDOT_INT32_OUT - : [a_ptr] "+r"(a_ptr), - [b_ptr] "+r"(b_ptr), - [k] "+r"(k), - [tail] "+r"(tail), - [c_ptr0] "+r"(c_ptr0), - [c_ptr1] "+r"(c_ptr1), - [c_ptr2] "+r"(c_ptr2), - [c_ptr3] "+r"(c_ptr3), - [c_ptr4] "+r"(c_ptr4), - [c_ptr5] "+r"(c_ptr5), - [c_ptr6] "+r"(c_ptr6), - [c_ptr7] "+r"(c_ptr7) - : [bias_ptr] "r"(bias), [scale] "r"(scale), [relu] "r"(is_relu) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "v26", - "v27", - "v28", - "v29", - "v30", - "v31"); -} -template <> -inline void sgemm_sdot_int8_kernel(const int8_t* a_ptr, - const int8_t*& b_ptr, // NOLINT - const int32_t* bias, - float32_t*& c_ptr0, // NOLINT - float32_t*& c_ptr1, // NOLINT - float32_t*& c_ptr2, // NOLINT - float32_t*& c_ptr3, // NOLINT - float32_t*& c_ptr4, // NOLINT - float32_t*& c_ptr5, // NOLINT - float32_t*& c_ptr6, // NOLINT - float32_t*& c_ptr7, // NOLINT - const float32_t* scale, - bool is_relu, - int k, - int tail) { - asm volatile(GEMM_SDOT_INT8_KERNEL GEMM_SDOT_FP32_OUT - : [a_ptr] "+r"(a_ptr), - [b_ptr] "+r"(b_ptr), - [k] "+r"(k), - [tail] "+r"(tail), - [c_ptr0] "+r"(c_ptr0), - [c_ptr1] "+r"(c_ptr1), - [c_ptr2] "+r"(c_ptr2), - [c_ptr3] "+r"(c_ptr3), - [c_ptr4] "+r"(c_ptr4), - [c_ptr5] "+r"(c_ptr5), - [c_ptr6] "+r"(c_ptr6), - [c_ptr7] "+r"(c_ptr7) - : [bias_ptr] "r"(bias), [scale] "r"(scale), [relu] "r"(is_relu) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "v26", - "v27", - "v28", - "v29", - "v30", - "v31"); -} -template <> -inline void sgemm_sdot_int8_kernel(const int8_t* a_ptr, - const int8_t*& b_ptr, // NOLINT - const int32_t* bias, - int8_t*& c_ptr0, // NOLINT - int8_t*& c_ptr1, // NOLINT - int8_t*& c_ptr2, // NOLINT - int8_t*& c_ptr3, // NOLINT - int8_t*& c_ptr4, // NOLINT - int8_t*& c_ptr5, // NOLINT - int8_t*& c_ptr6, // NOLINT - int8_t*& c_ptr7, // NOLINT - const float32_t* scale, - bool is_relu, - int k, - int tail) { - asm volatile(GEMM_SDOT_INT8_KERNEL GEMM_SDOT_INT8_OUT - : [a_ptr] "+r"(a_ptr), - [b_ptr] "+r"(b_ptr), - [k] "+r"(k), - [tail] "+r"(tail), - [c_ptr0] "+r"(c_ptr0), - [c_ptr1] "+r"(c_ptr1), - [c_ptr2] "+r"(c_ptr2), - [c_ptr3] "+r"(c_ptr3), - [c_ptr4] "+r"(c_ptr4), - [c_ptr5] "+r"(c_ptr5), - [c_ptr6] "+r"(c_ptr6), - [c_ptr7] "+r"(c_ptr7) - : [bias_ptr] "r"(bias), [scale] "r"(scale), [relu] "r"(is_relu) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "v26", - "v27", - "v28", - "v29", - "v30", - "v31"); -} -#endif - -#else // armv7 -// clang-format off -#define GEMM_INT8_KERNEL \ - "vld1.8 {d0-d1}, [%[a_ptr]: 128]!\n" /* load 4x2x2 int8, A, k2x2 */ \ - "vld1.8 {d4-d7}, [%[b_ptr]: 128]!\n" /* load 8x2x2 int8, B, k2x2 */ \ - "vld1.8 {d8-d9}, [%[bias]]\n" /* load int32x4 bias */ \ - "vext.8 q5, q4, q4, #4\n" /* bias shift 1 int32 */ \ - "vext.8 q6, q4, q4, #8\n" /* bias shift 2 int32 */ \ - "vext.8 q7, q4, q4, #12\n" /* bias shift 3 int32 */ \ - "pld [%[a_ptr]]\n" /* preload A */ \ - "vand q8, q4, q4\n" /* set bias to out00 
*/ \ - "vand q9, q4, q4\n" /* set bias to out01 */ \ - "pld [%[b_ptr]]\n" /* preload B */ \ - "vand q10, q5, q5\n" /* set bias to out10 */ \ - "vand q11, q5, q5\n" /* set bias to out11 */ \ - "pld [%[b_ptr], #64]\n" /* preload B */ \ - "vand q12, q6, q6\n" /* set bias to out20 */ \ - "vand q13, q6, q6\n" /* set bias to out21 */ \ - "pld [%[b_ptr], #128]\n" /* preload B */ \ - "vand q14, q7, q7\n" /* set bias to out30 */ \ - "vand q15, q7, q7\n" /* set bias to out31 */ \ - "pld [%[a_ptr], #64]\n" /* preload A */ \ - "vext.8 d2, d0, d0, #2\n" /* shift left circular by 2byte */ \ - "vext.8 d3, d1, d1, #2\n" /* shift left circular by 2byte */ \ - "pld [%[b_ptr], #192]\n" /* preload b */ \ - "pld [%[b_ptr], #256]\n" /* preload b */ \ - "pld [%[a_ptr], #128]\n" /* preload a */ \ - "cmp %[k], #0\n" /* check main loop count */ \ - "beq 3f\n" /* if k = 0, jump to remains */ /* 1st r0, r1 */ \ - "vmull.s8 q4, d0, d4\n" /* a0 * b0 = c00 */ \ - "vmull.s8 q5, d0, d5\n" /* a0 * b1 = c01 */ \ - "vmull.s8 q6, d2, d4\n" /* a1 * b0 = c10 */ \ - "vmull.s8 q7, d2, d5\n" /* a1 * b1 = c11 */ \ - "subs %[k], %[k], #1\n" /* loop count -1 */ /* 2nd r0, r1 */ \ - "vmlal.s8 q4, d1, d6\n" /* a0 * b0 = c00 */ \ - "vmlal.s8 q5, d1, d7\n" /* a0 * b1 = c01 */ \ - "vrev64.32 q0, q0\n" /* shift left circular by 4byte */ \ - "vmlal.s8 q6, d3, d6\n" /* a1 * b0 = c10 */ \ - "vmlal.s8 q7, d3, d7\n" /* a1 * b1 = c11 */ \ - "vrev64.32 q1, q1\n" /* shift left circular by 4byte */ \ - "beq 8f\n" /* skip main loop */ /* main loop*/ \ - "0:\n" /* main loop */ /* 1st r2, r3 */ \ - "vpadal.s16 q8, q4\n" /* pair add and accumulate to int32, c00 */ \ - "vmull.s8 q4, d0, d4\n" /* a2 * b0 = c20 */ \ - "vpadal.s16 q9, q5\n" /* pair add and accumulate to int32, c01 */ \ - "vmull.s8 q5, d0, d5\n" /* a2 * b1 = c21 */ \ - "vpadal.s16 q10,q6\n" /* pair add and accumulate to int32, c10 */ \ - "vmull.s8 q6, d2, d4\n" /* a3 * b0 = c30 */ \ - "vpadal.s16 q11,q7\n" /* pair add and accumulate to int32, c11 */ \ - "vmull.s8 q7, d2, d5\n" /* a3 * b1 = c31 */ \ - "vld1.8 {d4-d5}, [%[b_ptr]: 128]!\n" /* load 4x2x2 int8, B, k2x2 */ \ - "vmlal.s8 q4, d1, d6\n" /* a0 * b0 = c00 */ \ - "vmlal.s8 q5, d1, d7\n" /* a0 * b1 = c01 */ \ - "vld1.8 {d0-d1}, [%[a_ptr]: 128]!\n" /* load 4x2x2 int8, A, k2x2 */ \ - "vmlal.s8 q6, d3, d6\n" /* a1 * b0 = c10 */ \ - "vmlal.s8 q7, d3, d7\n" /* a1 * b1 = c11 */ \ - "vld1.8 {d6-d7}, [%[b_ptr]: 128]!\n" /* load 4x2x2 int8, B, k2x2 */ \ - "vext.8 d2, d0, d0, #2\n" /* shift left circular by 2byte */ \ - "vext.8 d3, d1, d1, #2\n" /* shift left circular by 2byte */ \ - "vpadal.s16 q12,q4\n" /* pair add and accumulate to int32, c20 */ \ - "vmull.s8 q4, d0, d4\n" /* a0 * b0 = c00 */ \ - "vpadal.s16 q13,q5\n" /* pair add and accumulate to int32, c21 */ \ - "vmull.s8 q5, d0, d5\n" /* a0 * b1 = c01 */ \ - "vpadal.s16 q14,q6\n" /* pair add and accumulate to int32, c30 */ \ - "vmull.s8 q6, d2, d4\n" /* a1 * b0 = c10 */ \ - "vpadal.s16 q15,q7\n" /* pair add and accumulate to int32, c31 */ \ - "vmull.s8 q7, d2, d5\n" /* a1 * b1 = c11 */ \ - "subs %[k], %[k], #1\n" /* loop count -1 */ /* 2nd r0, r1 */ \ - "vmlal.s8 q4, d1, d6\n" /* a0 * b0 = c00 */ \ - "vmlal.s8 q5, d1, d7\n" /* a0 * b1 = c01 */ \ - "vrev64.32 q0, q0\n" /* shift left circular by 2 */ \ - "vmlal.s8 q6, d3, d6\n" /* a1 * b0 = c10 */ \ - "vmlal.s8 q7, d3, d7\n" /* a1 * b1 = c11 */ \ - "vrev64.32 q1, q1\n" /* shift left circular by 2 */ \ - "bgt 0b\n" /* jump to main loop */ \ - "8:\n" /* end of main loop */ /* 1st r2, r3 */ \ - "vpadal.s16 q8, q4\n" /* pair add and accumulate 
to int32, c00 */ \ - "vmull.s8 q4, d0, d4\n" /* a2 * b0 = c20 */ \ - "vpadal.s16 q9, q5\n" /* pair add and accumulate to int32, c01 */ \ - "vmull.s8 q5, d0, d5\n" /* a2 * b1 = c21 */ \ - "vpadal.s16 q10,q6\n" /* pair add and accumulate to int32, c10 */ \ - "vmull.s8 q6, d2, d4\n" /* a3 * b0 = c30 */ \ - "vpadal.s16 q11,q7\n" /* pair add and accumulate to int32, c11 */ \ - "vmull.s8 q7, d2, d5\n" /* a3 * b1 = c31 */ /* 2nd r2, r3 */ \ - "vmlal.s8 q4, d1, d6\n" /* a0 * b0 = c20 */ \ - "vmlal.s8 q5, d1, d7\n" /* a0 * b1 = c21 */ \ - "vmlal.s8 q6, d3, d6\n" /* a1 * b0 = c30 */ \ - "vmlal.s8 q7, d3, d7\n" /* a1 * b1 = c31 */ \ - "cmp %[rem], #0\n" /* skip remain */ \ - "beq 5f\n" \ - "mov r0, #32\n" /* address offset */ \ - "vld1.8 {d0}, [%[a_ptr]]\n" /* load a to d0, final */ \ - "vld1.8 {d4-d5}, [%[b_ptr]], r0\n" /* load b to d4, d5 */ \ - "5:\n" /* skip rem */ \ - "vpadal.s16 q12, q4\n" /* pair add and accumulate to int32, c20 */ \ - "vpadal.s16 q13, q5\n" /* pair add and accumulate to int32, c21 */ \ - "vpadal.s16 q14, q6\n" /* pair add and accumulate to int32, c30 */ \ - "vpadal.s16 q15, q7\n" /* pair add and accumulate to int32, c31 */ \ - "3:\n" /* process remain k */ \ - "cmp %[rem], #0\n" /* skip remain */ \ - "beq 7f\n" /* process remain k */ \ - "vext.8 d1, d0, d0, #2\n" /* shift left 2bytes */ \ - "vext.8 d2, d0, d0, #4\n" /* shift left 4bytes */ \ - "vext.8 d3, d0, d0, #6\n" /* shift left 6bytes */ /* 1st r0, r1 */ \ - "vmull.s8 q4, d0, d4\n" /* a0 * b0 = c00 */ \ - "vmull.s8 q5, d0, d5\n" /* a0 * b1 = c01 */ \ - "vmull.s8 q6, d1, d4\n" /* a1 * b0 = c10 */ \ - "vmull.s8 q7, d1, d5\n" /* a1 * b1 = c11 */ /* 1st r2, r3 */ \ - "vpadal.s16 q8, q4\n" /* pair add and accumulate to int32, c00 */ \ - "vmull.s8 q4, d2, d4\n" /* a2 * b0 = c20 */ \ - "vpadal.s16 q9, q5\n" /* pair add and accumulate to int32, c01 */ \ - "vmull.s8 q5, d2, d5\n" /* a2 * b1 = c21 */ \ - "vpadal.s16 q10,q6\n" /* pair add and accumulate to int32, c10 */ \ - "vmull.s8 q6, d3, d4\n" /* a3 * b0 = c30 */ \ - "vpadal.s16 q11,q7\n" /* pair add and accumulate to int32, c11 */ \ - "vmull.s8 q7, d3, d5\n" /* a3 * b1 = c31 */ \ - "vpadal.s16 q12, q4\n" /* pair add and accumulate to int32, c20 */ \ - "vpadal.s16 q13, q5\n" /* pair add and accumulate to int32, c21 */ \ - "vpadal.s16 q14, q6\n" /* pair add and accumulate to int32, c30 */ \ - "vpadal.s16 q15, q7\n" /* pair add and accumulate to int32, c31 */ \ - "7: \n" /* do relu */ /* do relu */ \ - "cmp %[is_relu], #0\n" /* skip relu */ \ - "beq 9f\n" /* skip relu */ \ - "vmov.i32 q0, #0\n" /* for relu */ \ - "vmax.s32 q8, q8, q0\n" /* relu */ \ - "vmax.s32 q9, q9, q0\n" /* relu */ \ - "vmax.s32 q10,q10, q0\n" /* relu */ \ - "vmax.s32 q11,q11, q0\n" /* relu */ \ - "vmax.s32 q12,q12, q0\n" /* relu */ \ - "vmax.s32 q13,q13, q0\n" /* relu */ \ - "vmax.s32 q14,q14, q0\n" /* relu */ \ - "vmax.s32 q15,q15, q0\n" /* relu */ /* unpack the result */ \ - "9:\n" /* unpack */ /* trans 1 */ \ - "vtrn.32 q8, q10\n" /* get q8 */ \ - "vtrn.32 q12, q14\n" /* get q12 */ \ - "vtrn.32 q9, q11\n" /* get q9 */ \ - "vtrn.32 q13, q15\n" /* get q13*/ \ - "vswp d17, d24\n" /* get q8*/ \ - "vswp d21, d28\n" /* get q10 */ \ - "vswp d19, d26\n" /* get q9 */ \ - "vswp d23, d30\n" /* get q11 */ \ - "vext.8 q0, q10, q10, #12\n" /* circular shift left 1 q0 */ \ - "vext.8 q2, q12, q12, #8\n" /* circular shift left 2 q2 */ \ - "vext.8 q4, q14, q14, #4\n" /* circular shift left 3 q4 */ \ - "vext.8 q1, q11, q11, #12\n" /* circular shift left 1 q1 */ \ - "vext.8 q3, q13, q13, #8\n" /* circular shift left 2 q3 */ 
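  /* Editor's note (illustration only, not part of the original source):  */ \
  /* the kernel above widens int8 products to int16 with vmull.s8 and     */ \
  /* vmlal.s8, then folds them into int32 accumulators via vpadal.s16,    */ \
  /* two products per lane per step. A scalar sketch of one lane of the   */ \
  /* 4x8 int32 tile this macro computes (K2 is a hypothetical name for    */ \
  /* the number of packed k2x2 pairs):                                    */ \
  /*   int32_t acc = bias[row];                                           */ \
  /*   for (int p = 0; p < 2 * K2; p += 2) {                              */ \
  /*     acc += (int16_t)a[row][p] * (int16_t)b[p][col]                   */ \
  /*          + (int16_t)a[row][p + 1] * (int16_t)b[p + 1][col];          */ \
  /*   }                                                                  */ \
  /* The vext/vrev rotations of A let all four output rows share one      */ \
  /* pair of B registers, which is why the surrounding vtrn/vswp/vext     */ \
  /* sequence must re-order the interleaved rows before writeback.        */ \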
\ - "vext.8 q5, q15, q15, #4\n" /* circular shift left 3 q5 */ \ - "vtrn.32 q8, q0\n" /* get q8 */ \ - "vtrn.32 q2, q4\n" /* get q2 */ \ - "vtrn.32 q9, q1\n" /* get q9 */ \ - "vtrn.32 q3, q5\n" /* get q3 */ /* trans 2 */ \ - "vswp d17, d4\n" /* get q8 */ \ - "vswp d1, d8\n" /* get q0: a1*/ \ - "vswp d19, d6\n" /* get q9: */ \ - "vswp d3, d10\n" /* get q1: a3b3 */ - -// clang-format off - -#define GEMM_INT8_INT32_OUT \ - /* write output */ \ - "vst1.32 {d16-d19}, [%[c_ptr0]]!\n" /* write outr0 */ \ - "vst1.32 {d0-d3}, [%[c_ptr1]]!\n" /* write outr1 */ \ - "vst1.32 {d4-d7}, [%[c_ptr2]]!\n" /* write outr2 */ \ - "vst1.32 {d8-d11}, [%[c_ptr3]]!\n" /* write outr3 */ - -#define GEMM_INT8_FP32_OUT \ - /* write output */ \ - "vld1.32 {d12-d13}, [%[scale]]\n" /* load scale */ \ - "vcvt.f32.s32 q10, q8\n" /* r00, cvt int32 to fp32*/ \ - "vcvt.f32.s32 q11, q9\n" /* r01, cvt int32 to fp32*/ \ - "vcvt.f32.s32 q12, q0\n" /* r10, cvt int32 to fp32*/ \ - "vcvt.f32.s32 q13, q1\n" /* r11, cvt int32 to fp32*/ \ - "vmul.f32 q8, q10, d12[0]\n" /* r00, mul scale to get final result */ \ - "vmul.f32 q9, q11, d12[0]\n" /* r01, mul scale to get final result */ \ - "vmul.f32 q0, q12, d12[1]\n" /* r10, mul scale to get final result */ \ - "vmul.f32 q1, q13, d12[1]\n" /* r11, mul scale to get final result */ \ - "vcvt.f32.s32 q10, q2\n" /* r20, cvt int32 to fp32*/ \ - "vcvt.f32.s32 q11, q3\n" /* r21, cvt int32 to fp32*/ \ - "vcvt.f32.s32 q12, q4\n" /* r30, cvt int32 to fp32*/ \ - "vcvt.f32.s32 q13, q5\n" /* r31, cvt int32 to fp32*/ \ - "vst1.32 {d16-d19}, [%[c_ptr0]]!\n" /* write r0, float32x4 x2 */ \ - "vmul.f32 q2, q10, d13[0]\n" /* r20, mul scale to get final result */ \ - "vmul.f32 q3, q11, d13[0]\n" /* r21, mul scale to get final result */ \ - "vst1.32 {d0-d3}, [%[c_ptr1]]!\n" /* write r1, float32x4 x2 */ \ - "vmul.f32 q4, q12, d13[1]\n" /* r30, mul scale to get final result */ \ - "vmul.f32 q5, q13, d13[1]\n" /* r31, mul scale to get final result */ \ - "vst1.32 {d4-d7}, [%[c_ptr2]]!\n" /* write r2, float32x4 x2 */ \ - "vst1.32 {d8-d11}, [%[c_ptr3]]!\n" /* write r3, float32x4 x2 */ - -#define GEMM_INT8_INT8_OUT \ - /* write output */ \ - "vld1.32 {d12-d13}, [%[scale]]\n" /* load scale */ \ - "vmov.f32 q7, #-0.5\n" /* neg offset */ \ - "vcvt.f32.s32 q10, q8\n" /* r00, cvt int32 to fp32*/ \ - "vcvt.f32.s32 q11, q9\n" /* r01, cvt int32 to fp32*/ \ - "vcvt.f32.s32 q12, q0\n" /* r10, cvt int32 to fp32*/ \ - "vcvt.f32.s32 q13, q1\n" /* r11, cvt int32 to fp32*/ \ - "vmov.f32 q8, #0.5\n" /* pos offset */ \ - "vmov.f32 q9, #0.5\n" /* pos offset */ \ - "vmov.f32 q0, #0.5\n" /* pos offset */ \ - "vmov.f32 q1, #0.5\n" /* pos offset */ \ - "vcgt.f32 q14, q10, #0\n" /* get pos mask */ \ - "vcgt.f32 q15, q11, #0\n" /* get pos mask */ \ - "vbif.f32 q8, q7, q14\n" /* get right offset */ \ - "vbif.f32 q9, q7, q15\n" /* get right offset */ \ - "vcgt.f32 q14, q12, #0\n" /* get pos mask */ \ - "vcgt.f32 q15, q13, #0\n" /* get pos mask */ \ - "vbif.f32 q0, q7, q14\n" /* get right offset */ \ - "vbif.f32 q1, q7, q15\n" /* get right offset */ \ - "vmla.f32 q8, q10, d12[0]\n" /* r00, mul scale to get final result */ \ - "vmla.f32 q9, q11, d12[0]\n" /* r01, mul scale to get final result */ \ - "vmla.f32 q0, q12, d12[1]\n" /* r10, mul scale to get final result */ \ - "vmla.f32 q1, q13, d12[1]\n" /* r11, mul scale to get final result */ \ - "vcvt.f32.s32 q10, q2\n" /* r20, cvt int32 to fp32*/ \ - "vcvt.f32.s32 q11, q3\n" /* r21, cvt int32 to fp32*/ \ - "vcvt.f32.s32 q12, q4\n" /* r30, cvt int32 to fp32*/ \ - "vcvt.f32.s32 q13, q5\n" /* 
r31, cvt int32 to fp32*/ \ - "vmov.f32 q2, #0.5\n" /* pos offset */ \ - "vmov.f32 q3, #0.5\n" /* pos offset */ \ - "vmov.f32 q4, #0.5\n" /* pos offset */ \ - "vmov.f32 q5, #0.5\n" /* pos offset */ \ - "vcgt.f32 q14, q10, #0\n" /* get pos mask */ \ - "vcgt.f32 q15, q11, #0\n" /* get pos mask */ \ - "vbif.f32 q2, q7, q14\n" /* get right offset */ \ - "vbif.f32 q3, q7, q15\n" /* get right offset */ \ - "vcgt.f32 q14, q12, #0\n" /* get pos mask */ \ - "vcgt.f32 q15, q13, #0\n" /* get pos mask */ \ - "vbif.f32 q4, q7, q14\n" /* get right offset */ \ - "vbif.f32 q5, q7, q15\n" /* get right offset */ \ - "vmla.f32 q2, q10, d13[0]\n" /* r20, mul scale to get final result */ \ - "vmla.f32 q3, q11, d13[0]\n" /* r21, mul scale to get final result */ \ - "vmla.f32 q4, q12, d13[1]\n" /* r30, mul scale to get final result */ \ - "vmla.f32 q5, q13, d13[1]\n" /* r31, mul scale to get final result */ \ - "vcvt.s32.f32 q6, q8\n" /* r00, fp32->int32 */ \ - "vcvt.s32.f32 q7, q9\n" /* r01, fp32->int32 */ \ - "vcvt.s32.f32 q10, q0\n" /* r10, fp32->int32 */ \ - "vcvt.s32.f32 q11, q1\n" /* r11, fp32->int32 */ \ - "vcvt.s32.f32 q12, q2\n" /* r20, fp32->int32 */ \ - "vcvt.s32.f32 q13, q3\n" /* r21, fp32->int32 */ \ - "vcvt.s32.f32 q14, q4\n" /* r30, fp32->int32 */ \ - "vcvt.s32.f32 q15, q5\n" /* r31, fp32->int32 */ \ - "vqmovn.s32 d0, q6\n" /* r00, int32 -> int16 */ \ - "vqmovn.s32 d1, q7\n" /* r01, int32 -> int16 */ \ - "vqmovn.s32 d2, q10\n" /* r10, int32 -> int16 */ \ - "vqmovn.s32 d3, q11\n" /* r11, int32 -> int16 */ \ - "vqmovn.s32 d4, q12\n" /* r00, int32 -> int16 */ \ - "vqmovn.s32 d5, q13\n" /* r01, int32 -> int16 */ \ - "vqmovn.s32 d6, q14\n" /* r10, int32 -> int16 */ \ - "vqmovn.s32 d7, q15\n" /* r11, int32 -> int16 */ \ - "vqmovn.s16 d8, q0\n" /* 0, int16 -> int8 */ \ - "vqmovn.s16 d9, q1\n" /* 1, int16 -> int8 */ \ - "vqmovn.s16 d10, q2\n" /* 2, int16 -> int8 */ \ - "vqmovn.s16 d11, q3\n" /* 3, int16 -> int8 */ \ - "vst1.32 {d8}, [%[c_ptr0]]!\n" /* write r0*/ \ - "vst1.32 {d9}, [%[c_ptr1]]!\n" /* write r1*/ \ - "vst1.32 {d10}, [%[c_ptr2]]!\n" /* write r2*/ \ - "vst1.32 {d11}, [%[c_ptr3]]!\n" /* write r3*/ - -template <> -inline void gemm_int8_kernel(const int8_t* a_ptr, const int8_t*& b_ptr, // NOLINT - const int32_t* bias, int32_t*& c_ptr0, // NOLINT - int32_t*& c_ptr1, int32_t*& c_ptr2, // NOLINT - int32_t*& c_ptr3, const float* scale, bool is_relu, // NOLINT - int k, int rem) { - asm volatile(GEMM_INT8_KERNEL GEMM_INT8_INT32_OUT - : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), - [c_ptr0] "+r"(c_ptr0), [c_ptr1] "+r"(c_ptr1), - [c_ptr2] "+r"(c_ptr2), [c_ptr3] "+r"(c_ptr3), [k] "+r"(k) - : [is_relu] "r"(is_relu), [bias] "r"(bias), [rem] "r"(rem) - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", - "q10", "q11", "q12", "q13", "q14", "q15", "r0", "cc"); -} - -template <> -inline void gemm_int8_kernel(const int8_t* a_ptr, const int8_t*& b_ptr, // NOLINT - const int32_t* bias, float*& c_ptr0, // NOLINT - float*& c_ptr1, float*& c_ptr2, float*& c_ptr3, // NOLINT - const float* scale, bool is_relu, int k, int rem) { - asm volatile(GEMM_INT8_KERNEL GEMM_INT8_FP32_OUT - : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), - [c_ptr0] "+r"(c_ptr0), [c_ptr1] "+r"(c_ptr1), - [c_ptr2] "+r"(c_ptr2), [c_ptr3] "+r"(c_ptr3), [k] "+r"(k) - : [is_relu] "r"(is_relu), [bias] "r"(bias), [rem] "r"(rem), - [scale] "r"(scale) - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", - "q10", "q11", "q12", "q13", "q14", "q15", "r0", "cc"); -} - -template <> -inline void gemm_int8_kernel(const int8_t* a_ptr, const int8_t*& 
b_ptr,  // NOLINT
-                             const int32_t* bias, int8_t*& c_ptr0,  // NOLINT
-                             int8_t*& c_ptr1, int8_t*& c_ptr2, int8_t*& c_ptr3,  // NOLINT
-                             const float* scale, bool is_relu, int k, int rem) {
-  asm volatile(GEMM_INT8_KERNEL GEMM_INT8_INT8_OUT
-               : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr),
-                 [c_ptr0] "+r"(c_ptr0), [c_ptr1] "+r"(c_ptr1),
-                 [c_ptr2] "+r"(c_ptr2), [c_ptr3] "+r"(c_ptr3), [k] "+r"(k)
-               : [is_relu] "r"(is_relu), [bias] "r"(bias), [rem] "r"(rem),
-                 [scale] "r"(scale)
-               : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
-                 "q10", "q11", "q12", "q13", "q14", "q15", "r0", "cc");
-}
-#endif  //__aarch64__  // NOLINT
-
-// gemm wrapper
-template <typename Dtype>
-void gemm_prepack_oth_int8(const int8_t* A_packed,
-                           const int8_t* B,
-                           const int* bias,
-                           Dtype* C,
-                           int M,
-                           int N,
-                           int K,
-                           bool is_bias,
-                           bool is_relu,
-                           bool is_transB,
-                           const float* scale,
-                           ARMContext* ctx) {
-  const int KUP = ROUNDUP(K, KBLOCK_INT8);
-  size_t llc_size = ctx->llc_size() / 4;
-  auto workspace = ctx->workspace_data<int8_t>();
-  int threads = ctx->threads();
-  int x_block = llc_size / (sizeof(int8_t) * (KUP + MBLOCK_INT8_OTH));
-  x_block /= NBLOCK_INT8_OTH;
-  x_block *= NBLOCK_INT8_OTH;
-  int x_num = (N + (x_block - 1)) / x_block;
-  x_block = (N + x_num - 1) / x_num;
-  x_block = (x_block + NBLOCK_INT8_OTH - 1) / NBLOCK_INT8_OTH;
-  x_block *= NBLOCK_INT8_OTH;
-  int k = K / KBLOCK_INT8;
-  int k_rem = K & (KBLOCK_INT8 - 1);
-  if (k_rem > KBLOCK_INT8 / 2) {
-    k_rem = 0;
-    k += 1;
-  }
-  int n_rem = N & (NBLOCK_INT8_OTH - 1);
-
-  auto* b_tmp = static_cast<int8_t*>(workspace);
-
-  auto* zerobuf = static_cast<int8_t*>(malloc(x_block * \
-      (sizeof(int8_t) + sizeof(Dtype))));
-  memset(zerobuf, 0, x_block * sizeof(int8_t));
-  auto* trash_ptr = reinterpret_cast<Dtype*>(zerobuf + \
-      x_block * sizeof(int8_t));
-
-  //! apanel is pre-computed outside gemm
-
-  for (unsigned int x0 = 0; x0 < N; x0 += x_block) {
-    unsigned int xmax = x0 + x_block;
-    bool flag_rem = false;
-    if (xmax >= N) {
-      xmax = N;
-      flag_rem = n_rem > 0;
-    }
-    int bblocks = (xmax - x0 + NBLOCK_INT8_OTH - 1) / NBLOCK_INT8_OTH;
-    //! load bpanel
-    int8_t* b_pannel = b_tmp;
-    if (is_transB) {
-      packb_trans_int8(b_pannel, B, K, 0, K, x0, xmax, zerobuf);
-    } else {
-      packb_int8(b_pannel, B, N, 0, K, x0, xmax, zerobuf);
-    }
-
-#pragma omp parallel for num_threads(threads)
-    for (unsigned int y = 0; y < M; y += MBLOCK_INT8_OTH) {
-      Dtype out0[NBLOCK_INT8_OTH] = {0};
-      Dtype out1[NBLOCK_INT8_OTH] = {0};
-      Dtype out2[NBLOCK_INT8_OTH] = {0};
-      Dtype out3[NBLOCK_INT8_OTH] = {0};
-      Dtype* c_ptr0 = C + y * N + x0;
-      Dtype* c_ptr1 = c_ptr0 + N;
-      Dtype* c_ptr2 = c_ptr1 + N;
-      Dtype* c_ptr3 = c_ptr2 + N;
-      Dtype* tmp0 = nullptr;
-      Dtype* tmp1 = nullptr;
-      Dtype* tmp2 = nullptr;
-      Dtype* tmp3 = nullptr;
-      float32_t scale_local[4];
-      int32_t bias_local[4] = {0, 0, 0, 0};
-      if (is_bias) {
-        bias_local[0] = bias[y];
-        bias_local[1] = bias[y + 1];
-        bias_local[2] = bias[y + 2];
-        bias_local[3] = bias[y + 3];
-      }
-      if (scale) {
-        scale_local[0] = scale[y];
-        scale_local[1] = scale[y + 1];
-        scale_local[2] = scale[y + 2];
-        scale_local[3] = scale[y + 3];
-      }
-      if (y + MBLOCK_INT8_OTH > M) {
-        switch (y + MBLOCK_INT8_OTH - M) {
-          case 3:
-            c_ptr1 = trash_ptr;
-          case 2:
-            c_ptr2 = trash_ptr;
-          case 1:
-            c_ptr3 = trash_ptr;
-          default:
-            break;
-        }
-      }
-      const int8_t* a_ptr_l = A_packed + y * KUP;
-      const int8_t* b_ptr = b_pannel;
-      for (int xb = 0; xb < bblocks; xb++) {
-        if (flag_rem && (xb == bblocks - 1)) {
-          tmp0 = c_ptr0;
-          tmp1 = c_ptr1;
-          tmp2 = c_ptr2;
-          tmp3 = c_ptr3;
-          c_ptr0 = out0;
-          c_ptr1 = out1;
-          c_ptr2 = out2;
-          c_ptr3 = out3;
-        }
-        gemm_int8_kernel(a_ptr_l, b_ptr, bias_local,
-                         c_ptr0, c_ptr1, c_ptr2, c_ptr3,
-                         scale_local, is_relu, k, k_rem);
-        if (flag_rem && (xb == bblocks - 1)) {
-          for (int i = 0; i < n_rem; ++i) {
-            *(tmp0++) = out0[i];
-            *(tmp1++) = out1[i];
-            *(tmp2++) = out2[i];
-            *(tmp3++) = out3[i];
-          }
-        }
-      }
-    }
-  }
-  free(zerobuf);
-}
-
-/***********************************************************************/
-// prepack A according to gemm kernel
-// A block size: (<4x2>x1) x2, with unroll=2 can be described as below:
-// origin A data:
-// A_origin(no trans, m x k):
-// r0: ==> a0, b0, c0, d0, e0, f0, g0, h0
-// r1: ==> a1, b1, c1, d1, e1, f1, g1, h1
-// r2: ==> a2, b2, c2, d2, e2, f2, g2, h2
-// r3: ==> a3, b3, c3, d3, e3, f3, g3, h3
-// packed A
-// a0,b0, a1,b1, a2,b2, a3,b3;
-// c0,d0, c1,d1, c2,d2, c3,d3;
-// e0,f0, e1,f1, e2,f2, e3,f3;
-// g0,h0, g1,h1, g2,h2, g3,h3;
-/***********************************************************************/
-void prepackA_m4k2x2_int8(int8_t* out, const int8_t* in, const int ldin,
-                          const int m0, const int mmax, const int k0,
-                          const int kmax) {
-  int y_len = mmax - m0;
-  int x_len = kmax - k0;
-  int x_len_roundup = ROUNDUP(x_len, KBLOCK_INT8);
-  auto zerobuff = static_cast<int8_t*>(malloc(x_len_roundup * sizeof(char)));
-  memset(zerobuff, 0, sizeof(char) * x_len_roundup);
-
-  const int8_t* inptr = in + m0 * ldin + k0;
-  uint8_t remain = static_cast<uint8_t>(x_len & (KBLOCK_INT8 - 1));
-
-#pragma omp parallel for
-  for (int y = 0; y < y_len; y += MBLOCK_INT8_OTH) {
-    const int8_t* ptr0 = inptr + y * ldin;
-    const int8_t* ptr1 = ptr0 + ldin;
-    const int8_t* ptr2 = ptr1 + ldin;
-    const int8_t* ptr3 = ptr2 + ldin;
-    //!
cope with row index exceed real size, set to zero buffer - if ((y + MBLOCK_INT8_OTH) > y_len) { - switch ((y + MBLOCK_INT8_OTH) - y_len) { - case 3: - ptr1 = zerobuff; - case 2: - ptr2 = zerobuff; - case 1: - ptr3 = zerobuff; - default: - break; - } - } - int8_t* ptr_out = out + y * x_len_roundup; - int i = 0; - for (; i < x_len + 1 - 2 * KBLOCK_INT8; i += 2 * KBLOCK_INT8) { -#ifdef __aarch64__ - asm volatile( - "ld1 {v0.8b}, [%[ptr0]], #8\n" /* load r0, 8 int8 */ - "ld1 {v1.8b}, [%[ptr1]], #8\n" /* load r1, 8 int8 */ - "ld1 {v2.8b}, [%[ptr2]], #8\n" /* load r2, 8 int8 */ - "ld1 {v3.8b}, [%[ptr3]], #8\n" /* load r3, 8 int8 */ - "trn1 v4.4h, v0.4h, v1.4h\n" /* get a0,b0, a2,b2 */ - "trn2 v5.4h, v0.4h, v1.4h\n" /* get a1,b1, a3,b3 */ - "trn1 v6.4h, v2.4h, v3.4h\n" /* get c0,d0, c2,d2 */ - "trn2 v7.4h, v2.4h, v3.4h\n" /* get c1,d1, c3,d3 */ - "trn1 v0.2s, v4.2s, v6.2s\n" /* get a0,b0, c0,d0 */ - "trn2 v2.2s, v4.2s, v6.2s\n" /* get a2,b2, c2,d2 */ - "trn1 v1.2s, v5.2s, v7.2s\n" /* get a1,b1, c1,d1 */ - "trn2 v3.2s, v5.2s, v7.2s\n" /* get a3,b3, c3,d3 */ - "st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[ptr_out]], #32\n" /* write - out*/ - : [ptr_out] "+r"(ptr_out), [ptr0] "+r"(ptr0), [ptr1] "+r"(ptr1), - [ptr2] "+r"(ptr2), [ptr3] "+r"(ptr3) - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "cc", "memory"); -#else // armv7 - asm volatile( - "vld1.8 {d0}, [%[ptr0]]!\n" /* load r0, 8 int8, - a0,b0,c0,d0,e0,f0,g0,h0 */ - "vld1.8 {d1}, [%[ptr1]]!\n" /* load r1, 8 int8, - a1,b1,c1,d1,e1,f1,g1,h1 */ - "vld1.8 {d2}, [%[ptr2]]!\n" /* load r2, 8 int8, - a2,b2,c2,d2,e2,f2,g2,h2 */ - "vld1.8 {d3}, [%[ptr3]]!\n" /* load r3, 8 int8, - a3,b3,c3,d3,e3,f3,g3,h3 */ - "vtrn.16 d0, d1\n" /* trans, d0: a0,b0,a1,b1, e0,f0,e1,f1; d1: - c0,d0,c1,d1, g0,h0,g1,h1 */ - "vtrn.16 d2, d3\n" /* trans, d2: a2,b2,a3,b3, e2,f2,e3,f3; d3: - c2,d2,c3,d3, g2,h2,g3,h3 */ - "vtrn.32 d0, d2\n" /* trans, d0: a0,b0,a1,b1, a2,b2,a3,b3; d2: - e0,f0,e1,f1, e2,f2,e3,f3 */ - "vtrn.32 d1, d3\n" /* trans, d1: c0,d0,c1,d1, e2,f2,e3,f3; d3: - g0,h0,g1,h1, g2,h2,g3,h3 */ - "vst1.32 {d0-d3}, [%[outptr]]!\n" /* write to output ptr */ - : [outptr] "+r"(ptr_out), [ptr0] "+r"(ptr0), [ptr1] "+r"(ptr1), - [ptr2] "+r"(ptr2), [ptr3] "+r"(ptr3) - : - : "q0", "q1", "cc", "memory"); -#endif //__aarch64 // NOLINT - } - if (i + KBLOCK_INT8 <= x_len) { - ptr_out[0] = ptr0[0]; - ptr_out[1] = ptr0[1]; - ptr_out[2] = ptr1[0]; - ptr_out[3] = ptr1[1]; - ptr_out[4] = ptr2[0]; - ptr_out[5] = ptr2[1]; - ptr_out[6] = ptr3[0]; - ptr_out[7] = ptr3[1]; - // unroll - ptr_out[8] = ptr0[2]; - ptr_out[9] = ptr0[3]; - ptr_out[10] = ptr1[2]; - ptr_out[11] = ptr1[3]; - ptr_out[12] = ptr2[2]; - ptr_out[13] = ptr2[3]; - ptr_out[14] = ptr3[2]; - ptr_out[15] = ptr3[3]; - ptr_out += 16; - ptr0 += 4; - ptr1 += 4; - ptr2 += 4; - ptr3 += 4; - } - switch (remain) { - case 0: - break; - case 1: - ptr_out[0] = ptr0[0]; - ptr_out[1] = 0; - ptr_out[2] = ptr1[0]; - ptr_out[3] = 0; - ptr_out[4] = ptr2[0]; - ptr_out[5] = 0; - ptr_out[6] = ptr3[0]; - ptr_out[7] = 0; - // unroll - ptr_out[8] = 0; - ptr_out[9] = 0; - ptr_out[10] = 0; - ptr_out[11] = 0; - ptr_out[12] = 0; - ptr_out[13] = 0; - ptr_out[14] = 0; - ptr_out[15] = 0; - ptr_out += 16; - break; - case 2: - ptr_out[0] = ptr0[0]; - ptr_out[1] = ptr0[1]; - ptr_out[2] = ptr1[0]; - ptr_out[3] = ptr1[1]; - ptr_out[4] = ptr2[0]; - ptr_out[5] = ptr2[1]; - ptr_out[6] = ptr3[0]; - ptr_out[7] = ptr3[1]; - // unroll - ptr_out[8] = 0; - ptr_out[9] = 0; - ptr_out[10] = 0; - ptr_out[11] = 0; - ptr_out[12] = 0; - ptr_out[13] = 0; - ptr_out[14] = 0; - 
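      // Editor's note (illustration only, not from the original source):
      // the zero stores in these remainder cases pad the final k-block out
      // to KBLOCK_INT8 so the kernel always consumes complete <4x2>x2
      // packets; the padded zeros contribute nothing to the int8 dot
      // product, so the result is unchanged. A scalar view of the packed-A
      // index, assuming row-major input a[row][k]:
      //   packed[(kpair * 4 + row) * 2 + lane] = a[row][2 * kpair + lane];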
ptr_out[15] = 0; - ptr_out += 16; - break; - case 3: - ptr_out[0] = ptr0[0]; - ptr_out[1] = ptr0[1]; - ptr_out[2] = ptr1[0]; - ptr_out[3] = ptr1[1]; - ptr_out[4] = ptr2[0]; - ptr_out[5] = ptr2[1]; - ptr_out[6] = ptr3[0]; - ptr_out[7] = ptr3[1]; - // unroll - ptr_out[8] = ptr0[2]; - ptr_out[9] = 0; - ptr_out[10] = ptr1[2]; - ptr_out[11] = 0; - ptr_out[12] = ptr2[2]; - ptr_out[13] = 0; - ptr_out[14] = ptr3[2]; - ptr_out[15] = 0; - ptr_out += 16; - break; - default: - break; - } - } - free(zerobuff); -} - -/***************************************************************************/ -// prepack A according to gemm kernel -// A block size: <4x2>x2, unroll x4, can be described as below: -// origin A data: -// A_origin(no trans, k x m): -// r0: ==> a0, a1, a2, a3 .... a12, a13, a14, a15 -// r1: ==> b0, b1, b2, b3 .... b12, b13, b14, b15 -// r2: ==> c0, c1, c2, c3 .... c12, c13, c14, c15 -// r3: ==> d0, d1, d2, d3 .... d12, d13, d14, d15 -// packed A: -// a0,b0, a1,b1, a2,b2, a3,b3; -// c0,d0, c1,d1, c2,d2, c3,d3;----block0 -// a4,b4, a5,b5, a6,b6, a7,b7; -// c4,d4, c5,d5, c6,d6, c7,d7;----block1 -// a8,b8, a9,b9, a10,b10, a11,b11; -// c8,d8, c9,d9, c10,d10, c11,d11;----block2 -// a12,b12, a13,b13, a14,b14, a15,b15; -// c12,d12, c13,d13, c14,d14, c15,d15;----block3 -/***************************************************************************/ -void prepackA_m4k2x2_trans_int8(int8_t* out, const int8_t* in, const int ldin, - const int m0, const int mmax, const int k0, - const int kmax) { - int xlen = mmax - m0; - int ylen = kmax - k0; - int ylen_roundup = ROUNDUP(ylen, KBLOCK_INT8); - int xlen_roundup = ROUNDUP(xlen, MBLOCK_INT8_OTH); - - const int MUNROLL = 4; - int mcnt = xlen / (MUNROLL * MBLOCK_INT8_OTH); - int x_rem = xlen & (MUNROLL * MBLOCK_INT8_OTH - 1); - int m_rem = (x_rem + MBLOCK_INT8_OTH - 1) / MBLOCK_INT8_OTH; - - const uint8_t mask_buffer[16] = {0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15}; - int8x16_t vzero = vdupq_n_s8(0); - uint8x16_t vmask = vcltq_u8(vld1q_u8(mask_buffer), vdupq_n_u8(x_rem)); - - int stride_out = ylen_roundup * MBLOCK_INT8_OTH; - - int8_t* zerobuf = static_cast(malloc(xlen_roundup)); - memset(zerobuf, 0, xlen_roundup); - - const int8_t* inr = in + ldin * k0 + m0; -#pragma omp parallel for - for (int y = 0; y < ylen; y += KBLOCK_INT8) { - const int8_t* ptr0 = inr + y * ldin; - const int8_t* ptr1 = ptr0 + ldin; - const int8_t* ptr2 = ptr1 + ldin; - const int8_t* ptr3 = ptr2 + ldin; - int8_t* ptr_out = out + MBLOCK_INT8_OTH * y; - if (y + KBLOCK_INT8 > ylen) { - switch (y + KBLOCK_INT8 - ylen) { - case 3: - ptr1 = zerobuf; - case 2: - ptr2 = zerobuf; - case 1: - ptr3 = zerobuf; - default: - break; - } - } - int k = mcnt; - int rem = m_rem; -#ifdef __aarch64__ - asm volatile( - "ld1 {v0.16b}, [%[ptr0]], #16\n" /* load r0 */ - "ld1 {v1.16b}, [%[ptr1]], #16\n" /* load r1 */ - "ld1 {v2.16b}, [%[ptr2]], #16\n" /* load r2 */ - "ld1 {v3.16b}, [%[ptr3]], #16\n" /* load r3 */ - "cbz %w[k], 1f\n" /* jump to remain */ - "0:\n" /* main loop */ - /* trans 16b */ - "trn1 v4.16b, v0.16b, v1.16b\n" /* get a0,b0, a2,b2, a4,b4, a6,b6, - a8,b8, a10,b10, a12,b12, a14,b14 */ - "trn2 v5.16b, v0.16b, v1.16b\n" /* get a1,b1, a3,b3, a5,b5, a7,b7, - a9,b9, a11,b11, a13,b13, a15,b15 */ - "trn1 v6.16b, v2.16b, v3.16b\n" /* get c0,d0, c2,d2, c4,d4, c6,d6, - c8,d8, c10,d10, c12,d12, c14,d14 */ - "trn2 v7.16b, v2.16b, v3.16b\n" /* get c1,d1, c3,d3, c5,d5, c7,d7, - c9,d9, c11,d11, c13,d13, c15,d15 */ - "ld1 {v0.16b}, [%[ptr0]], #16\n" /* load r0 */ - "ld1 {v1.16b}, [%[ptr1]], #16\n" /* 
load r1 */ - "subs %w[k], %w[k], #1\n" /* loop cnt -1 */ - /* trans 8h */ - "trn1 v8.8h, v4.8h, v5.8h\n" /* get a0,b0, a1,b1, a4,b4, a5,b5, a8,b8, - a9,b9, a12,b12, a13,b13 */ - "trn2 v9.8h, v4.8h, v5.8h\n" /* get a2,b2, a3,b3, a6,b6, a7,b7, - a10,b10, a11,b11, a14,b14, a15,b15 */ - "trn1 v10.8h, v6.8h, v7.8h\n" /* get c0,d0, c1,d1, c4,d4, c5,d5, - c8,d8, c9,d9, c12,d12, c13,d13 */ - "trn2 v11.8h, v6.8h, v7.8h\n" /* get c2,d2, c3,d3, c6,d6, c7,d7, - c10,d10, c11,d11, c14,d14, c15,d15 */ - /* trans 4s */ - "ld1 {v2.16b}, [%[ptr2]], #16\n" /* load r2 */ - "trn1 v4.4s, v8.4s, v9.4s\n" /* get a0,b0, a1,b1, a2,b2, a3,b3, a8,b8, - a9,b9, a10,b10, a11,b11 */ - "trn2 v5.4s, v8.4s, v9.4s\n" /* get a4,b4, a5,b5, a6,b6, a7,b7, - a12,b12, a13,b13, a14,b14, a15,b15 */ - "trn1 v6.4s, v10.4s, v11.4s\n" /* get c0,d0, c1,d1, c2,d2, c3,d3, - c8,d8, c9,d9, c10,d10, c11,d11 */ - "trn2 v7.4s, v10.4s, v11.4s\n" /* get c4,d4, c5,d5, c6,d6, c7,d7, - c12,d12, c13,d13, c14,d14, c15,d15 - */ - /* trans 2d */ - "ld1 {v3.16b}, [%[ptr3]], #16\n" /* load r3 */ - "trn1 v8.2d, v4.2d, v6.2d\n" /* get a0,b0, a1,b1, a2,b2, a3,b3, c0,d0, - c1,d1, c2,d2, c3,d3 */ - "trn1 v9.2d, v5.2d, v7.2d\n" /* get a4,b4, a5,b5, a6,b6, a7,b7, c4,d4, - c5,d5, c6,d6, c7,d7 */ - "trn2 v10.2d, v4.2d, v6.2d\n" /* get a8,b8, a9,b9, a10,b10, a11,b11, - c8,d8, c9,d9, c10,d10, c11,d11 */ - "trn2 v11.2d, v5.2d, v7.2d\n" /* get a12,b12, a13,b13, a14,b14, - a15,b15, c12,d12, c13,d13, c14,d14, - c15,d15 */ - "st1 {v8.16b}, [%[ptr_out]], %[stride]\n" /* write block0, address + - stride */ - "st1 {v9.16b}, [%[ptr_out]], %[stride]\n" /* write block1, address + - stride */ - "st1 {v10.16b}, [%[ptr_out]], %[stride]\n" /* write block2, address + - stride */ - "st1 {v11.16b}, [%[ptr_out]], %[stride]\n" /* write block3, address + - stride */ - "bgt 0b\n" /* jump to main loop */ - "1:\n" /* process remain */ - "cbz %w[rem], 2f\n" /* skip to remain */ - /* bit select */ - "bif v0.16b, %[vzero].16b, %[mask].16b\n" /* pad 0 */ - "bif v1.16b, %[vzero].16b, %[mask].16b\n" /* pad 0 */ - "bif v2.16b, %[vzero].16b, %[mask].16b\n" /* pad 0 */ - "bif v3.16b, %[vzero].16b, %[mask].16b\n" /* pad 0 */ - /* trans 16b */ - "trn1 v4.16b, v0.16b, v1.16b\n" /* get a0,b0, a2,b2, a4,b4, a6,b6, - a8,b8, a10,b10, a12,b12, a14,b14 */ - "trn2 v5.16b, v0.16b, v1.16b\n" /* get a1,b1, a3,b3, a5,b5, a7,b7, - a9,b9, a11,b11, a13,b13, a15,b15 */ - "trn1 v6.16b, v2.16b, v3.16b\n" /* get c0,d0, c2,d2, c4,d4, c6,d6, - c8,d8, c10,d10, c12,d12, c14,d14 */ - "trn2 v7.16b, v2.16b, v3.16b\n" /* get c1,d1, c3,d3, c5,d5, c7,d7, - c9,d9, c11,d11, c13,d13, c15,d15 */ - /* trans 8h */ - "trn1 v8.8h, v4.8h, v5.8h\n" /* get a0,b0, a1,b1, a4,b4, a5,b5, a8,b8, - a9,b9, a12,b12, a13,b13 */ - "trn2 v9.8h, v4.8h, v5.8h\n" /* get a2,b2, a3,b3, a6,b6, a7,b7, - a10,b10, a11,b11, a14,b14, a15,b15 */ - "trn1 v10.8h, v6.8h, v7.8h\n" /* get c0,d0, c1,d1, c4,d4, c5,d5, - c8,d8, c9,d9, c12,d12, c13,d13 */ - "trn2 v11.8h, v6.8h, v7.8h\n" /* get c2,d2, c3,d3, c6,d6, c7,d7, - c10,d10, c11,d11, c14,d14, c15,d15 */ - /* trans 4s */ - "trn1 v4.4s, v8.4s, v9.4s\n" /* get a0,b0, a1,b1, a2,b2, a3,b3, a8,b8, - a9,b9, a10,b10, a11,b11 */ - "trn2 v5.4s, v8.4s, v9.4s\n" /* get a4,b4, a5,b5, a6,b6, a7,b7, - a12,b12, a13,b13, a14,b14, a15,b15 */ - "trn1 v6.4s, v10.4s, v11.4s\n" /* get c0,d0, c1,d1, c2,d2, c3,d3, - c8,d8, c9,d9, c10,d10, c11,d11 */ - "trn2 v7.4s, v10.4s, v11.4s\n" /* get c4,d4, c5,d5, c6,d6, c7,d7, - c12,d12, c13,d13, c14,d14, c15,d15 - */ - /* trans 2d */ - "trn1 v8.2d, v4.2d, v6.2d\n" /* get a0,b0, a1,b1, a2,b2, 
a3,b3, c0,d0, - c1,d1, c2,d2, c3,d3 */ - "trn1 v9.2d, v5.2d, v7.2d\n" /* get a4,b4, a5,b5, a6,b6, a7,b7, c4,d4, - c5,d5, c6,d6, c7,d7 */ - "trn2 v10.2d, v4.2d, v6.2d\n" /* get a8,b8, a9,b9, a10,b10, a11,b11, - c8,d8, c9,d9, c10,d10, c11,d11 */ - "trn2 v11.2d, v5.2d, v7.2d\n" /* get a12,b12, a13,b13, a14,b14, - a15,b15, c12,d12, c13,d13, c14,d14, - c15,d15 */ - /* check remain size */ - "subs %w[rem], %w[rem], #1\n" /* check remain num */ - "st1 {v8.16b}, [%[ptr_out]], %[stride]\n" /* write 0 */ - "beq 2f\n" /* remain = 1 */ - "subs %w[rem], %w[rem], #1\n" /* check remain num */ - "st1 {v9.16b}, [%[ptr_out]], %[stride]\n" /* write 1 */ - "beq 2f\n" /* remain = 2 */ - "subs %w[rem], %w[rem], #1\n" /* check remain num */ - "st1 {v10.16b}, [%[ptr_out]], %[stride]\n" /* write 2 */ - "beq 2f\n" /* remain = 3 */ - "st1 {v11.16b}, [%[ptr_out]]\n" /* write 3 */ - /* end */ - "2:\n" /* end */ - : [ptr0] "+r"(ptr0), [ptr1] "+r"(ptr1), [ptr2] "+r"(ptr2), - [ptr3] "+r"(ptr3), [k] "+r"(k), [rem] "+r"(rem), - [ptr_out] "+r"(ptr_out) - : [mask] "w"(vmask), [vzero] "w"(vzero), [stride] "r"(stride_out) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "cc"); -#else // armv7 - asm volatile( - "vld1.8 {d0-d1}, [%[ptr0]]!\n" /* load r0 */ - "vld1.8 {d2-d3}, [%[ptr1]]!\n" /* load r1 */ - "vld1.8 {d4-d5}, [%[ptr2]]!\n" /* load r2 */ - "vld1.8 {d6-d7}, [%[ptr3]]!\n" /* load r3 */ - "cmp %[k], #0\n" /* check main loop */ - "beq 1f\n" /* jump to remain */ - "0:\n" /* main loop */ - /* trans 16b */ - "vtrn.8 q0, q1\n" /* get q0: a0,b0, a2,b2, a4,b4, a6,b6, a8,b8, a10,b10, - a12,b12, a14,b14; q1: a1,b1, a3,b3, a5,b5, a7,b7, - a9,b9, a11,b11, a13,b13, a15,b15 */ - "vtrn.8 q2, q3\n" /* get q2: c0,d0, c2,d2, c4,d4, c6,d6, c8,d8, c10,d10, - c12,d12, c14,d14; q3: c0,d0, c2,d2, c4,d4, c6,d6, - c8,d8, c10,d10, c12,d12, c14,d14 */ - "subs %[k], %[k], #1\n" /* loop cnt -1 */ - /* trans 8h */ - "vtrn.16 q0, q1\n" /* get q0: a0,b0, a1,b1, a4,b4, a5,b5, a8,b8, - a9,b9, a12,b12, a13,b13; q1: a2,b2, a3,b3, - a6,b6, a7,b7, a10,b10, a11,b11, a14,b14, - a15,b15 */ - "vtrn.16 q2, q3\n" /* get q2: c0,d0, c1,d1, c4,d4, c5,d5, c8,d8, - c9,d9, c12,d12, c13,d13; q3: c2,d2, c3,d3, - c6,d6, c7,d7, c10,d10, c11,d11, c14,d14, - c15,d15 */ - /* trans 4s */ - "vtrn.32 q0, q1\n" /* get q0: a0,b0, a1,b1, a2,b2, a3,b3, a8,b8, - a9,b9, a10,b10, a11,b11; q1: a4,b4, a5,b5, - a6,b6, a7,b7, a12,b12, a13,b13, a14,b14, - a15,b15 */ - "vtrn.32 q2, q3\n" /* get q2: c0,d0, c1,d1, c2,d2, c3,d3, c8,d8, - c9,d9, c10,d10, c11,d11; q3: c4,d4, c5,d5, - c6,d6, c7,d7, c12,d12, c13,d13, c14,d14, - c15,d15 */ - /* trans 2d */ - "vswp d1, d4\n" /* get q0: a0,b0, a1,b1, a2,b2, a3,b3, c0,d0, c1,d1, - c2,d2, c3,d3; q2: a8,b8, a9,b9, a10,b10, a11,b11, - c8,d8, c9,d9, c10,d10, c11,d11 */ - "vswp d3, d6\n" /* get q1: a4,b4, a5,b5, a6,b6, a7,b7, c4,d4, c5,d5, - c6,d6, c7,d7; q3: a12,b12, a13,b13, a14,b14, - a15,b15, c12,d12, c13,d13, c14,d14, c15,d15 */ - "vst1.8 {d0-d1}, [%[ptr_out]], %[stride]\n" /* write block0, address + - stride */ - "vst1.8 {d2-d3}, [%[ptr_out]], %[stride]\n" /* write block1, address + - stride */ - "vst1.8 {d4-d5}, [%[ptr_out]], %[stride]\n" /* write block2, address + - stride */ - "vst1.8 {d6-d7}, [%[ptr_out]], %[stride]\n" /* write block3, address + - stride */ - "vld1.8 {d0-d1}, [%[ptr0]]!\n" /* load r0 */ - "vld1.8 {d2-d3}, [%[ptr1]]!\n" /* load r1 */ - "vld1.8 {d4-d5}, [%[ptr2]]!\n" /* load r2 */ - "vld1.8 {d6-d7}, [%[ptr3]]!\n" /* load r3 */ - "bgt 0b\n" /* jump to main loop */ - "1:\n" /* process remain */ - "cmp 
%[rem], #0\n" /* check remain */ - "beq 2f\n" /* skip to remain */ - /* bit select */ - "vbif q0, %q[vzero], %q[mask]\n" /* pad 0 */ - "vbif q1, %q[vzero], %q[mask]\n" /* pad 0 */ - "vbif q2, %q[vzero], %q[mask]\n" /* pad 0 */ - "vbif q3, %q[vzero], %q[mask]\n" /* pad 0 */ - /* trans 16b */ - "vtrn.8 q0, q1\n" /* get q0: a0,b0, a2,b2, a4,b4, a6,b6, a8,b8, a10,b10, - a12,b12, a14,b14; q1: a1,b1, a3,b3, a5,b5, a7,b7, - a9,b9, a11,b11, a13,b13, a15,b15 */ - "vtrn.8 q2, q3\n" /* get q2: c0,d0, c2,d2, c4,d4, c6,d6, c8,d8, c10,d10, - c12,d12, c14,d14; q3: c0,d0, c2,d2, c4,d4, c6,d6, - c8,d8, c10,d10, c12,d12, c14,d14 */ - /* trans 8h */ - "vtrn.16 q0, q1\n" /* get q0: a0,b0, a1,b1, a4,b4, a5,b5, a8,b8, - a9,b9, a12,b12, a13,b13; q1: a2,b2, a3,b3, - a6,b6, a7,b7, a10,b10, a11,b11, a14,b14, - a15,b15 */ - "vtrn.16 q2, q3\n" /* get q2: c0,d0, c1,d1, c4,d4, c5,d5, c8,d8, - c9,d9, c12,d12, c13,d13; q3: c2,d2, c3,d3, - c6,d6, c7,d7, c10,d10, c11,d11, c14,d14, - c15,d15 */ - /* trans 4s */ - "vtrn.32 q0, q1\n" /* get q0: a0,b0, a1,b1, a2,b2, a3,b3, a8,b8, - a9,b9, a10,b10, a11,b11; q1: a4,b4, a5,b5, - a6,b6, a7,b7, a12,b12, a13,b13, a14,b14, - a15,b15 */ - "vtrn.32 q2, q3\n" /* get q2: c0,d0, c1,d1, c2,d2, c3,d3, c8,d8, - c9,d9, c10,d10, c11,d11; q3: c4,d4, c5,d5, - c6,d6, c7,d7, c12,d12, c13,d13, c14,d14, - c15,d15 */ - /* trans 2d */ - "vswp d1, d4\n" /* get q0: a0,b0, a1,b1, a2,b2, a3,b3, c0,d0, c1,d1, - c2,d2, c3,d3; q2: a8,b8, a9,b9, a10,b10, a11,b11, - c8,d8, c9,d9, c10,d10, c11,d11 */ - "vswp d3, d6\n" /* get q1: a4,b4, a5,b5, a6,b6, a7,b7, c4,d4, c5,d5, - c6,d6, c7,d7; q3: a12,b12, a13,b13, a14,b14, - a15,b15, c12,d12, c13,d13, c14,d14, c15,d15 */ - /* check remain size */ - "subs %[rem], %[rem], #1\n" /* check remain num */ - "vst1.8 {d0-d1}, [%[ptr_out]], %[stride]\n" /* write 0 */ - "beq 2f\n" /* remain = 1 */ - "subs %[rem], %[rem], #1\n" /* check remain num */ - "vst1.8 {d2-d3}, [%[ptr_out]], %[stride]\n" /* write 1 */ - "beq 2f\n" /* remain = 2 */ - "subs %[rem], %[rem], #1\n" /* check remain num */ - "vst1.8 {d4-d5}, [%[ptr_out]], %[stride]\n" /* write 2 */ - "beq 2f\n" /* remain = 3 */ - "vst1.8 {d6-d7}, [%[ptr_out]], %[stride]\n" /* write 3 */ - /* end */ - "2:\n" /* end */ - : [ptr0] "+r"(ptr0), [ptr1] "+r"(ptr1), [ptr2] "+r"(ptr2), - [ptr3] "+r"(ptr3), [k] "+r"(k), [rem] "+r"(rem), - [ptr_out] "+r"(ptr_out) - : [mask] "w"(vmask), [vzero] "w"(vzero), [stride] "r"(stride_out) - : "q0", "q1", "q2", "q3", "cc"); -#endif //__aarch64__ // NOLINT - } - free(zerobuf); -} - -/**************************************************************************/ -// for armv8 -// prepack B according to gemm kernel -// B block size: (<4x2>x4) x2, can be described as below: -// origin B data: -// B_origin(no trans, k x n): -// r0: ==> a0, a1, a2, a3 .... a12, a13, a14, a15 -// r1: ==> b0, b1, b2, b3 .... b12, b13, b14, b15 -// r2: ==> c0, c1, c2, c3 .... c12, c13, c14, c15 -// r3: ==> d0, d1, d2, d3 .... d12, d13, d14, d15 -// packed B: -// a0,b0, a1,b1, a2,b2, a3,b3; -// c0,d0, c1,d1, c2,d2, c3,d3; -// . -// . -// . 
-// a12,b12, a13,b13, a14,b14, a15,b15; -// c12,d12, c13,d13, c14,d14, c15,d15; -// for armv7 -// prepack B according to gemm kernel -// B block size: (<4x2>x4) x2, can be described as below: -// origin B data: -// B_origin(no trans, k x n): -// r0: ==> a0, a1, a2, a3, a4, a5, a6, a7 -// r1: ==> b0, b1, b2, b3, b4, b5, b6, b7 -// r2: ==> c0, c1, c2, c3, c4, c5, c6, c7 -// r3: ==> d0, d1, d2, d3, d4, d5, d6, d7 -// packed B: -// a0,b0, a1,b1, a2,b2, a3,b3; -// a4,b4, a5,b5, a6,b6, a7,b7; -// c0,d0, c1,d1, c2,d2, c3,d3; -// c4,d4, c5,d5, c6,d6, c7,d7; -/***************************************************************************/ -void packb_int8(int8_t* out, const int8_t* in, const int ldin, const int k0, - const int kmax, const int n0, const int nmax, - const int8_t* zerobuf) { - const int8_t* inptr = in + k0 * ldin + n0; - const uint8_t mask_buffer[16] = {0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15}; - int x_len = nmax - n0; - int y_len = kmax - k0; - int kup = ROUNDUP(y_len, KBLOCK_INT8); - int kcnt = x_len / NBLOCK_INT8_OTH; - int rem = x_len & (NBLOCK_INT8_OTH - 1); - int stride_out = NBLOCK_INT8_OTH * kup; - - int8x16_t vzero = vdupq_n_s8(0); - uint8x16_t vmask = vcltq_u8(vld1q_u8(mask_buffer), vdupq_n_u8(rem)); -#pragma omp parallel for - for (int y = 0; y < y_len; y += KBLOCK_INT8) { - const int8_t* ptr0 = inptr + y * ldin; - const int8_t* ptr1 = ptr0 + ldin; - const int8_t* ptr2 = ptr1 + ldin; - const int8_t* ptr3 = ptr2 + ldin; - if (y + KBLOCK_INT8 > y_len) { - switch (y + KBLOCK_INT8 - y_len) { - case 3: - ptr1 = zerobuf; - case 2: - ptr2 = zerobuf; - case 1: - ptr3 = zerobuf; - default: - break; - } - } - int8_t* outptr_row_col = out + y * NBLOCK_INT8_OTH; - int k = kcnt; -#ifdef __aarch64__ - asm volatile( - "ld1 {v0.16b}, [%[ptr0]], #16\n" /* load r0 */ - "ld1 {v1.16b}, [%[ptr1]], #16\n" /* load r1 */ - "ld1 {v2.16b}, [%[ptr2]], #16\n" /* load r2 */ - "ld1 {v3.16b}, [%[ptr3]], #16\n" /* load r3 */ - "cbz %w[k], 1f\n" /* jump to remain */ - "0:\n" /* main loop */ - /* trans 16b */ - "trn1 v4.16b, v0.16b, v1.16b\n" /* get a0,b0, a2,b2, a4,b4, a6,b6, - a8,b8, a10,b10, a12,b12, a14,b14 */ - "trn2 v5.16b, v0.16b, v1.16b\n" /* get a1,b1, a3,b3, a5,b5, a7,b7, - a9,b9, a11,b11, a13,b13, a15,b15 */ - "trn1 v6.16b, v2.16b, v3.16b\n" /* get c0,d0, c2,d2, c4,d4, c6,d6, - c8,d8, c10,d10, c12,d12, c14,d14 */ - "trn2 v7.16b, v2.16b, v3.16b\n" /* get c1,d1, c3,d3, c5,d5, c7,d7, - c9,d9, c11,d11, c13,d13, c15,d15 */ - "ld1 {v0.16b}, [%[ptr0]], #16\n" /* load r0 */ - "ld1 {v1.16b}, [%[ptr1]], #16\n" /* load r1 */ - "subs %w[k], %w[k], #1\n" /* loop cnt -1 */ - /* trans 8h */ - "trn1 v8.8h, v4.8h, v5.8h\n" /* get a0,b0, a1,b1, a4,b4, a5,b5, a8,b8, - a9,b9, a12,b12, a13,b13 */ - "trn2 v9.8h, v4.8h, v5.8h\n" /* get a2,b2, a3,b3, a6,b6, a7,b7, - a10,b10, a11,b11, a14,b14, a15,b15 */ - "trn1 v10.8h, v6.8h, v7.8h\n" /* get c0,d0, c1,d1, c4,d4, c5,d5, - c8,d8, c9,d9, c12,d12, c13,d13 */ - "trn2 v11.8h, v6.8h, v7.8h\n" /* get c2,d2, c3,d3, c6,d6, c7,d7, - c10,d10, c11,d11, c14,d14, c15,d15 */ - /* trans 4s */ - "ld1 {v2.16b}, [%[ptr2]], #16\n" /* load r2 */ - "trn1 v4.4s, v8.4s, v9.4s\n" /* get a0,b0, a1,b1, a2,b2, a3,b3, a8,b8, - a9,b9, a10,b10, a11,b11 */ - "trn2 v5.4s, v8.4s, v9.4s\n" /* get a4,b4, a5,b5, a6,b6, a7,b7, - a12,b12, a13,b13, a14,b14, a15,b15 */ - "trn1 v6.4s, v10.4s, v11.4s\n" /* get c0,d0, c1,d1, c2,d2, c3,d3, - c8,d8, c9,d9, c10,d10, c11,d11 */ - "trn2 v7.4s, v10.4s, v11.4s\n" /* get c4,d4, c5,d5, c6,d6, c7,d7, - c12,d12, c13,d13, c14,d14, c15,d15 - */ - /* trans 2d */ - 
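        /* Editor's note (illustration only, not part of the original
           source): this trn1/trn2 ladder at 16b/8h/4s/2d granularity is a
           4x16 byte transpose: four k-rows of sixteen consecutive n values
           become four 16-byte packets, each holding two k-pairs for four
           consecutive n, e.g. v8 = {k0k1 x n0..n3, k2k3 x n0..n3} and
           v9 the same for n4..n7, so the kernel can stream B with one
           contiguous load per step. */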
"ld1 {v3.16b}, [%[ptr3]], #16\n" /* load r3 */ - "trn1 v8.2d, v4.2d, v6.2d\n" /* get a0,b0, a1,b1, a2,b2, a3,b3, c0,d0, - c1,d1, c2,d2, c3,d3 */ - "trn2 v10.2d, v4.2d, v6.2d\n" /* get a8,b8, a9,b9, a10,b10, a11,b11, - c8,d8, c9,d9, c10,d10, c11,d11 */ - "trn1 v9.2d, v5.2d, v7.2d\n" /* get a4,b4, a5,b5, a6,b6, a7,b7, c4,d4, - c5,d5, c6,d6, c7,d7 */ - "trn2 v11.2d, v5.2d, v7.2d\n" /* get a12,b12, a13,b13, a14,b14, - a15,b15, c12,d12, c13,d13, c14,d14, - c15,d15 */ - "st1 {v8.16b, v9.16b, v10.16b, v11.16b}, [%[ptr_out]], %[stride]\n" - "bgt 0b\n" /* jump to main loop */ - "1:\n" /* process remain */ - "cbz %w[rem], 2f\n" /* jump to remain */ - /* bit select */ - "bif v0.16b, %[vzero].16b, %[mask].16b\n" /* pad 0 */ - "bif v1.16b, %[vzero].16b, %[mask].16b\n" /* pad 0 */ - "bif v2.16b, %[vzero].16b, %[mask].16b\n" /* pad 0 */ - "bif v3.16b, %[vzero].16b, %[mask].16b\n" /* pad 0 */ - /* trans 16b */ - "trn1 v4.16b, v0.16b, v1.16b\n" /* get a0,b0, a2,b2, a4,b4, a6,b6, - a8,b8, a10,b10, a12,b12, a14,b14 */ - "trn2 v5.16b, v0.16b, v1.16b\n" /* get a1,b1, a3,b3, a5,b5, a7,b7, - a9,b9, a11,b11, a13,b13, a15,b15 */ - "trn1 v6.16b, v2.16b, v3.16b\n" /* get c0,d0, c2,d2, c4,d4, c6,d6, - c8,d8, c10,d10, c12,d12, c14,d14 */ - "trn2 v7.16b, v2.16b, v3.16b\n" /* get c1,d1, c3,d3, c5,d5, c7,d7, - c9,d9, c11,d11, c13,d13, c15,d15 */ - /* trans 8h */ - "trn1 v8.8h, v4.8h, v5.8h\n" /* get a0,b0, a1,b1, a4,b4, a5,b5, a8,b8, - a9,b9, a12,b12, a13,b13 */ - "trn2 v9.8h, v4.8h, v5.8h\n" /* get a2,b2, a3,b3, a6,b6, a7,b7, - a10,b10, a11,b11, a14,b14, a15,b15 */ - "trn1 v10.8h, v6.8h, v7.8h\n" /* get c0,d0, c1,d1, c4,d4, c5,d5, - c8,d8, c9,d9, c12,d12, c13,d13 */ - "trn2 v11.8h, v6.8h, v7.8h\n" /* get c2,d2, c3,d3, c6,d6, c7,d7, - c10,d10, c11,d11, c14,d14, c15,d15 */ - /* trans 4s */ - "trn1 v4.4s, v8.4s, v9.4s\n" /* get a0,b0, a1,b1, a2,b2, a3,b3, a8,b8, - a9,b9, a10,b10, a11,b11 */ - "trn2 v5.4s, v8.4s, v9.4s\n" /* get a4,b4, a5,b5, a6,b6, a7,b7, - a12,b12, a13,b13, a14,b14, a15,b15 */ - "trn1 v6.4s, v10.4s, v11.4s\n" /* get c0,d0, c1,d1, c2,d2, c3,d3, - c8,d8, c9,d9, c10,d10, c11,d11 */ - "trn2 v7.4s, v10.4s, v11.4s\n" /* get c4,d4, c5,d5, c6,d6, c7,d7, - c12,d12, c13,d13, c14,d14, c15,d15 - */ - /* trans 2d */ - "trn1 v8.2d, v4.2d, v6.2d\n" /* get a0,b0, a1,b1, a2,b2, a3,b3, c0,d0, - c1,d1, c2,d2, c3,d3 */ - "trn2 v10.2d, v4.2d, v6.2d\n" /* get a8,b8, a9,b9, a10,b10, a11,b11, - c8,d8, c9,d9, c10,d10, c11,d11 */ - "trn1 v9.2d, v5.2d, v7.2d\n" /* get a4,b4, a5,b5, a6,b6, a7,b7, c4,d4, - c5,d5, c6,d6, c7,d7 */ - "trn2 v11.2d, v5.2d, v7.2d\n" /* get a12,b12, a13,b13, a14,b14, - a15,b15, c12,d12, c13,d13, c14,d14, - c15,d15 */ - "st1 {v8.16b, v9.16b, v10.16b, v11.16b}, [%[ptr_out]]\n" /* save to - memory - */ - /* end */ - "2:\n" /* end */ - : [ptr0] "+r"(ptr0), [ptr1] "+r"(ptr1), [ptr2] "+r"(ptr2), - [ptr3] "+r"(ptr3), [k] "+r"(k), [ptr_out] "+r"(outptr_row_col) - : [rem] "r"(rem), [mask] "w"(vmask), [vzero] "w"(vzero), - [stride] "r"(stride_out) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "cc"); -#else // armv7 - asm volatile( - "vld1.8 {d0}, [%[ptr0]]!\n" /* load r0, a0,a1,a2,a3,a4,a5,a6,a7 */ - "vld1.8 {d1}, [%[ptr1]]!\n" /* load r1, b0,b1,b2,b3,b4,b5,b6,b7 */ - "vld1.8 {d2}, [%[ptr2]]!\n" /* load r2, c0,c1,c2,c3,c4,c5,c6,c7 */ - "vld1.8 {d3}, [%[ptr3]]!\n" /* load r3, d0,d1,d2,d3,d4,d5,d6,d7 */ - "cmp %[k], #0\n" /* check main loop count */ - "beq 1f\n" /* jump to remain */ - "0:\n" /* main loop */ - /* trans 8b */ - "vtrn.8 d0, d1\n" /* get d0: a0,b0, a2,b2, a4,b4, a6,b6; d1: 
a1,b1, - a3,b3, a5,b5, a7,b7 */ - "vtrn.8 d2, d3\n" /* get d2: c0,d0, c2,d2, c4,d4, c6,d6; d3: c1,d1, - c3,d3, c5,d5, c7,d7 */ - /* trans 4h */ - "vtrn.16 d0, d1\n" /* get d0: a0,b0, a1,b1, a4,b4, a5,b5; d1: a2,b2, - a3,b3, a6,b6, a7,b7 */ - "vtrn.16 d2, d3\n" /* get d2: c0,d0, c1,d1, c4,d4, c5,d5; d3: c2,d2, - c3,d3, c6,d6, c7,d7 */ - "subs %[k], %[k], #1\n" /* loop - 1 */ - /* trans 2s */ - "vtrn.32 d0, d1\n" /* get d0: a0,b0, a1,b1, a2,b2, a3,b3; d1: a4,b4, - a5,b5, a6,b6, a7,b7 */ - "vtrn.32 d2, d3\n" /* get d2: c0,d0, c1,d1, c2,d2, c3,d3; d3: c4,d4, - c5,d5, c6,d6, c7,d7 */ - "vst1.8 {d0-d3}, [%[ptr_out]], %[stride]\n" /* save to memory */ - "vld1.8 {d0}, [%[ptr0]]!\n" /* load r0, a0,a1,a2,a3,a4,a5,a6,a7 */ - "vld1.8 {d1}, [%[ptr1]]!\n" /* load r1, b0,b1,b2,b3,b4,b5,b6,b7 */ - "vld1.8 {d2}, [%[ptr2]]!\n" /* load r2, c0,c1,c2,c3,c4,c5,c6,c7 */ - "vld1.8 {d3}, [%[ptr3]]!\n" /* load r3, d0,d1,d2,d3,d4,d5,d6,d7 */ - "bgt 0b\n" /* jump to main loop */ - "1:\n" /* process remain */ - "cmp %[rem], #0\n" /* check remain size */ - "beq 2f\n" /* jump to end */ - /* bit select */ - "vbif d0, %e[vzero], %e[mask]\n" /* pad 0 */ - "vbif d1, %e[vzero], %e[mask]\n" /* pad 0 */ - "vbif d2, %e[vzero], %e[mask]\n" /* pad 0 */ - "vbif d3, %e[vzero], %e[mask]\n" /* pad 0 */ - /* trans 8b */ - "vtrn.8 d0, d1\n" /* get d0: a0,b0, a2,b2, a4,b4, a6,b6; d1: a1,b1, - a3,b3, a5,b5, a7,b7 */ - "vtrn.8 d2, d3\n" /* get d2: c0,d0, c2,d2, c4,d4, c6,d6; d3: c1,d1, - c3,d3, c5,d5, c7,d7 */ - /* trans 4h */ - "vtrn.16 d0, d1\n" /* get d0: a0,b0, a1,b1, a4,b4, a5,b5; d1: a2,b2, - a3,b3, a6,b6, a7,b7 */ - "vtrn.16 d2, d3\n" /* get d2: c0,d0, c1,d1, c4,d4, c5,d5; d3: c2,d2, - c3,d3, c6,d6, c7,d7 */ - /* trans 2s */ - "vtrn.32 d0, d1\n" /* get d0: a0,b0, a1,b1, a2,b2, a3,b3; d1: a4,b4, - a5,b5, a6,b6, a7,b7 */ - "vtrn.32 d2, d3\n" /* get d2: c0,d0, c1,d1, c2,d2, c3,d3; d3: c4,d4, - c5,d5, c6,d6, c7,d7 */ - "vst1.8 {d0-d3}, [%[ptr_out]]\n" /* save to memory */ - /* end */ - "2:\n" /* end */ - : [ptr0] "+r"(ptr0), [ptr1] "+r"(ptr1), [ptr2] "+r"(ptr2), - [ptr3] "+r"(ptr3), [k] "+r"(k), [ptr_out] "+r"(outptr_row_col) - : [rem] "r"(rem), [mask] "w"(vmask), [vzero] "w"(vzero), - [stride] "r"(stride_out) - : "q0", "q1", "cc"); -#endif //__aarch64__ // NOLINT - } -} - -/************************************************************************/ -// prepack B according to gemm kernel -// origin B data: -// B_origin(transpose, n x k: -// k unroll 2, a0=k0,k1 -// r0: ==> a0, a1, a2, a3, a4, a5, a6, a7 -// r1: ==> b0, b1, b2, b3, b4, b5, b6, b7 -// r2: ==> c0, c1, c2, c3, c4, c5, c6, c7 -// r3: ==> d0, d1, d2, d3, d4, d5, d6, d7 -// r4: ==> e0, e1, e2, e3, e4, e5, e6, e7 -// r5: ==> f0, f1, f2, f3, f4, f5, f6, f7 -// r6: ==> g0, g1, g2, g3, g4, g5, g6, g7 -// r7: ==> h0, h1, h2, h3, h4, h5, h6, h7 -// for armv8: -// B block size: (<4x2>x4) x2, can be described as below: -// packed B: -// a0,b0, c0,d0, a1,b1, c1,d1; -// e0,f0, g0,h0, e1,f1, g1,h1;--block0, address+64 -// . -// . -// . -// a6,b6, c6,d6, a7,b7, c7,d7; -// e6,f6, g6,h6, e7,f7, g7,h7;--block3, address+64 -// for armv7: -// B block size: (<8x2>x1) x2, can be described as below: -// packed B: -// a0,b0, c0,d0, e0,f0, g0,h0; -// a1,b1, c1,d1, e1,f1, g1,h1;--block0, address+32 -// . -// . -// . 
-// a6,b6, c6,d6, e6,f6, g6,h6; -// a7,b7, c7,d7, e7,f7, g7,h7;--block3, address+32 -/*******************************************************************/ -void packb_trans_int8(int8_t* out, const int8_t* in, const int ldin, - const int k0, const int kmax, const int n0, - const int nmax, const int8_t* zerobuf) { - const int KUNROLL = 4; - const int NUNROLL = 8; - const int RATIO = NBLOCK_INT8_OTH / NUNROLL; - const int8_t* inptr = in + n0 * ldin + k0; - const uint8_t mask_buffer[16] = {0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15}; - int y_len = nmax - n0; - int x_len = kmax - k0; - int yup = ROUNDUP(y_len, NBLOCK_INT8_OTH); - const int kup = ROUNDUP(x_len, KBLOCK_INT8); - const int KSTRIDE = KBLOCK_INT8 * KUNROLL; - int kcnt = x_len / KSTRIDE; - int x_rem = (x_len & (KSTRIDE - 1)); - int k_rem = (x_rem + KBLOCK_INT8 - 1) / KBLOCK_INT8; - const int stride_inner = KBLOCK_INT8 * NUNROLL; - const int stride_outer = kup * NBLOCK_INT8_OTH; - const int ncnt = yup / NUNROLL; - - int8x16_t vzero = vdupq_n_s8(0); - uint8x16_t vmask = vcltq_u8(vld1q_u8(mask_buffer), vdupq_n_u8(x_rem)); - -#pragma omp parallel for - for (int y = 0; y < ncnt; y++) { - int idx = y * NUNROLL; - const int8_t* ptr0 = inptr + idx * ldin; - const int8_t* ptr1 = ptr0 + ldin; - const int8_t* ptr2 = ptr1 + ldin; - const int8_t* ptr3 = ptr2 + ldin; - const int8_t* ptr4 = ptr3 + ldin; - const int8_t* ptr5 = ptr4 + ldin; - const int8_t* ptr6 = ptr5 + ldin; - const int8_t* ptr7 = ptr6 + ldin; - // only for ratio = 0 or 1 - int8_t* ptr_out = - out + (y & (RATIO - 1)) * stride_inner + (y / RATIO) * stride_outer; - if (idx + NUNROLL > y_len) { - switch (idx + NUNROLL - y_len) { - case 8: - ptr0 = zerobuf; - case 7: - ptr1 = zerobuf; - case 6: - ptr2 = zerobuf; - case 5: - ptr3 = zerobuf; - case 4: - ptr4 = zerobuf; - case 3: - ptr5 = zerobuf; - case 2: - ptr6 = zerobuf; - case 1: - ptr7 = zerobuf; - default: - break; - } - } - int k = kcnt; - int rem = k_rem; -#ifdef __aarch64__ - asm volatile( - "cbz %w[k], 1f\n" /* skip main loop */ - /* main loop */ - "0:\n" /* main loop */ - "ld1 {v0.16b}, [%[ptr0]], #16\n" /* load n0, k0~k15 */ - "ld1 {v1.16b}, [%[ptr1]], #16\n" /* load n1, k0~k15 */ - "ld1 {v2.16b}, [%[ptr2]], #16\n" /* load n2, k0~k15 */ - "ld1 {v3.16b}, [%[ptr3]], #16\n" /* load n3, k0~k15 */ - "ld1 {v4.16b}, [%[ptr4]], #16\n" /* load n4, k0~k15 */ - "ld1 {v5.16b}, [%[ptr5]], #16\n" /* load n5, k0~k15 */ - "ld1 {v6.16b}, [%[ptr6]], #16\n" /* load n6, k0~k15 */ - "ld1 {v7.16b}, [%[ptr7]], #16\n" /* load n7, k0~k15 */ - /* trans, 8h */ - "trn1 v8.8h, v0.8h, v1.8h\n" /* trans, zip n0,n1 */ - "trn2 v9.8h, v0.8h, v1.8h\n" /* trans, zip n0,n1 */ - "trn1 v10.8h, v2.8h, v3.8h\n" /* trans, zip n2,n3 */ - "trn2 v11.8h, v2.8h, v3.8h\n" /* trans, zip n2,n3 */ - "trn1 v12.8h, v4.8h, v5.8h\n" /* trans, zip n4,n5 */ - "trn2 v13.8h, v4.8h, v5.8h\n" /* trans, zip n4,n5 */ - "trn1 v14.8h, v6.8h, v7.8h\n" /* trans, zip n6,n7 */ - "trn2 v15.8h, v6.8h, v7.8h\n" /* trans, zip n6,n7 */ - /* trans, 4s */ - "trn1 v16.4s, v8.4s, v10.4s\n" /* trans, block 0 */ - "trn2 v17.4s, v8.4s, v10.4s\n" /* trans, block 0 */ - "trn1 v18.4s, v9.4s, v11.4s\n" /* trans, block 0 */ - "trn2 v19.4s, v9.4s, v11.4s\n" /* trans, block 0 */ - "trn1 v20.4s, v12.4s, v14.4s\n" /* trans, block 1 */ - "trn2 v21.4s, v12.4s, v14.4s\n" /* trans, block 1 */ - "trn1 v22.4s, v13.4s, v15.4s\n" /* trans, block 1 */ - "trn2 v23.4s, v13.4s, v15.4s\n" /* trans, block 1 */ - "subs %w[k], %w[k], #1\n" /* loop count -1 */ - /* trans, 2d */ - "trn1 v8.2d, v16.2d, v18.2d\n" /* trans, 
block 0, out0 */ - "trn1 v9.2d, v20.2d, v22.2d\n" /* trans, block 1, out0 */ - "trn1 v10.2d, v17.2d, v19.2d\n" /* trans, block 0, out1 */ - "trn1 v11.2d, v21.2d, v23.2d\n" /* trans, block 1, out1 */ - "trn2 v12.2d, v16.2d, v18.2d\n" /* trans, block 0, out2 */ - "trn2 v13.2d, v20.2d, v22.2d\n" /* trans, block 1, out2 */ - "trn2 v14.2d, v17.2d, v19.2d\n" /* trans, block 0, out3 */ - "trn2 v15.2d, v21.2d, v23.2d\n" /* trans, block 1, out3 */ - /* store result */ - "stp q8, q9, [%[ptr_out]],#64\n" /* write 0 */ - "stp q10, q11, [%[ptr_out]],#64\n" /* write 1 */ - "stp q12, q13, [%[ptr_out]],#64\n" /* write 2 */ - "stp q14, q15, [%[ptr_out]],#64\n" /* write 3 */ - "bgt 0b\n" /* jump to main loop */ - /* process remain */ - "1:\n" /* process remains */ - "cbz %w[rem], 2f\n" /* no remain, jump to end */ - "ld1 {v0.16b}, [%[ptr0]]\n" /* load n0, k0~k15 */ - "ld1 {v1.16b}, [%[ptr1]]\n" /* load n1, k0~k15 */ - "ld1 {v2.16b}, [%[ptr2]]\n" /* load n2, k0~k15 */ - "ld1 {v3.16b}, [%[ptr3]]\n" /* load n3, k0~k15 */ - "ld1 {v4.16b}, [%[ptr4]]\n" /* load n4, k0~k15 */ - "ld1 {v5.16b}, [%[ptr5]]\n" /* load n5, k0~k15 */ - "ld1 {v6.16b}, [%[ptr6]]\n" /* load n6, k0~k15 */ - "ld1 {v7.16b}, [%[ptr7]]\n" /* load n7, k0~k15 */ - /* bit select */ - "bif v0.16b, %[vzero].16b, %[mask].16b\n" /* pad 0 */ - "bif v1.16b, %[vzero].16b, %[mask].16b\n" /* pad 0 */ - "bif v2.16b, %[vzero].16b, %[mask].16b\n" /* pad 0 */ - "bif v3.16b, %[vzero].16b, %[mask].16b\n" /* pad 0 */ - "bif v4.16b, %[vzero].16b, %[mask].16b\n" /* pad 0 */ - "bif v5.16b, %[vzero].16b, %[mask].16b\n" /* pad 0 */ - "bif v6.16b, %[vzero].16b, %[mask].16b\n" /* pad 0 */ - "bif v7.16b, %[vzero].16b, %[mask].16b\n" /* pad 0 */ - /* trans, 8h */ - "trn1 v8.8h, v0.8h, v1.8h\n" /* trans, zip n0,n1 */ - "trn2 v9.8h, v0.8h, v1.8h\n" /* trans, zip n0,n1 */ - "trn1 v10.8h, v2.8h, v3.8h\n" /* trans, zip n2,n3 */ - "trn2 v11.8h, v2.8h, v3.8h\n" /* trans, zip n2,n3 */ - "trn1 v12.8h, v4.8h, v5.8h\n" /* trans, zip n4,n5 */ - "trn2 v13.8h, v4.8h, v5.8h\n" /* trans, zip n4,n5 */ - "trn1 v14.8h, v6.8h, v7.8h\n" /* trans, zip n6,n7 */ - "trn2 v15.8h, v6.8h, v7.8h\n" /* trans, zip n6,n7 */ - /* trans, 4s */ - "trn1 v16.4s, v8.4s, v10.4s\n" /* trans, block 0 */ - "trn2 v17.4s, v8.4s, v10.4s\n" /* trans, block 0 */ - "trn1 v18.4s, v9.4s, v11.4s\n" /* trans, block 0 */ - "trn2 v19.4s, v9.4s, v11.4s\n" /* trans, block 0 */ - "trn1 v20.4s, v12.4s, v14.4s\n" /* trans, block 1 */ - "trn2 v21.4s, v12.4s, v14.4s\n" /* trans, block 1 */ - "trn1 v22.4s, v13.4s, v15.4s\n" /* trans, block 1 */ - "trn2 v23.4s, v13.4s, v15.4s\n" /* trans, block 1 */ - /* trans, 2d */ - "trn1 v8.2d, v16.2d, v18.2d\n" /* trans, block 0, out0 */ - "trn1 v9.2d, v20.2d, v22.2d\n" /* trans, block 1, out0 */ - "trn1 v10.2d, v17.2d, v19.2d\n" /* trans, block 0, out1 */ - "trn1 v11.2d, v21.2d, v23.2d\n" /* trans, block 1, out1 */ - "trn2 v12.2d, v16.2d, v18.2d\n" /* trans, block 0, out2 */ - "trn2 v13.2d, v20.2d, v22.2d\n" /* trans, block 1, out2 */ - "trn2 v14.2d, v17.2d, v19.2d\n" /* trans, block 0, out3 */ - "trn2 v15.2d, v21.2d, v23.2d\n" /* trans, block 1, out3 */ - /* check remain size */ - "subs %w[rem], %w[rem], #1\n" /* check remain num */ - "stp q8, q9, [%[ptr_out]],#64\n" /* write 0 */ - "beq 2f\n" /* remain = 1 */ - "subs %w[rem], %w[rem], #1\n" /* check remain num */ - "stp q10, q11, [%[ptr_out]],#64\n" /* write 1 */ - "beq 2f\n" /* remain = 2 */ - "subs %w[rem], %w[rem], #1\n" /* check remain num */ - "stp q12, q13, [%[ptr_out]],#64\n" /* write 2 */ - "beq 2f\n" /* remain = 3 */ - "stp q14, q15, 
[%[ptr_out]]\n" /* write 3 */ - /* end */ - "2:\n" /* end */ - : [ptr0] "+r"(ptr0), [ptr1] "+r"(ptr1), [ptr2] "+r"(ptr2), - [ptr3] "+r"(ptr3), [ptr4] "+r"(ptr4), [ptr5] "+r"(ptr5), - [ptr6] "+r"(ptr6), [ptr7] "+r"(ptr7), [ptr_out] "+r"(ptr_out), - [k] "+r"(k), [rem] "+r"(rem) - : [mask] "w"(vmask), [vzero] "w"(vzero) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "cc"); -#else // armv7 - asm volatile( - "cmp %[k], #0\n" /* check main loop */ - "beq 1f\n" /* skip main loop */ - /* main loop */ - "0:\n" /* main loop */ - "vld1.8 {d0-d1}, [%[ptr0]]!\n" /* load n0, a0~a7 */ - "vld1.8 {d2-d3}, [%[ptr1]]!\n" /* load n1, b0~b7 */ - "vld1.8 {d4-d5}, [%[ptr2]]!\n" /* load n2, c0~c7 */ - "vld1.8 {d6-d7}, [%[ptr3]]!\n" /* load n3, d0~d7 */ - "vld1.8 {d8-d9}, [%[ptr4]]!\n" /* load n4, e0~e7 */ - "vld1.8 {d10-d11}, [%[ptr5]]!\n" /* load n5, f0~f7 */ - "vld1.8 {d12-d13}, [%[ptr6]]!\n" /* load n6, g0~g7 */ - "vld1.8 {d14-d15}, [%[ptr7]]!\n" /* load n7, h0~h7 */ - /* trans, 8h */ - "vtrn.16 q0, q1\n" /* trans, zip n0,n1, q0: a0b0,a2b2, a4b4,a6b6, q1: - a1b1,a3b3, a5b5,a7b7 */ - "vtrn.16 q2, q3\n" /* trans, zip n2,n3, q2: c0d0,c2d2, c4d4,c6d6, q3: - c1d1,c3d3, c5d5,c7d7 */ - "vtrn.16 q4, q5\n" /* trans, zip n4,n5, q4: e0f0,e2f2, e4f4,e6f6, q5: - e1f1,e3f3, e5f5,e7f7 */ - "vtrn.16 q6, q7\n" /* trans, zip n6,n7, q6: g0h0,g2h2, g4h4,g6h6, q7: - g1h1,g3h3, g5h5,g7h7 */ - /* trans, 4s */ - "vtrn.32 q0, q2\n" /* trans, q0: a0b0,c0d0, a4b4,c4d4, q2: a2b2,c2d2, - a6b6,c6d6 */ - "vtrn.32 q1, q3\n" /* trans, q1: a1b1,c1d1, a5b5,c5d5, q3: a3b3,c3d3, - a7b7,c7d7 */ - "vtrn.32 q4, q6\n" /* trans, q4: e0f0,g0h0, e4f4,g4h4, q6: e2f2,g2h2, - e6f6,g6h6 */ - "vtrn.32 q5, q7\n" /* trans, q5: e1f1,g1h1, e5f5,g5h5, q7: e3f3,g3h3, - e7f7,g7h7 */ - "subs %[k], %[k], #1\n" /* loop count -1 */ - /* trans, 2d */ - "vswp d1, d8\n" /* q0: a0b0,c0d0, e0f0,g0h0, q4: a4b4,c4d4, e4f4,g4h4 - */ - "vswp d3, d10\n" /* q1: a1b1,c1d1, e1f1,g1h1, q5: a5b5,c5d5, e5f5,g5h5 - */ - "vswp d5, d12\n" /* q2: a2b2,c2d2, e2f2,g2h2, q6: a6b6,c6d6, e6f6,g6h6 - */ - "vswp d7, d14\n" /* q3: a3b3,c3d3, e3f3,g3h3, q7: a7b7,c7d7, e7f7,g7h7 - */ - /* store result */ - "vst1.8 {d0-d3}, [%[ptr_out]]!\n" /* write 0 */ - "vst1.8 {d4-d7}, [%[ptr_out]]!\n" /* write 1 */ - "vst1.8 {d8-d11}, [%[ptr_out]]!\n" /* write 2 */ - "vst1.8 {d12-d15}, [%[ptr_out]]!\n" /* write 3 */ - "bgt 0b\n" /* jump to main loop */ - /* process remain */ - "1:\n" /* process remains */ - "cmp %[rem], #0\n" /* check remain */ - "beq 2f\n" /* no remain, jump to end */ - "vld1.8 {d0-d1}, [%[ptr0]]!\n" /* load n0, a0~a7 */ - "vld1.8 {d2-d3}, [%[ptr1]]!\n" /* load n1, b0~b7 */ - "vld1.8 {d4-d5}, [%[ptr2]]!\n" /* load n2, c0~c7 */ - "vld1.8 {d6-d7}, [%[ptr3]]!\n" /* load n3, d0~d7 */ - "vld1.8 {d8-d9}, [%[ptr4]]!\n" /* load n4, e0~e7 */ - "vld1.8 {d10-d11}, [%[ptr5]]!\n" /* load n5, f0~f7 */ - "vld1.8 {d12-d13}, [%[ptr6]]!\n" /* load n6, g0~g7 */ - "vld1.8 {d14-d15}, [%[ptr7]]!\n" /* load n7, h0~h7 */ - /* bit select */ - "vbif q0, %q[vzero], %q[mask]\n" /* pad 0 */ - "vbif q1, %q[vzero], %q[mask]\n" /* pad 0 */ - "vbif q2, %q[vzero], %q[mask]\n" /* pad 0 */ - "vbif q3, %q[vzero], %q[mask]\n" /* pad 0 */ - "vbif q4, %q[vzero], %q[mask]\n" /* pad 0 */ - "vbif q5, %q[vzero], %q[mask]\n" /* pad 0 */ - "vbif q6, %q[vzero], %q[mask]\n" /* pad 0 */ - "vbif q7, %q[vzero], %q[mask]\n" /* pad 0 */ - /* trans, 8h */ - "vtrn.16 q0, q1\n" /* trans, zip n0,n1, q0: a0b0,a2b2, a4b4,a6b6, q1: - 
a1b1,a3b3, a5b5,a7b7 */ - "vtrn.16 q2, q3\n" /* trans, zip n2,n3, q2: c0d0,c2d2, c4d4,c6d6, q3: - c1d1,c3d3, c5d5,c7d7 */ - "vtrn.16 q4, q5\n" /* trans, zip n4,n5, q4: e0f0,e2f2, e4f4,e6f6, q5: - e1f1,e3f3, e5f5,e7f7 */ - "vtrn.16 q6, q7\n" /* trans, zip n6,n7, q6: g0h0,g2h2, g4h4,g6h6, q7: - g1h1,g3h3, g5h5,g7h7 */ - /* trans, 4s */ - "vtrn.32 q0, q2\n" /* trans, q0: a0b0,c0d0, a4b4,c4d4, q2: a2b2,c2d2, - a6b6,c6d6 */ - "vtrn.32 q1, q3\n" /* trans, q1: a1b1,c1d1, a5b5,c5d5, q3: a3b3,c3d3, - a7b7,c7d7 */ - "vtrn.32 q4, q6\n" /* trans, q4: e0f0,g0h0, e4f4,g4h4, q6: e2f2,g2h2, - e6f6,g6h6 */ - "vtrn.32 q5, q7\n" /* trans, q5: e1f1,g1h1, e5f5,g5h5, q7: e3f3,g3h3, - e7f7,g7h7 */ - /* trans, 2d */ - "vswp d1, d8\n" /* q0: a0b0,c0d0, e0f0,g0h0, q4: a4b4,c4d4, e4f4,g4h4 - */ - "vswp d3, d10\n" /* q1: a1b1,c1d1, e1f1,g1h1, q5: a5b5,c5d5, e5f5,g5h5 - */ - "vswp d5, d12\n" /* q2: a2b2,c2d2, e2f2,g2h2, q6: a6b6,c6d6, e6f6,g6h6 - */ - "vswp d7, d14\n" /* q3: a3b3,c3d3, e3f3,g3h3, q7: a7b7,c7d7, e7f7,g7h7 - */ - /* check remain size */ - "subs %[rem], %[rem], #1\n" /* check remain num */ - "vst1.8 {d0-d3}, [%[ptr_out]]!\n" /* write 0 */ - "beq 2f\n" /* remain = 1 */ - "subs %[rem], %[rem], #1\n" /* check remain num */ - "vst1.8 {d4-d7}, [%[ptr_out]]!\n" /* write 1 */ - "beq 2f\n" /* remain = 2 */ - "subs %[rem], %[rem], #1\n" /* check remain num */ - "vst1.8 {d8-d11}, [%[ptr_out]]!\n" /* write 2 */ - "beq 2f\n" /* remain = 3 */ - "vst1.8 {d12-d15}, [%[ptr_out]]!\n" /* write 3 */ - /* end */ - "2:\n" /* end */ - : [ptr0] "+r"(ptr0), [ptr1] "+r"(ptr1), [ptr2] "+r"(ptr2), - [ptr3] "+r"(ptr3), [ptr4] "+r"(ptr4), [ptr5] "+r"(ptr5), - [ptr6] "+r"(ptr6), [ptr7] "+r"(ptr7), [ptr_out] "+r"(ptr_out), - [k] "+r"(k), [rem] "+r"(rem) - : [mask] "w"(vmask), [vzero] "w"(vzero) - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "cc"); -#endif //__aarch64__ // NOLINT - } -} - -#if defined(__aarch64__) && defined(WITH_ARM_DOTPROD) - -template -void gemm_prepack_sdot_int8(const int8_t* A_packed, - const int8_t* B, - const int* bias, - Dtype* C, - int M, - int N, - int K, - bool is_bias, - bool is_relu, - bool is_transB, - const float* scale, - ARMContext* ctx) { - size_t llc_size = ctx->llc_size() / 4; - auto workspace = ctx->workspace_data(); - //! MBLOCK_INT8_DOT * x (result) + MBLOCK_INT8_DOT * k (A) + x * k (B) = l2 - int x_block = (llc_size - (MBLOCK_INT8_DOT * K)) / \ - (sizeof(int8_t) * (K + MBLOCK_INT8_DOT)); - x_block /= NBLOCK_INT8_DOT; - x_block *= NBLOCK_INT8_DOT; - int x_num = (N + (x_block - 1)) / x_block; - x_block = (N + x_num - 1) / x_num; - x_block = (x_block + NBLOCK_INT8_DOT - 1) / NBLOCK_INT8_DOT; - x_block *= NBLOCK_INT8_DOT; - x_block = x_block < NBLOCK_INT8_DOT ? NBLOCK_INT8_DOT : x_block; - - int kup = ROUNDUP(K, KBLOCK_INT8); - // unroll 2 loop - int tail_pre = ((kup / 4) & (KBLOCK_INT8 - 1)); - int k_pre = (((kup / 4) + KBLOCK_INT8 - 1) / KBLOCK_INT8) - 1; - - bool flag_p_remain = false; - int remain = 0; - - //! apanel is pre_compute outside gemm - for (unsigned int x0 = 0; x0 < N; x0 += x_block) { - unsigned int xmax = x0 + x_block; - if (xmax > N) { - xmax = N; - } - int bblocks = (xmax - x0 + NBLOCK_INT8_DOT - 1) / NBLOCK_INT8_DOT; - remain = xmax - x0 - (bblocks - 1) * NBLOCK_INT8_DOT; - if (remain > 0) { - flag_p_remain = true; - } - //! 
load bpanel - auto b_pannel = static_cast(workspace); - if (!is_transB) { - // K * N - packb_sdot_int8(b_pannel, B, N, 0, K, x0, xmax); - } else { - // N X K - packb_sdot_trans_int8(b_pannel, B, K, 0, K, x0, xmax); - } -#pragma omp parallel for - for (unsigned int y = 0; y < M; y += MBLOCK_INT8_DOT) { - unsigned int ymax = y + MBLOCK_INT8_DOT; - if (ymax > M) { - ymax = M; - } - - int32_t bias_local[8] = {0, 0, 0, 0, 0, 0, 0, 0}; - if (is_bias) { - bias_local[0] = bias[y]; - bias_local[1] = bias[y + 1]; - bias_local[2] = bias[y + 2]; - bias_local[3] = bias[y + 3]; - bias_local[4] = bias[y + 4]; - bias_local[5] = bias[y + 5]; - bias_local[6] = bias[y + 6]; - bias_local[7] = bias[y + 7]; - } - float32_t scale_local[8]; - if (scale) { - scale_local[0] = scale[y]; - scale_local[1] = scale[y + 1]; - scale_local[2] = scale[y + 2]; - scale_local[3] = scale[y + 3]; - scale_local[4] = scale[y + 4]; - scale_local[5] = scale[y + 5]; - scale_local[6] = scale[y + 6]; - scale_local[7] = scale[y + 7]; - } - - Dtype cout0[NBLOCK_INT8_DOT]; - Dtype cout1[NBLOCK_INT8_DOT]; - Dtype cout2[NBLOCK_INT8_DOT]; - Dtype cout3[NBLOCK_INT8_DOT]; - Dtype cout4[NBLOCK_INT8_DOT]; - Dtype cout5[NBLOCK_INT8_DOT]; - Dtype cout6[NBLOCK_INT8_DOT]; - Dtype cout7[NBLOCK_INT8_DOT]; - - Dtype *c_ptr0 = C + y * N + x0; - Dtype *c_ptr1 = c_ptr0 + N; - Dtype *c_ptr2 = c_ptr1 + N; - Dtype *c_ptr3 = c_ptr2 + N; - Dtype *c_ptr4 = c_ptr3 + N; - Dtype *c_ptr5 = c_ptr4 + N; - Dtype *c_ptr6 = c_ptr5 + N; - Dtype *c_ptr7 = c_ptr6 + N; - - Dtype *pout0 = c_ptr0; - Dtype *pout1 = c_ptr1; - Dtype *pout2 = c_ptr2; - Dtype *pout3 = c_ptr3; - Dtype *pout4 = c_ptr4; - Dtype *pout5 = c_ptr5; - Dtype *pout6 = c_ptr6; - Dtype *pout7 = c_ptr7; - - // const int8_t *a_ptr_l = A_packed + y * K; - const int8_t *a_ptr_l = A_packed + y * kup; - const int8_t *b_ptr = b_pannel; - for (int xb = 0; xb < bblocks; xb++) { - if ((y + 7) >= ymax) { - switch ((y + 7) - ymax) { - case 6: - c_ptr1 = cout1; - case 5: - c_ptr2 = cout2; - case 4: - c_ptr3 = cout3; - case 3: - c_ptr4 = cout4; - case 2: - c_ptr5 = cout5; - case 1: - c_ptr6 = cout6; - case 0: - c_ptr7 = cout7; - default: - break; - } - } - if (flag_p_remain && (xb == bblocks - 1)) { - pout0 = c_ptr0; - pout1 = c_ptr1; - pout2 = c_ptr2; - pout3 = c_ptr3; - pout4 = c_ptr4; - pout5 = c_ptr5; - pout6 = c_ptr6; - pout7 = c_ptr7; - - c_ptr0 = cout0; - c_ptr1 = cout1; - c_ptr2 = cout2; - c_ptr3 = cout3; - c_ptr4 = cout4; - c_ptr5 = cout5; - c_ptr6 = cout6; - c_ptr7 = cout7; - } - const int8_t *a_ptr = a_ptr_l; - int tail = tail_pre; - int k = k_pre; - sgemm_sdot_int8_kernel(a_ptr, b_ptr, - bias_local, c_ptr0, c_ptr1, c_ptr2, c_ptr3, \ - c_ptr4, c_ptr5, c_ptr6, c_ptr7, scale_local, \ - is_relu, k, tail); - if (flag_p_remain && (xb == bblocks - 1)) { - for (int i = 0; i < remain; ++i) { - *pout0++ = cout0[i]; - *pout1++ = cout1[i]; - *pout2++ = cout2[i]; - *pout3++ = cout3[i]; - *pout4++ = cout4[i]; - *pout5++ = cout5[i]; - *pout6++ = cout6[i]; - *pout7++ = cout7[i]; - } - } - } - } - } -} - -void prepackA_m8k4_int8(int8_t* out, - const int8_t* in, - const int ldin, - const int m0, - const int mmax, - const int k0, - const int kmax) { - int x_len = (kmax - k0); - int8_t zerobuff[x_len]; //NOLINT - memset(zerobuff, 0, sizeof(int8_t) * x_len); - - int8_t *dout = out; - const int8_t *inptr = in; - int kup = ROUNDUP(x_len, KBLOCK_INT8); - int stride = kup * 8; - int remain = x_len % 4; -#pragma omp parallel for - for (int y = m0; y < mmax; y += 8) { - int8_t* outptr = dout + stride * (y - m0) / 8; - const int8_t * 
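/* Panel layout this routine emits, in scalar form (a sketch, not library
   code): A is packed in 8-row stripes (MBLOCK_INT8_DOT); inside a stripe,
   each KBLOCK_INT8 = 4 slice of k is written row by row, zero-padded past
   mmax/kmax, i.e.
     for (int kb = 0; kb < kup; kb += 4)
       for (int r = 0; r < 8; ++r)
         for (int k = kb; k < kb + 4; ++k)
           *out++ = valid(y + r, k) ? in[(y + r) * ldin + k0 + k] : 0;
   so the sdot kernel streams one 8x4 tile per 32 consecutive bytes. */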
inptr_row[8]; - inptr_row[0] = inptr + y * ldin + k0; - for (int i = 1; i < 8; i++) { - inptr_row[i] = inptr_row[i - 1] + ldin; - } - //! cope with row index exceed real size, set to zero buffer - if ((y + 7) >= mmax) { - switch ((y + 7) - mmax) { - case 6: - inptr_row[1] = zerobuff; - case 5: - inptr_row[2] = zerobuff; - case 4: - inptr_row[3] = zerobuff; - case 3: - inptr_row[4] = zerobuff; - case 2: - inptr_row[5] = zerobuff; - case 1: - inptr_row[6] = zerobuff; - case 0: - inptr_row[7] = zerobuff; - default: - break; - } - } - asm volatile( - "prfm pldl1keep, [%[ptr0]] \n" - "prfm pldl1keep, [%[ptr0], #64] \n" - "prfm pldl1keep, [%[ptr1]] \n" - "prfm pldl1keep, [%[ptr1], #64] \n" - "prfm pldl1keep, [%[ptr2]] \n" - "prfm pldl1keep, [%[ptr2], #64] \n" - "prfm pldl1keep, [%[ptr3]] \n" - "prfm pldl1keep, [%[ptr3], #64] \n" - "prfm pldl1keep, [%[ptr4]] \n" - "prfm pldl1keep, [%[ptr4], #64] \n" - "prfm pldl1keep, [%[ptr5]] \n" - "prfm pldl1keep, [%[ptr5], #64] \n" - "prfm pldl1keep, [%[ptr6]] \n" - "prfm pldl1keep, [%[ptr6], #64] \n" - "prfm pldl1keep, [%[ptr7]] \n" - "prfm pldl1keep, [%[ptr7], #64] \n" - : - :[ptr0] "r"(inptr_row[0]),[ptr1] "r"(inptr_row[1]),[ptr2] "r"(inptr_row[2]),[ptr3] "r"(inptr_row[3]),\ - [ptr4] "r"(inptr_row[4]),[ptr5] "r"(inptr_row[5]),[ptr6] "r"(inptr_row[6]),[ptr7] "r"(inptr_row[7]) - :"memory" - ); - - int x = x_len; - - for (; x > 7; x -= 8) { - asm volatile( - "ld1 {v0.8b}, [%[inptr0]], #8 \n" // v0=a0a1a2a3a4a5a6a7 - "ld1 {v1.8b}, [%[inptr1]], #8 \n" // v1=b0b1b2b3b4b5b6b7 - "ld1 {v2.8b}, [%[inptr2]], #8 \n" // v2=c0c1c2c3c4c5c6c7 - "ld1 {v3.8b}, [%[inptr3]], #8 \n" // v3=d0d1d2d3d4d5d6d7 - - "ld1 {v4.8b}, [%[inptr4]], #8 \n" // v0=e0e1a2a3a4a5a6a7 - "ld1 {v5.8b}, [%[inptr5]], #8 \n" // v1=f0f1b2b3b4b5b6b7 - "ld1 {v6.8b}, [%[inptr6]], #8 \n" // v2=g0g1c2c3c4c5c6c7 - "ld1 {v7.8b}, [%[inptr7]], #8 \n" // v3=h0h1d2d3d4d5d6d7 - - "trn1 v8.2s, v0.2s, v1.2s \n" // v0=a0a1a2a3b0b1b2b3 - "trn2 v9.2s, v0.2s, v1.2s \n" // v0=a4a5a6a7b4b5b6b7 - "trn1 v10.2s, v2.2s, v3.2s \n" // v0=c0c1c2c3d0d1d2d3 - "trn2 v11.2s, v2.2s, v3.2s \n" // v0=c4c5c6c7d4d5d6d7 - - "trn1 v12.2s, v4.2s, v5.2s \n" // v0=e0e1e2e3f0f1f2f3 - "trn2 v13.2s, v4.2s, v5.2s \n" // v0=e4e5e6e7f4f5f6f7 - "trn1 v14.2s, v6.2s, v7.2s \n" // v0=g0g1g2g3h0h1h2h3 - "trn2 v15.2s, v6.2s, v7.2s \n" // v0=g4g5g6g7h4h5h6h7 - - "st1 {v8.2s}, [%[outptr]], #8\n" - "st1 {v10.2s}, [%[outptr]], #8\n" - "st1 {v12.2s}, [%[outptr]], #8\n" - "st1 {v14.2s}, [%[outptr]], #8\n" - - "st1 {v9.2s}, [%[outptr]], #8\n" - "st1 {v11.2s}, [%[outptr]], #8\n" - "st1 {v13.2s}, [%[outptr]], #8\n" - "st1 {v15.2s}, [%[outptr]], #8\n" - - :[inptr0] "+r"(inptr_row[0]), [inptr1] "+r"(inptr_row[1]), - [inptr2] "+r"(inptr_row[2]), [inptr3] "+r"(inptr_row[3]), - [inptr4] "+r"(inptr_row[4]), [inptr5] "+r"(inptr_row[5]), - [inptr6] "+r"(inptr_row[6]), [inptr7] "+r"(inptr_row[7]), - [outptr] "+r"(outptr) - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", - "v13", "v14", "v15", "v16", "cc", "memory" - ); - } - if (x >= 4) { - asm volatile( - "mov x1, #4 \n" - "ld1 {v0.8b}, [%[inptr0]], x1 \n" // v0=a0a1a2a3a4a5a6a7 - "ld1 {v1.8b}, [%[inptr1]], x1 \n" // v1=b0b1b2b3b4b5b6b7 - "ld1 {v2.8b}, [%[inptr2]], x1 \n" // v2=c0c1c2c3c4c5c6c7 - "ld1 {v3.8b}, [%[inptr3]], x1 \n" // v3=d0d1d2d3d4d5d6d7 - - "ld1 {v4.8b}, [%[inptr4]], x1 \n" // v0=e0e1a2a3a4a5a6a7 - "ld1 {v5.8b}, [%[inptr5]], x1 \n" // v1=f0f1b2b3b4b5b6b7 - "ld1 {v6.8b}, [%[inptr6]], x1 \n" // v2=g0g1c2c3c4c5c6c7 - "ld1 {v7.8b}, [%[inptr7]], x1 \n" // 
v3=h0h1d2d3d4d5d6d7 - - "trn1 v8.2s, v0.2s, v1.2s \n" // v0=a0a1a2a3b0b1b2b3
- "trn1 v10.2s, v2.2s, v3.2s \n" // v0=c0c1c2c3d0d1d2d3 - - "trn1 v12.2s, v4.2s, v5.2s \n" // v0=e0e1e2e3f0f1f2f3
- "trn1 v14.2s, v6.2s, v7.2s \n" // v0=g0g1g2g3h0h1h2h3 - - "st1 {v8.2s}, [%[outptr]], #8\n"
- "st1 {v10.2s}, [%[outptr]], #8\n" - - "st1 {v12.2s}, [%[outptr]], #8\n"
- "st1 {v14.2s}, [%[outptr]], #8\n" - - :[inptr0] "+r"(inptr_row[0]), [inptr1] "+r"(inptr_row[1]),
- [inptr2] "+r"(inptr_row[2]), [inptr3] "+r"(inptr_row[3]), - [inptr4] "+r"(inptr_row[4]), [inptr5] "+r"(inptr_row[5]),
- [inptr6] "+r"(inptr_row[6]), [inptr7] "+r"(inptr_row[7]), - [outptr] "+r"(outptr) - :
- : "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
- "v13", "v14", "v15", "v16", "cc", "memory" - ); - x -= 4; - } - if (x > 0) { - for (int i = 0; i < 8; i++) {
- for (int j = x; j > 0; j--) { - *outptr++ = *inptr_row[i]++; - } - for (int j = 0; j < 4 - remain; j++) {
- *outptr++ = 0; - } - } - } - } -} - -void prepackA_m8k4_trans_int8(int8_t* out, - const int8_t* in,
- const int ldin, - const int m0, - const int mmax, - const int k0, - const int kmax) { - int8_t *outptr = out;
- const int8_t *inptr = in + k0 * ldin + m0; - int x_len = mmax - m0; - int y_len = kmax - k0;
- int right_remain = x_len % 8; - int kup = ROUNDUP(y_len, KBLOCK_INT8); - - int stride_out = 8 * kup;
- int8_t zerobuff[x_len]; //NOLINT - memset(zerobuff, 0, sizeof(int8_t) * x_len);
- -#pragma omp parallel for - for (int y = 0; y < y_len; y += 4) { - const int8_t* inptr0 = inptr + y * ldin;
- const int8_t* inptr1 = inptr0 + ldin; - const int8_t* inptr2 = inptr1 + ldin; - const int8_t* inptr3 = inptr2 + ldin;
- - if (y + 4 > y_len) { - switch (y + 4 - y_len) { - case 3: - inptr1 = zerobuff; - case 2: - inptr2 = zerobuff;
- case 1: - inptr3 = zerobuff; - default: - break; - } - } - asm volatile( - "prfm pldl1keep, [%[ptr0]] \n"
- "prfm pldl1keep, [%[ptr0], #64] \n" - "prfm pldl1keep, [%[ptr1]] \n" - "prfm pldl1keep, [%[ptr1], #64] \n"
- "prfm pldl1keep, [%[ptr2]] \n" - "prfm pldl1keep, [%[ptr2], #64] \n" - "prfm pldl1keep, [%[ptr3]] \n"
- "prfm pldl1keep, [%[ptr3], #64] \n" - : - :[ptr0] "r"(inptr0),[ptr1] "r"(inptr1),[ptr2] "r"(inptr2),
- [ptr3] "r"(inptr3) - :"memory" - ); - - int8_t *outptr_row = outptr + y * 8; - int x = 0;
- for (; x < x_len - 7; x += 8) { - int8_t *out0 = outptr_row; - asm volatile (
- "ld1 {v0.8b}, [%[inptr0]], #8 \n" // v0 = a0a1a2a3a4a5a6a7 - "ld1 {v1.8b}, [%[inptr1]], #8 \n" // v0 = b0b1b2b3b4b5b6b7
- "ld1 {v2.8b}, [%[inptr2]], #8 \n" // v0 = c0c1c2c3c4c5c6c7 - "ld1 {v3.8b}, [%[inptr3]], #8 \n" // v0 = d0d1d2d3d4d5d6d7
- - "trn1 v4.8b, v0.8b, v1.8b \n" // v4 = a0b0a2b2a4b4a6b6 - "trn2 v5.8b, v0.8b, v1.8b \n" // v4 = a1b1a3b3a5b5a7b7
- "trn1 v6.8b, v2.8b, v3.8b \n" // v4 = c0d0c2d2a4b4a6b6 - "trn2 v7.8b, v2.8b, v3.8b \n" // v4 = c1d1c3d3a5b5a7b7
- - "trn1 v0.4h, v4.4h, v6.4h \n" // v4 = a0b0c0d0a4b4c4d4 - "trn2 v1.4h, v4.4h, v6.4h \n" // v4 = a2b2c2d2a6b6c6d6
- "trn1 v2.4h, v5.4h, v7.4h \n" // v4 = a1b1c1d1a5b5c5d5 - "trn2 v3.4h, v5.4h, v7.4h \n" // v4 = a3b3c3d3a7b7c7d7
- - "trn1 v4.2s, v0.2s, v2.2s \n" //v4 =a0b0c0d0a1b1c1d1 - "trn2 v5.2s, v0.2s, v2.2s \n" //v4 =a4b4c4d4a5b5c5d5
- "trn1 v6.2s, v1.2s, v3.2s \n" //v4 =a2b2c2d2a3b3c3d3 - "trn2 v7.2s, v1.2s, v3.2s \n" //v4 =a6b6c6d6a7b7c7d7
- - "st1 {v4.2s}, [%[outr]], #8\n" - "st1 {v6.2s}, [%[outr]], #8\n" - "st1 {v5.2s}, [%[outr]], #8\n"
- "st1 {v7.2s}, [%[outr]], #8\n" - : [inptr0] "+r"(inptr0), [inptr1] 
"+r"(inptr1), - [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3), - [outr] "+r"(out0) - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", - "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", - "cc", "memory" - ); - outptr_row += stride_out; - } - if (right_remain > 0) { - int8_t *out0 = outptr_row; - for (; x < x_len; x++) { - *out0++ = *inptr0++; - *out0++ = *inptr1++; - *out0++ = *inptr2++; - *out0++ = *inptr3++; - } - for (int i = 0; i < 8 - right_remain; i++) { - *out0++ = 0; - *out0++ = 0; - *out0++ = 0; - *out0++ = 0; - } - } - } -} - -void packb_sdot_int8(int8_t* out, - const int8_t* in, - const int ldin, - const int k0, - const int kmax, - const int n0, - const int nmax) { - int y_len = kmax - k0; - int x_len = nmax - n0; - int kup = ROUNDUP(y_len, KBLOCK_INT8); // 4k - int8_t zerobuff[x_len]; //NOLINT - memset(zerobuff, 0, sizeof(int8_t) * x_len); - int8_t *outptr = out; - const int8_t *inptr = in + k0 * ldin + n0; - - int stride_out = 12 * kup; - // int stride_y = 48; - int remain = x_len % 12; - - // data B is not transposed, transpose B to k * 12 -#pragma omp parallel for - for (int y = 0; y < y_len; y += 4) { - // cope with row index exceed real size, set to zero - const int8_t *inptr0 = inptr + y * ldin; - const int8_t *inptr1 = inptr0 + ldin; - const int8_t *inptr2 = inptr1 + ldin; - const int8_t *inptr3 = inptr2 + ldin; - if (y + 4 > y_len) { - switch (y + 4 - y_len) { - case 3: - inptr1 = zerobuff; - case 2: - inptr2 = zerobuff; - case 1: - inptr3 = zerobuff; - default: - break; - } - } - asm volatile( - "prfm pldl1keep, [%[inptr0]] \n" - "prfm pldl1keep, [%[inptr0], #64] \n" - "prfm pldl1keep, [%[inptr1]] \n" - "prfm pldl1keep, [%[inptr1], #64] \n" - "prfm pldl1keep, [%[inptr2]] \n" - "prfm pldl1keep, [%[inptr2], #64] \n" - "prfm pldl1keep, [%[inptr3]] \n" - "prfm pldl1keep, [%[inptr3], #64] \n" - : - :[inptr0] "r"(inptr0), [inptr1] "r"(inptr1), - [inptr2] "r"(inptr2), [inptr3] "r"(inptr3) - :"memory" - ); - int8_t* outptr_row = outptr + y * 12; - int x = 0; - for (; x < x_len - 11; x += 12) { - int8_t *out0 = outptr_row; - asm volatile ( - "mov x1, #4 \n" - "ld1 {v0.8b}, [%[inptr0]], #8 \n" // v0 = a0a1a2a3a4a5a6a7 - "ld1 {v1.8b}, [%[inptr1]], #8 \n" // v0 = b0b1b2b3b4b5b6b7 - "ld1 {v2.8b}, [%[inptr2]], #8 \n" // v0 = c0c1c2c3c4c5c6c7 - "ld1 {v3.8b}, [%[inptr3]], #8 \n" // v0 = d0d1d2d3d4d5d6d7 - - "ld1 {v8.8b}, [%[inptr0]] \n" // v0 = a8a9a10a11 - "ld1 {v9.8b}, [%[inptr1]] \n" // v0 = b8b9b10b11 - "ld1 {v10.8b}, [%[inptr2]] \n" // v0 = c8c9c10c11 - "ld1 {v11.8b}, [%[inptr3]] \n" // v0 = d8d9d10d11 - - "trn1 v4.8b, v0.8b, v1.8b \n" // v4 = a0b0a2b2a4b4a6b6 - "trn2 v5.8b, v0.8b, v1.8b \n" // v4 = a1b1a3b3a5b5a7b7 - "trn1 v6.8b, v2.8b, v3.8b \n" // v4 = c0d0c2d2a4b4a6b6 - "trn2 v7.8b, v2.8b, v3.8b \n" // v4 = c1d1c3d3a5b5a7b7 - - "trn1 v12.8b, v8.8b, v9.8b \n" // v4 = a8b8a10b10a4b4a6b6 - "trn2 v13.8b, v8.8b, v9.8b \n" // v4 = a9b9a11b11a5b5a7b7 - "trn1 v14.8b, v10.8b, v11.8b \n" // v4 = c8d8c10d10a4b4a6b6 - "trn2 v15.8b, v10.8b, v11.8b \n" // v4 = c9d9c11d11a5b5a7b7 - - "trn1 v0.4h, v4.4h, v6.4h \n" // v4 = a0b0c0d0a4b4c4d4 - "trn2 v1.4h, v4.4h, v6.4h \n" // v4 = a2b2c2d2a6b6c6d6 - "trn1 v2.4h, v5.4h, v7.4h \n" // v4 = a1b1c1d1a5b5c5d5 - "trn2 v3.4h, v5.4h, v7.4h \n" // v4 = a3b3c3d3a7b7c7d7 - - "trn1 v8.4h, v12.4h, v14.4h \n" // v4 = a8b8c8d8 - "trn2 v9.4h, v12.4h, v14.4h \n" // v4 = a10b10c10d10 - "trn1 v10.4h, v13.4h, v15.4h \n" // v4 = a9b9c9d9 - "trn2 v11.4h, v13.4h, v15.4h \n" // v4 = a11b11c11d11 - - "trn1 v4.2s, v0.2s, v2.2s \n" //v4 =a0b0c0d0a1b1c1d1 - 
"trn2 v5.2s, v0.2s, v2.2s \n" //v4 =a4b4c4d4a5b5c5d5 - "trn1 v6.2s, v1.2s, v3.2s \n" //v4 =a2b2c2d2a3b3c3d3 - "trn2 v7.2s, v1.2s, v3.2s \n" //v4 =a6b6c6d6a7b7c7d7 - - "trn1 v0.2s, v8.2s, v10.2s \n" //v4 =a8b8c8d8a9b9c9d9 - "trn1 v1.2s, v9.2s, v11.2s \n" //v4 =a10b10c10d10a11b11c11d11 - - "st1 {v4.2s}, [%[outr]], #8\n" - "st1 {v6.2s}, [%[outr]], #8\n" - "add %[inptr0], %[inptr0], #4\n" - "add %[inptr1], %[inptr1], #4\n" - "st1 {v5.2s}, [%[outr]], #8\n" - "st1 {v7.2s}, [%[outr]], #8\n" - "add %[inptr2], %[inptr2], #4\n" - "add %[inptr3], %[inptr3], #4\n" - "st1 {v0.2s}, [%[outr]], #8\n" - "st1 {v1.2s}, [%[outr]], #8\n" - : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), - [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3), - [outr] "+r"(out0) - : - : "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", - "v16", "cc", "memory" - ); - outptr_row += stride_out; - } - int8_t* out0 = outptr_row; // outptr + stride_out + y * remain; - for (; x < x_len; x++) { - *out0++ = *inptr0++; - *out0++ = *inptr1++; - *out0++ = *inptr2++; - *out0++ = *inptr3++; - } - for (int i = 0; i < 12 - remain; i++) { - *out0++ = 0; - *out0++ = 0; - *out0++ = 0; - *out0++ = 0; - } - } -} - -void packb_sdot_trans_int8(int8_t* out, - const int8_t* in, - const int ldin, - const int k0, - const int kmax, - const int n0, - const int nmax) { - int8_t *outptr = out; - const int8_t *inptr = in + n0 * ldin + k0; - int y_len = nmax - n0; - int x_len = kmax - k0; - - int kup = ROUNDUP(x_len, KBLOCK_INT8); // 4 - - int8_t zerobuff[kup]; //NOLINT - memset(zerobuff, 0, sizeof(int8_t) * kup); - - int stride_y = 48; - int stride_out = kup; - - int remain = x_len % 8; - -#pragma omp parallel for - for (int y = 0; y < y_len; y += 12) { - const int8_t *inptr_row[12]; - inptr_row[0] = inptr + y * ldin; - for (int i = 1; i < 12; i++) { - inptr_row[i] = inptr_row[i - 1] + ldin; - } - if (y + 12 > y_len) { - for (int i = y + 12 - y_len; i > 0; i--) { - // inptr_row[12 - i] = zero_ptr[12 - i - 1]; - inptr_row[12 - i] = zerobuff; - } - } - asm volatile( - "prfm pldl1keep, [%[ptr0]] \n" - "prfm pldl1keep, [%[ptr1]] \n" - "prfm pldl1keep, [%[ptr2]] \n" - "prfm pldl1keep, [%[ptr3]] \n" - "prfm pldl1keep, [%[ptr4]] \n" - "prfm pldl1keep, [%[ptr5]] \n" - "prfm pldl1keep, [%[ptr6]] \n" - "prfm pldl1keep, [%[ptr7]] \n" - "prfm pldl1keep, [%[ptr8]] \n" - "prfm pldl1keep, [%[ptr9]] \n" - "prfm pldl1keep, [%[ptr10]] \n" - "prfm pldl1keep, [%[ptr11]] \n" - : - :[ptr0] "r"(inptr_row[0]), [ptr1] "r"(inptr_row[1]), - [ptr2] "r"(inptr_row[2]), [ptr3] "r"(inptr_row[3]), - [ptr4] "r"(inptr_row[4]), [ptr5] "r"(inptr_row[5]), - [ptr6] "r"(inptr_row[6]), [ptr7] "r"(inptr_row[7]), - [ptr8] "r"(inptr_row[8]), [ptr9] "r"(inptr_row[9]), - [ptr10] "r"(inptr_row[10]), [ptr11] "r"(inptr_row[11]) - :"memory" - ); - int right_remain = remain; - int8_t *outptr_row = outptr + y * stride_out; - for (int x = 0; x < x_len - 7; x += 8) { - int8_t *out0 = outptr_row; - int8_t *out1 = out0 + stride_y; - asm volatile( - "ld1 {v0.8b}, [%[inptr0]], #8 \n" // q0=A0A1A2A3A4A5A6A7 - "ld1 {v1.8b}, [%[inptr1]], #8 \n" // q0=B0b1b2b3A4A5A6A7 - "ld1 {v2.8b}, [%[inptr2]], #8 \n" // q0=c0c1c2c3A4A5A6A7 - "ld1 {v3.8b}, [%[inptr3]], #8 \n" // q0=d0d1d2d3A4A5A6A7 - - "ld1 {v4.8b}, [%[inptr4]], #8 \n" // q0=A0A1A2A3A4A5A6A7 - "ld1 {v5.8b}, [%[inptr5]], #8 \n" // q0=B0b1b2b3A4A5A6A7 - "ld1 {v6.8b}, [%[inptr6]], #8 \n" // q0=c0c1c2c3A4A5A6A7 - "ld1 {v7.8b}, [%[inptr7]], #8 \n" // q0=d0d1d2d3A4A5A6A7 - - "trn1 v8.2s, v0.2s, v1.2s \n" 
//v0=a0a1a2a3'b0b1b2b3 -00 01 - "trn2 v12.2s, v0.2s, v1.2s \n" //v0=a4a5a6a7'b4b5b6b7 - 10 11 - "trn1 v9.2s, v2.2s, v3.2s \n" //v0=c0c1a2a3'd0b1b2b3 -02 03 - "trn2 v13.2s, v2.2s, v3.2s \n" //v0=c4a5a6a7'c4b5b6b7 - 12 13 - - "ld1 {v0.8b}, [%[inptr8]], #8 \n" // q0=A0A1A2A3A4A5A6A7 - "ld1 {v1.8b}, [%[inptr9]], #8 \n" // q0=B0b1b2b3A4A5A6A7 - "ld1 {v2.8b}, [%[inptr10]], #8 \n" // q0=c0c1c2c3A4A5A6A7 - "ld1 {v3.8b}, [%[inptr11]], #8 \n" // q0=d0d1d2d3A4A5A6A7 - - "st1 {v8.8b}, [%[outptr_row0]], #8 \n" - "st1 {v12.8b}, [%[outptr_row1]], #8 \n" - "st1 {v9.8b}, [%[outptr_row0]], #8 \n" - "st1 {v13.8b}, [%[outptr_row1]], #8 \n" - - "trn1 v10.2s, v4.2s, v5.2s \n" //v0=a0b0a0b0'a4b4a4b4 -04 05 - "trn2 v14.2s, v4.2s, v5.2s \n" //v0=a2b2a2b2'a6b6a6b6 -14 15 - "trn1 v11.2s, v6.2s, v7.2s \n" //v0=a0b0a0b0'a4b4a4b4 -06 07 - "trn2 v15.2s, v6.2s, v7.2s \n" //v0=a2b2a2b2'a6b6a6b6 -16 17 - - "trn1 v4.2s, v0.2s, v1.2s \n" //v0=a0b0a0b0'a4b4a4b4 -08 09 - "trn2 v5.2s, v0.2s, v1.2s \n" //v0=a2b2a2b2'a6b6a6b6 -18 19 - "trn1 v6.2s, v2.2s, v3.2s \n" //v0=a0b0a0b0'a4b4a4b4 -010 011 - "trn2 v7.2s, v2.2s, v3.2s \n" //v0=a2b2a2b2'a6b6a6b6 -110 111 - - "st1 {v10.8b}, [%[outptr_row0]], #8 \n" - "st1 {v14.8b}, [%[outptr_row1]], #8 \n" - "st1 {v11.8b}, [%[outptr_row0]], #8 \n" - "st1 {v15.8b}, [%[outptr_row1]], #8 \n" - - "st1 {v4.8b}, [%[outptr_row0]], #8 \n" - "st1 {v5.8b}, [%[outptr_row1]], #8 \n" - "st1 {v6.8b}, [%[outptr_row0]], #8 \n" - "st1 {v7.8b}, [%[outptr_row1]], #8 \n" - : [inptr0] "+r"(inptr_row[0]), [inptr1] "+r"(inptr_row[1]), - [inptr2] "+r"(inptr_row[2]), [inptr3] "+r"(inptr_row[3]), - [inptr4] "+r"(inptr_row[4]), [inptr5] "+r"(inptr_row[5]), - [inptr6] "+r"(inptr_row[6]), [inptr7] "+r"(inptr_row[7]), - [inptr8] "+r"(inptr_row[8]), [inptr9] "+r"(inptr_row[9]), - [inptr10] "+r"(inptr_row[10]), [inptr11] "+r"(inptr_row[11]), - [outptr_row0] "+r"(out0), [outptr_row1] "+r"(out1) - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", - "v10", "v11", "v12", "v13", "v14", "v15", "v16", "cc", "memory" - ); - outptr_row += 96; - } - int8_t *out0 = outptr_row; - if (right_remain >= 4) { - asm volatile( - "mov x1, #4 \n" - "ld1 {v0.8b}, [%[inptr0]], x1 \n" // q0=A0A1A2A3A4A5A6A7 - "ld1 {v1.8b}, [%[inptr1]], x1 \n" // q0=B0b1b2b3A4A5A6A7 - "ld1 {v2.8b}, [%[inptr2]], x1 \n" // q0=c0c1c2c3A4A5A6A7 - "ld1 {v3.8b}, [%[inptr3]], x1 \n" // q0=d0d1d2d3A4A5A6A7 - - "ld1 {v4.8b}, [%[inptr4]], x1 \n" // q0=A0A1A2A3A4A5A6A7 - "ld1 {v5.8b}, [%[inptr5]], x1 \n" // q0=B0b1b2b3A4A5A6A7 - "ld1 {v6.8b}, [%[inptr6]], x1 \n" // q0=c0c1c2c3A4A5A6A7 - "ld1 {v7.8b}, [%[inptr7]], x1 \n" // q0=d0d1d2d3A4A5A6A7 - - "trn1 v8.2s, v0.2s, v1.2s \n" //v0=a0a1a2a3'b0b1b2b3 -00 01 - "trn1 v9.2s, v2.2s, v3.2s \n" //v0=c0c1a2a3'd0b1b2b3 -02 03 - - "ld1 {v12.8b}, [%[inptr8]], x1 \n" // q0=A0A1A2A3A4A5A6A7 - "ld1 {v13.8b}, [%[inptr9]], x1 \n" // q0=B0b1b2b3A4A5A6A7 - "ld1 {v14.8b}, [%[inptr10]], x1 \n" // q0=c0c1c2c3A4A5A6A7 - "ld1 {v15.8b}, [%[inptr11]], x1 \n" // q0=d0d1d2d3A4A5A6A7 - - "trn1 v10.2s, v4.2s, v5.2s \n" //v0=a0b0a0b0'a4b4a4b4 -04 05 - "trn1 v11.2s, v6.2s, v7.2s \n" //v0=a0b0a0b0'a4b4a4b4 -06 07 - - "trn1 v4.2s, v12.2s, v13.2s \n" //v0=a0b0a0b0'a4b4a4b4 -08 09 - "trn1 v6.2s, v14.2s, v15.2s \n" //v0=a0b0a0b0'a4b4a4b4 -010 011 - - "st1 {v8.8b}, [%[outptr_row0]], #8 \n" - "st1 {v9.8b}, [%[outptr_row0]], #8 \n" - "st1 {v10.8b}, [%[outptr_row0]], #8 \n" - "st1 {v11.8b}, [%[outptr_row0]], #8 \n" - "st1 {v4.8b}, [%[outptr_row0]], #8 \n" - "st1 {v6.8b}, [%[outptr_row0]], #8 \n" - : [inptr0] "+r"(inptr_row[0]), [inptr1] 
"+r"(inptr_row[1]), - [inptr2] "+r"(inptr_row[2]), [inptr3] "+r"(inptr_row[3]), - [inptr4] "+r"(inptr_row[4]), [inptr5] "+r"(inptr_row[5]), - [inptr6] "+r"(inptr_row[6]), [inptr7] "+r"(inptr_row[7]), - [inptr8] "+r"(inptr_row[8]), [inptr9] "+r"(inptr_row[9]), - [inptr10] "+r"(inptr_row[10]), [inptr11] "+r"(inptr_row[11]), \ - [outptr_row0] "+r"(out0) - : - : "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", - "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "cc", "memory" - ); - right_remain -= 4; - } - if (right_remain > 0) { - for (int i = 0; i < 12; i++) { - for (int x = 0; x < right_remain; x++) { - *out0++ = *inptr_row[i]++; - } - for (int x = 0; x < 4 - right_remain; x++) { - *out0++ = 0; - } - } - } - } -} -#endif //dotprod //NOLINT - -template <> -void gemm_prepack_int8(const int8_t* A_packed, - const int8_t* B, - const int* bias, - float32_t* C, - int M, - int N, - int K, - bool is_bias, - bool is_relu, - bool is_transB, - const float* scale, - ARMContext* ctx) { -#if defined(__aarch64__) && defined(WITH_ARM_DOTPROD) - if (ctx->has_dot()) { - gemm_prepack_sdot_int8(A_packed, - B, bias, C, M, N, K, is_bias, is_relu, - is_transB, scale, ctx); - } else { - gemm_prepack_oth_int8(A_packed, B, - bias, C, M, N, K, is_bias, is_relu, - is_transB, scale, ctx); - } -#else - gemm_prepack_oth_int8(A_packed, B, - bias, C, M, N, K, is_bias, is_relu, - is_transB, scale, ctx); -#endif -} - -template <> -void gemm_prepack_int8(const int8_t* A_packed, - const int8_t* B, - const int* bias, - int8_t* C, - int M, - int N, - int K, - bool is_bias, - bool is_relu, - bool is_transB, - const float* scale, - ARMContext* ctx) { -#if defined(__aarch64__) && defined(WITH_ARM_DOTPROD) - if (ctx->has_dot()) { - gemm_prepack_sdot_int8(A_packed, B, bias, - C, M, N, K, is_bias, is_relu, - is_transB, scale, ctx); - } else { - gemm_prepack_oth_int8(A_packed, B, bias, - C, M, N, K, is_bias, is_relu, - is_transB, scale, ctx); - } -#else - gemm_prepack_oth_int8(A_packed, B, bias, - C, M, N, K, is_bias, is_relu, - is_transB, scale, ctx); -#endif -} - -template <> -void gemm_prepack_int8(const int8_t* A_packed, - const int8_t* B, - const int* bias, - int32_t* C, - int M, - int N, - int K, - bool is_bias, - bool is_relu, - bool is_transB, - const float* scale, - ARMContext* ctx) { -#if defined(__aarch64__) && defined(WITH_ARM_DOTPROD) - if (ctx->has_dot()) { - gemm_prepack_sdot_int8(A_packed, B, - bias, C, M, N, K, is_bias, is_relu, - is_transB, scale, ctx); - } else { - gemm_prepack_oth_int8(A_packed, B, - bias, C, M, N, K, is_bias, is_relu, - is_transB, scale, ctx); - } -#else - gemm_prepack_oth_int8(A_packed, B, bias, - C, M, N, K, is_bias, is_relu, is_transB, scale, ctx); -#endif -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/gemm_prepacked_int8.h b/lite/backends/arm/math/gemm_prepacked_int8.h deleted file mode 100644 index 7f54eea398..0000000000 --- a/lite/backends/arm/math/gemm_prepacked_int8.h +++ /dev/null @@ -1,94 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// 
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License. - -#pragma once -#include <cmath> -#include "lite/core/context.h"
-#include "lite/core/device_info.h" -#include "lite/core/tensor.h" - -namespace paddle { -namespace lite {
-namespace arm { -namespace math { - -const int KBLOCK_INT8 = 4; -#ifdef __aarch64__ -// for int8 gemm
-// const int HBLOCK = 4; -// const int NBLOCK = 16; -const int MBLOCK_INT8_OTH = 4; -const int NBLOCK_INT8_OTH = 16;
- -const int MBLOCK_INT8_DOT = 8; -const int NBLOCK_INT8_DOT = 12;
- -inline int get_hblock_int8(const ARMContext* ctx) { -#ifdef WITH_ARM_DOTPROD - if (ctx->has_dot()) {
- return MBLOCK_INT8_DOT; - } else { - return MBLOCK_INT8_OTH; - } -#else - return MBLOCK_INT8_OTH; -#endif -}
-#else -// const int HBLOCK = 4; -// const int WBLOCK = 8; -const int MBLOCK_INT8_OTH = 4; -const int NBLOCK_INT8_OTH = 8;
- -inline int get_hblock_int8(const ARMContext* ctx) { return 4; } -#endif // __aarch64__
- -void prepackA_int8(void* out, - const void* in, - int ldin, - int m0, - int mmax, - int k0, - int kmax,
- bool is_trans, - ARMContext* ctx); - -void prepackA_int8(TensorLite* tout, - const TensorLite& tin,
- int m, - int k, - int group, - bool is_trans, - ARMContext* ctx);
- -template <typename dtype> -void gemm_prepack_int8(const int8_t* A_packed, - const int8_t* B, - const int* bias,
- dtype* C, - int M, - int N, - int K, - bool is_bias, - bool is_relu, - bool is_transB, - const float* scale,
- ARMContext* ctx); - -#define ROUNDUP(a, b) ((((a) + (b)-1) / (b)) * (b))
- -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle
diff --git a/lite/backends/arm/math/gemv_arm_int8.cc b/lite/backends/arm/math/gemv_arm_int8.cc deleted file mode 100644 index dff3024ba4..0000000000 --- a/lite/backends/arm/math/gemv_arm_int8.cc +++ /dev/null @@ -1,480 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// 
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License. 
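// Scalar reference for the int8 gemv kernels in this file (a sketch under
// the same conventions: row-major A, per-row scale; gemv_int8_ref is a
// hypothetical name). The NEON code below computes the same dot products by
// widening int8 x int8 to int16 (smull/smlal2) and pair-accumulating into
// int32 lanes (sadalp) before reducing.
#include <algorithm>
#include <cmath>
#include <cstdint>
static void gemv_int8_ref(const int8_t* A, const int8_t* x, int8_t* y, int M,
                          int N, const float* scale, const int* bias,
                          bool is_relu) {
  for (int m = 0; m < M; ++m) {
    int32_t acc = bias ? bias[m] : 0;  // bias folded into the accumulator
    for (int n = 0; n < N; ++n) {
      acc += static_cast<int32_t>(A[m * N + n]) * x[n];
    }
    if (is_relu && acc < 0) acc = 0;   // ReLU on the int32 accumulator
    float f = roundf(acc * scale[m]);  // rescale, round to nearest
    y[m] = static_cast<int8_t>(std::max(-128.f, std::min(127.f, f)));
  }
}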
- -#include "lite/backends/arm/math/gemv_arm_int8.h" -#include -#include "lite/backends/arm/math/saturate.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template -inline void write_gemv_out(const int* in, dtype* out, const float* scale); - -template <> -inline void write_gemv_out(const int* in, int* out, const float* scale) { - out[0] = in[0]; -} - -template <> -inline void write_gemv_out(const int* in, float* out, const float* scale) { - out[0] = in[0] * scale[0]; -} - -template <> -inline void write_gemv_out(const int* in, - signed char* out, - const float* scale) { - out[0] = saturate_cast(roundf(in[0] * scale[0])); -} - -template -bool gemv_int8(const int8_t* A, - const int8_t* x, - dtype* y, - bool transA, - int M, - int N, - const float* scale, - bool is_bias, - const int* bias, - bool is_relu) { - if (transA) { - LOG(ERROR) << "ERROR: sgemv, transA is not supported now"; - return false; - } - dtype* data_out = y; - const int8_t* data_in = x; - const int8_t* weights_ptr = A; - int cnt = N >> 4; - int tail = N & 15; - int flag_bias = is_bias ? 1 : 0; - -#ifdef __aarch64__ - int out_cnt = M >> 3; -#pragma omp parallel for - for (int j = 0; j < out_cnt; j++) { - int out_idx = j * 8; - dtype* out_ptr = data_out + out_idx; - const float* scale_ptr = scale + out_idx; - int ptr_out[8] = {0, 0, 0, 0, 0, 0, 0, 0}; - const int8_t* ptr_in = data_in; - const int8_t* ptr_w0 = weights_ptr + (N * out_idx); - const int8_t* ptr_w1 = ptr_w0 + N; - const int8_t* ptr_w2 = ptr_w1 + N; - const int8_t* ptr_w3 = ptr_w2 + N; - const int8_t* ptr_w4 = ptr_w3 + N; - const int8_t* ptr_w5 = ptr_w4 + N; - const int8_t* ptr_w6 = ptr_w5 + N; - const int8_t* ptr_w7 = ptr_w6 + N; - const int* bias_ptr = is_bias ? (bias + out_idx) : nullptr; - int cnt_loop = cnt; - asm volatile( - "prfm pldl1keep, [%[in]] \n" /* preload din */ - "prfm pldl1keep, [%[w0]] \n" /* preload w0 */ - "prfm pldl1keep, [%[w1]] \n" /* preload w1 */ - "prfm pldl1keep, [%[w2]] \n" /* preload w2 */ - "prfm pldl1keep, [%[w3]] \n" /* preload w3 */ - "prfm pldl1keep, [%[w4]] \n" /* preload w4 */ - "prfm pldl1keep, [%[w5]] \n" /* preload w5 */ - "prfm pldl1keep, [%[w6]] \n" /* preload w6 */ - "prfm pldl1keep, [%[w7]] \n" /* preload w7 */ - "movi v0.4s, #0 \n" /* set out0 to 0 */ - "movi v1.4s, #0 \n" /* set out1 to 0 */ - "movi v2.4s, #0 \n" /* set out2 to 0 */ - "movi v3.4s, #0 \n" /* set out3 to 0 */ - "movi v4.4s, #0 \n" /* set out4 to 0 */ - "movi v5.4s, #0 \n" /* set out5 to 0 */ - "movi v6.4s, #0 \n" /* set out6 to 0 */ - "movi v7.4s, #0 \n" /* set out7 to 0 */ - /* check main loop */ - "cmp %w[cnt], #1 \n" /* check whether has main loop */ - "blt 2f \n" /* jump to tail */ - /* main loop */ - "1: \n" /* main loop */ - "ldr q8, [%[in]], #16 \n" /* load input, 16 int8 */ - "ldr q9, [%[w0]], #16 \n" /* load w0, 16 int8 */ - "ldr q10, [%[w1]], #16 \n" /* load w1, 16 int8 */ - "ldr q11, [%[w2]], #16 \n" /* load w2, 16 int8 */ - "ldr q12, [%[w3]], #16 \n" /* load w3, 16 int8 */ - "ldr q13, [%[w4]], #16 \n" /* load w4, 16 int8 */ - "ldr q14, [%[w5]], #16 \n" /* load w5, 16 int8 */ - "ldr q15, [%[w6]], #16 \n" /* load w6, 16 int8 */ - "ldr q16, [%[w7]], #16 \n" /* load w7, 16 int8 */ - /* mul, lower 8 int8 * int8 = int16 */ - "smull v18.8h, v8.8b, v9.8b \n" /* mul in * w0, low, 8 int8 */ - "smull v19.8h, v8.8b, v10.8b\n" /* mul in * w1, low, 8 int8 */ - "smull v20.8h, v8.8b, v11.8b\n" /* mul in * w2, low, 8 int8 */ - "smull v21.8h, v8.8b, v12.8b\n" /* mul in * w3, low, 8 int8 */ - "smull v22.8h, v8.8b, v13.8b\n" /* mul in 
* w4, low, 8 int8 */ - "smull v23.8h, v8.8b, v14.8b\n" /* mul in * w5, low, 8 int8 */ - "smull v24.8h, v8.8b, v15.8b\n" /* mul in * w6, low, 8 int8 */ - "smull v25.8h, v8.8b, v16.8b\n" /* mul in * w7, low, 8 int8 */ - /* mul, higher 8 int8 * int8 + int16 = int16 */ - "smlal2 v18.8h,v8.16b,v9.16b \n" /* mul in * w0, high, 8 int8 */ - "smlal2 v19.8h,v8.16b,v10.16b\n" /* mul in * w1, high, 8 int8 */ - "smlal2 v20.8h,v8.16b,v11.16b\n" /* mul in * w2, high, 8 int8 */ - "smlal2 v21.8h,v8.16b,v12.16b\n" /* mul in * w2, high, 8 int8 */ - "smlal2 v22.8h,v8.16b,v13.16b\n" /* mul in * w2, high, 8 int8 */ - "smlal2 v23.8h,v8.16b,v14.16b\n" /* mul in * w2, high, 8 int8 */ - "smlal2 v24.8h,v8.16b,v15.16b\n" /* mul in * w2, high, 8 int8 */ - "smlal2 v25.8h,v8.16b,v16.16b\n" /* mul in * w2, high, 8 int8 */ - "subs %w[cnt], %w[cnt], #1 \n" /* sub main loop count */ - /* add int16 to int32 */ - "sadalp v0.4s, v18.8h \n" /* pair acc, 8 int16 -> 4 int32 */ - "sadalp v1.4s, v19.8h \n" /* pair acc, 8 int16 -> 4 int32 */ - "sadalp v2.4s, v20.8h \n" /* pair acc, 8 int16 -> 4 int32 */ - "sadalp v3.4s, v21.8h \n" /* pair acc, 8 int16 -> 4 int32 */ - "sadalp v4.4s, v22.8h \n" /* pair acc, 8 int16 -> 4 int32 */ - "sadalp v5.4s, v23.8h \n" /* pair acc, 8 int16 -> 4 int32 */ - "sadalp v6.4s, v24.8h \n" /* pair acc, 8 int16 -> 4 int32 */ - "sadalp v7.4s, v25.8h \n" /* pair acc, 8 int16 -> 4 int32 */ - "bne 1b \n" /* jump to main loop */ - /* pair add to final result */ - "2: \n" /* reduce to scale */ - "addp v8.4s , v0.4s , v1.4s \n" /* pair add to 4 int32*/ - "addp v9.4s , v2.4s , v3.4s \n" /* pair add to 4 int32*/ - "addp v10.4s, v4.4s , v5.4s \n" /* pair add to 4 int32*/ - "addp v11.4s, v6.4s , v7.4s \n" /* pair add to 4 int32*/ - - "addp v12.4s, v8.4s , v9.4s \n" /* pair add to 4 int32*/ - "addp v13.4s, v10.4s, v11.4s \n" /* pair add to 4 int32*/ - - "cmp %w[bias], #1 \n" /* check whether has bias */ - "blt 0f \n" /* jump to tail */ - "ldp q8, q9, [%[bias_ptr]]\n" /* load bias to q8, q9*/ - "add v12.4s, v12.4s, v8.4s \n" /* add bias */ - "add v13.4s, v13.4s, v9.4s \n" /* add bias */ - "0: \n" /* end of add bias */ - - /* write to output */ - "stp q12, q13, [%[out]] \n" /* save result */ - : [in] "+r"(ptr_in), - [w0] "+r"(ptr_w0), - [w1] "+r"(ptr_w1), - [w2] "+r"(ptr_w2), - [w3] "+r"(ptr_w3), - [w4] "+r"(ptr_w4), - [w5] "+r"(ptr_w5), - [w6] "+r"(ptr_w6), - [w7] "+r"(ptr_w7), - [cnt] "+r"(cnt_loop) - : [out] "r"(ptr_out), [bias_ptr] "r"(bias_ptr), [bias] "r"(flag_bias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - for (int i = 0; i < tail; ++i) { - ptr_out[0] += ptr_in[i] * ptr_w0[i]; - ptr_out[1] += ptr_in[i] * ptr_w1[i]; - ptr_out[2] += ptr_in[i] * ptr_w2[i]; - ptr_out[3] += ptr_in[i] * ptr_w3[i]; - ptr_out[4] += ptr_in[i] * ptr_w4[i]; - ptr_out[5] += ptr_in[i] * ptr_w5[i]; - ptr_out[6] += ptr_in[i] * ptr_w6[i]; - ptr_out[7] += ptr_in[i] * ptr_w7[i]; - } - if (is_relu) { - ptr_out[0] = ptr_out[0] > 0 ? ptr_out[0] : 0; - ptr_out[1] = ptr_out[1] > 0 ? ptr_out[1] : 0; - ptr_out[2] = ptr_out[2] > 0 ? ptr_out[2] : 0; - ptr_out[3] = ptr_out[3] > 0 ? ptr_out[3] : 0; - ptr_out[4] = ptr_out[4] > 0 ? ptr_out[4] : 0; - ptr_out[5] = ptr_out[5] > 0 ? ptr_out[5] : 0; - ptr_out[6] = ptr_out[6] > 0 ? ptr_out[6] : 0; - ptr_out[7] = ptr_out[7] > 0 ? 
ptr_out[7] : 0; - } - - write_gemv_out(ptr_out, out_ptr, scale_ptr); - write_gemv_out(ptr_out + 1, out_ptr + 1, scale_ptr + 1); - write_gemv_out(ptr_out + 2, out_ptr + 2, scale_ptr + 2); - write_gemv_out(ptr_out + 3, out_ptr + 3, scale_ptr + 3); - write_gemv_out(ptr_out + 4, out_ptr + 4, scale_ptr + 4); - write_gemv_out(ptr_out + 5, out_ptr + 5, scale_ptr + 5); - write_gemv_out(ptr_out + 6, out_ptr + 6, scale_ptr + 6); - write_gemv_out(ptr_out + 7, out_ptr + 7, scale_ptr + 7); - } - -//! deal with remains -#pragma omp parallel for - for (int j = out_cnt * 8; j < M; j++) { - // int *ptr_out = data_out + j; - dtype* out_ptr = data_out + j; - const float* scale_ptr = scale + j; - int ptr_out[1] = {0}; - const int8_t* ptr_in = data_in; - const int8_t* ptr_w0 = weights_ptr + (N * j); - int cnt_loop = cnt; - int bias0 = is_bias ? bias[j] : 0; - asm volatile( - "prfm pldl1keep, [%[in]] \n" /* preload din */ - "prfm pldl1keep, [%[w0]] \n" /* preload w0 */ - "movi v0.4s, #0 \n" /* set out0 to 0 */ - "fmov s0, %w[bias0] \n" /* set bias */ - /* check main loop */ - "cmp %w[cnt], #1 \n" /* check whether has main loop */ - "blt 2f \n" /* jump to tail */ - /* main loop */ - "1: \n" /* main loop */ - "ldr q8, [%[in]], #16 \n" /* load input, 16 int8 */ - "ldr q9, [%[w0]], #16 \n" /* load w0, 16 int8 */ - /* mul, lower 8 int8 * int8 = int16 */ - "smull v18.8h, v8.8b, v9.8b \n" /* mul in * w0, low, 8 int8 */ - "subs %w[cnt], %w[cnt], #1 \n" /* sub main loop count */ - /* mul, higher 8 int8 * int8 + int16 = int16 */ - "smlal2 v18.8h,v8.16b,v9.16b \n" /* mul in * w0, high, 8 int8 */ - /* add int16 to int32 */ - "sadalp v0.4s, v18.8h \n" /* pair acc, 8 int16 -> 4 int32 */ - "bne 1b \n" /* jump to main loop */ - /* pair add to final result */ - "2: \n" /* reduce to scale */ - "addv s8, v0.4s \n" /* reduction to out0 */ - /* write to output */ - "str s8, [%[out]] \n" /* save result */ - : [in] "+r"(ptr_in), [w0] "+r"(ptr_w0), [cnt] "+r"(cnt_loop) - : [out] "r"(ptr_out), [bias0] "r"(bias0) - : "cc", "memory", "v0", "v8", "v9", "v18"); - for (int i = 0; i < tail; ++i) { - ptr_out[0] += ptr_in[i] * ptr_w0[i]; - } - if (is_relu) { - ptr_out[0] = ptr_out[0] > 0 ? ptr_out[0] : 0; - } - write_gemv_out(ptr_out, out_ptr, scale_ptr); - } -#else //__aarch64__ // NOLINT - int out_cnt = M >> 2; -#pragma omp parallel for - for (int j = 0; j < out_cnt; j++) { - int out_idx = j * 4; - dtype* out_ptr = data_out + out_idx; - const float* scale_ptr = scale + out_idx; - int ptr_out[4] = {0, 0, 0, 0}; - const int8_t* ptr_in = data_in; - const int8_t* ptr_w0 = weights_ptr + (N * out_idx); - const int8_t* ptr_w1 = ptr_w0 + N; - const int8_t* ptr_w2 = ptr_w1 + N; - const int8_t* ptr_w3 = ptr_w2 + N; - int cnt_loop = cnt; - int bias0 = is_bias ? bias[out_idx] : 0; - int bias1 = is_bias ? bias[out_idx + 1] : 0; - int bias2 = is_bias ? bias[out_idx + 2] : 0; - int bias3 = is_bias ? 
bias[out_idx + 3] : 0; - asm volatile( - "pld [%[in]] @ preload cache line, input\n" - "pld [%[w0]] @ preload cache line, weights r0\n" - "pld [%[w1]] @ preload cache line, weights r1\n" - "pld [%[w2]] @ preload cache line, weights r2\n" - "pld [%[w3]] @ preload cache line, weights r3\n" - "vmov.u32 q0, #0 @ set q0 to 0\n" - "vmov.u32 q1, #0 @ set q1 to 0\n" - "vmov.u32 q2, #0 @ set q2 to 0\n" - "vmov.u32 q3, #0 @ set q3 to 0\n" - "vmov s0, %[bias0] @ set q0 to bias0\n" - "vmov s4, %[bias1] @ set q1 to bias1\n" - "vmov s8, %[bias2] @ set q2 to bias2\n" - "vmov s12,%[bias3] @ set q3 to bias3\n" - // "vld1.32 {d20-d21}, %[bias] @ load bias data" - "cmp %[cnt], #1 @ check whether has main loop\n" - "blt 2f @ jump to pair add\n" - /* main loop */ - "1: @ main loop\n" - "vld1.8 {d8-d9}, [%[in]]! @ load input, q4\n" - "vld1.8 {d12-d13}, [%[w0]]! @ load weights r0, q6\n" - "vld1.8 {d14-d15}, [%[w1]]! @ load weights r1, q7\n" - "vld1.8 {d16-d17}, [%[w2]]! @ load weights r2, q8\n" - "vld1.8 {d18-d19}, [%[w3]]! @ load weights r3, q9\n" - /* mul, int8 * int8 = int16 */ - "vmull.s8 q12, d8, d12 @ mul add\n" - "vmull.s8 q13, d8, d14 @ mul add\n" - "vmull.s8 q14, d8, d16 @ mul add\n" - "vmull.s8 q15, d8, d18 @ mul add\n" - /* mla, int8 * int8 + int16 = int16 */ - "vmlal.s8 q12, d9, d13 @ mul add\n" - "vmlal.s8 q13, d9, d15 @ mul add\n" - "vmlal.s8 q14, d9, d17 @ mul add\n" - "vmlal.s8 q15, d9, d19 @ mul add\n" - /* pacc, int16 + int32 = int32 */ - "vpadal.s16 q0, q12 @ pair acc\n" - "vpadal.s16 q1, q13 @ pair acc\n" - "vpadal.s16 q2, q14 @ pair acc\n" - "vpadal.s16 q3, q15 @ pair acc\n" - "subs %[cnt], #1 @ sub loop count \n" - /* check loop end */ - "bne 1b @ jump to main loop\n" - /* pair add to final result */ - "2: @ pair add \n" - "vpadd.s32 d8, d0, d1 @ pair add, first step\n" - "vpadd.s32 d9, d2, d3 @ pair add, first step\n" - "vpadd.s32 d10, d4, d5 @ pair add, first step\n" - "vpadd.s32 d11, d6, d7 @ pair add, first step\n" - "vpadd.s32 d0, d8, d9 @ pair add, second step\n" - "vpadd.s32 d1, d10, d11 @ pair add, second step\n" - /* write output */ - "vst1.32 {d0-d1}, [%[out]] @ save result\n" - : [in] "+r"(ptr_in), - [w0] "+r"(ptr_w0), - [w1] "+r"(ptr_w1), - [w2] "+r"(ptr_w2), - [w3] "+r"(ptr_w3), - [cnt] "+r"(cnt_loop) - : [bias0] "r"(bias0), - [bias1] "r"(bias1), - [bias2] "r"(bias2), - [bias3] "r"(bias3), - [out] "r"(ptr_out) - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q12", - "q13", - "q14", - "q15"); - for (int i = 0; i < tail; ++i) { - ptr_out[0] += ptr_in[i] * ptr_w0[i]; - ptr_out[1] += ptr_in[i] * ptr_w1[i]; - ptr_out[2] += ptr_in[i] * ptr_w2[i]; - ptr_out[3] += ptr_in[i] * ptr_w3[i]; - } - if (is_relu) { - ptr_out[0] = ptr_out[0] > 0 ? ptr_out[0] : 0; - ptr_out[1] = ptr_out[1] > 0 ? ptr_out[1] : 0; - ptr_out[2] = ptr_out[2] > 0 ? ptr_out[2] : 0; - ptr_out[3] = ptr_out[3] > 0 ? ptr_out[3] : 0; - } - write_gemv_out(ptr_out, out_ptr, scale_ptr); - write_gemv_out(ptr_out + 1, out_ptr + 1, scale_ptr + 1); - write_gemv_out(ptr_out + 2, out_ptr + 2, scale_ptr + 2); - write_gemv_out(ptr_out + 3, out_ptr + 3, scale_ptr + 3); - } -//! deal with remains -#pragma omp parallel for - for (int j = out_cnt * 4; j < M; j++) { - dtype* out_ptr = data_out + j; - const float* scale_ptr = scale + j; - int ptr_out[1] = {0}; - const int8_t* ptr_in = data_in; - const int8_t* ptr_w0 = weights_ptr + (N * j); - int cnt_loop = cnt; - int bias0 = is_bias ? 
bias[j] : 0; - asm volatile( - "pld [%[in]] @ preload cache line, " - "input\n" - "pld [%[w0]] @ preload cache line, weights r0\n" - "vmov.u32 q0, #0 @ set q0 to 0\n" - "vmov s0, %[bias0] @ set q0 to bias0\n" - "cmp %[cnt], #1 @ check whether has main loop\n" - "blt 2f @ jump to tail\n" - /* main loop */ - "1: @ main loop\n" - "vld1.8 {d24-d25}, [%[in]]! @ load input, q12\n" - "vld1.8 {d28-d29}, [%[w0]]! @ load weights q14\n" - /* mull int8 * int8 = int16*/ - "vmull.s8 q1, d24, d28 @ mul add\n" - "vmlal.s8 q1, d25, d29 @ mul add\n" - "subs %[cnt] , #1 @ sub loop count \n" - /* pacc int16 + int32 = int32*/ - "vpadal.s16 q0, q1 @ pair acc\n" - "bne 1b @ jump to main loop\n" - /* pair add to final result */ - "2: @ end processing\n" - "vpadd.s32 d2, d0, d1 @ pair add, first step\n" - "vpadd.s32 d0, d2, d2 @ pair add, final step\n" - /* write output */ - "vst1.32 {d0[0]}, [%[out]] @ save result\n" - : [in] "+r"(ptr_in), [w0] "+r"(ptr_w0), [cnt] "+r"(cnt_loop) - : [bias0] "r"(bias0), [out] "r"(ptr_out) - : "cc", "memory", "q0", "q1", "q12", "q13"); - for (int i = 0; i < tail; ++i) { - ptr_out[0] += ptr_in[i] * ptr_w0[i]; - } - if (is_relu) { - ptr_out[0] = ptr_out[0] > 0 ? ptr_out[0] : 0; - } - write_gemv_out(ptr_out, out_ptr, scale_ptr); - } -#endif //__aarch64__ // NOLINT - return true; -} - -template bool gemv_int8(const int8_t* A, - const int8_t* x, - float* y, - bool transA, - int M, - int N, - const float* scale, - bool is_bias, - const int* bias, - bool is_relu); -template bool gemv_int8(const int8_t* A, - const int8_t* x, - int* y, - bool transA, - int M, - int N, - const float* scale, - bool is_bias, - const int* bias, - bool is_relu); -template bool gemv_int8(const int8_t* A, - const int8_t* x, - signed char* y, - bool transA, - int M, - int N, - const float* scale, - bool is_bias, - const int* bias, - bool is_relu); - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/gemv_arm_int8.h b/lite/backends/arm/math/gemv_arm_int8.h deleted file mode 100644 index 3021120695..0000000000 --- a/lite/backends/arm/math/gemv_arm_int8.h +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include "lite/core/device_info.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -// fixme now only support transA = false -template -bool gemv_int8(const int8_t* A, - const int8_t* x, - dtype* y, - bool transA, - int M, - int N, - const float* scale, - bool is_bias = false, - const int* bias = nullptr, - bool is_relu = false); - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/gru_utils.h b/lite/backends/arm/math/gru_utils.h deleted file mode 100644 index 9bef1889b8..0000000000 --- a/lite/backends/arm/math/gru_utils.h +++ /dev/null @@ -1,434 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
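// Scalar sketch of the GRU step these helpers implement (sigmoid gates and
// tanh candidate chosen for concreteness; the real code dispatches on
// lite_api::ActivationType, runs the two matrix products with sgemm, and
// vectorizes the elementwise parts with NEON). gru_step_ref is a
// hypothetical name for illustration only.
#include <cmath>
static void gru_step_ref(const float* h_prev,  // previous hidden, [frame]
                         float* gate,    // [3 * frame]: update | reset | cell
                         float* reset_h,  // scratch for r * h_prev, [frame]
                         float* h,        // output hidden, [frame]
                         int frame) {
  // Phase 1 -- after gate[0 : 2f] += h_prev x gate_weight (first sgemm):
  for (int i = 0; i < frame; ++i) {
    gate[i] = 1.f / (1.f + expf(-gate[i]));                  // update gate u
    gate[frame + i] = 1.f / (1.f + expf(-gate[frame + i]));  // reset gate r
    reset_h[i] = gate[frame + i] * (h_prev ? h_prev[i] : 0.f);
  }
  // Phase 2 -- after gate[2f : 3f] += reset_h x state_weight (second sgemm):
  for (int i = 0; i < frame; ++i) {
    float c = tanhf(gate[2 * frame + i]);  // candidate state
    float prev = h_prev ? h_prev[i] : 0.f;
    h[i] = prev * (1.f - gate[i]) + gate[i] * c;  // blend by update gate
    // (origin_mode instead blends h = c * (1 - u) + u * prev)
  }
}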
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "lite/backends/arm/math/sgemm.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template -struct GRUMetaValue { - T* gate_weight; - T* state_weight; - T* gate_value; - T* reset_output_value; - T* output_value; - T* prev_out_value; -}; - -template -inline void gru_add_with_bias( - const Dtype* din, const Dtype* bias, Dtype* dout, int batch, int size); - -template <> -inline void gru_add_with_bias( - const float* din, const float* bias, float* dout, int batch, int size) { -#pragma omp parallel for - for (int i = 0; i < batch; ++i) { - int j = 0; - auto din_batch = din + i * size; - auto dout_batch = dout + i * size; - float32x4_t vb0 = vld1q_f32(bias); - float32x4_t vin0 = vld1q_f32(din_batch); - float32x4_t vout0; - float32x4_t vout1; - float32x4_t vin1; - float32x4_t vb1; - for (; j < size - 7; j += 8) { - vin1 = vld1q_f32(din_batch + j + 4); - vb1 = vld1q_f32(bias + j + 4); - vout0 = vaddq_f32(vb0, vin0); - vout1 = vaddq_f32(vb1, vin1); - vb0 = vld1q_f32(bias + j + 8); - vin0 = vld1q_f32(din_batch + j + 8); - vst1q_f32(dout_batch + j, vout0); - vst1q_f32(dout_batch + j + 4, vout1); - } - for (; j < size; ++j) { - dout_batch[j] = din_batch[j] + bias[j]; - } - } -} - -template -static void gru_unit_reset_act_impl(float* updata_gate, - int stride_update, - float* reset_gate, - int stride_reset, - const float* hidden_prev, - int stride_hidden_prev, - float* reset_hidden_prev, - int stride_reset_hidden_prev, - int frame_size, - int batch_size) { -#pragma omp parallel for - for (int b = 0; b < batch_size; ++b) { - float32x4_t vpre0 = vdupq_n_f32(0.f); - float32x4_t vpre1 = vdupq_n_f32(0.f); - float prev = 0.f; - int i = 0; - for (; i < frame_size - 7; i += 8) { - float32x4_t vu0 = vld1q_f32(updata_gate + i); - float32x4_t vu1 = vld1q_f32(updata_gate + i + 4); - float32x4_t vr0 = vld1q_f32(reset_gate + i); - float32x4_t vr1 = vld1q_f32(reset_gate + i + 4); - - float32x4_t vau0 = lite::arm::math::vactive_f32(vu0); - float32x4_t vau1 = lite::arm::math::vactive_f32(vu1); - - if (hidden_prev) { - vpre0 = vld1q_f32(hidden_prev + i); - vpre1 = vld1q_f32(hidden_prev + i + 4); - } - - float32x4_t var0 = lite::arm::math::vactive_f32(vr0); - float32x4_t var1 = lite::arm::math::vactive_f32(vr1); - - vst1q_f32(updata_gate + i, vau0); - vst1q_f32(updata_gate + i + 4, vau1); - - float32x4_t vres0 = vmulq_f32(vpre0, var0); - float32x4_t vres1 = vmulq_f32(vpre1, var1); - - vst1q_f32(reset_gate + i, var0); - vst1q_f32(reset_gate + i + 4, var1); - vst1q_f32(reset_hidden_prev + i, vres0); - vst1q_f32(reset_hidden_prev + i + 4, vres1); - } - - for (; i < frame_size; ++i) { - updata_gate[i] = lite::arm::math::active_f32(updata_gate[i]); - reset_gate[i] = lite::arm::math::active_f32(reset_gate[i]); - if (hidden_prev) { - prev = hidden_prev[i]; - } - reset_hidden_prev[i] = reset_gate[i] * prev; - } - - updata_gate += stride_update; - reset_gate += stride_reset; - if (hidden_prev) { - hidden_prev += 
stride_hidden_prev; - } - reset_hidden_prev += stride_reset_hidden_prev; - } -} - -template -static void gru_unit_out_act_impl(bool origin_mode, - float* updata_gate, - int stride_update, - float* cell_state, - int stride_cell_state, - const float* hidden_prev, - int stride_hidden_prev, - float* hidden, - int stride_hidden, - int frame_size, - int batch_size) { -#pragma omp parallel for - for (int b = 0; b < batch_size; ++b) { - float32x4_t vpre0 = vdupq_n_f32(0.f); - float32x4_t vpre1 = vdupq_n_f32(0.f); - float prev = 0.f; - int i = 0; - if (origin_mode) { - for (; i < frame_size - 7; i += 8) { - float32x4_t vc0 = vld1q_f32(cell_state + i); - float32x4_t vc1 = vld1q_f32(cell_state + i + 4); - float32x4_t vu0 = vld1q_f32(updata_gate + i); - float32x4_t vu1 = vld1q_f32(updata_gate + i + 4); - - float32x4_t vac0 = lite::arm::math::vactive_f32(vc0); - float32x4_t vac1 = lite::arm::math::vactive_f32(vc1); - if (hidden_prev) { - vpre0 = vld1q_f32(hidden_prev + i); - vpre1 = vld1q_f32(hidden_prev + i + 4); - } - - float32x4_t vh0 = vmlsq_f32(vac0, vu0, vac0); - float32x4_t vh1 = vmlsq_f32(vac1, vu1, vac1); - - vst1q_f32(cell_state + i, vac0); - vst1q_f32(cell_state + i + 4, vac1); - - vh0 = vmlaq_f32(vh0, vu0, vpre0); - vh1 = vmlaq_f32(vh1, vu1, vpre1); - - vst1q_f32(hidden + i, vh0); - vst1q_f32(hidden + i + 4, vh1); - } - - for (; i < frame_size; ++i) { - if (hidden_prev) { - prev = hidden_prev[i]; - } - cell_state[i] = lite::arm::math::active_f32(cell_state[i]); - hidden[i] = - cell_state[i] * (1.f - updata_gate[i]) + updata_gate[i] * prev; - } - } else { - for (; i < frame_size - 7; i += 8) { - float32x4_t vc0 = vld1q_f32(cell_state + i); - float32x4_t vc1 = vld1q_f32(cell_state + i + 4); - float32x4_t vu0 = vld1q_f32(updata_gate + i); - float32x4_t vu1 = vld1q_f32(updata_gate + i + 4); - - float32x4_t vac0 = lite::arm::math::vactive_f32(vc0); - float32x4_t vac1 = lite::arm::math::vactive_f32(vc1); - - if (hidden_prev) { - vpre0 = vld1q_f32(hidden_prev + i); - vpre1 = vld1q_f32(hidden_prev + i + 4); - } - - float32x4_t vh0 = vmlsq_f32(vpre0, vpre0, vu0); - float32x4_t vh1 = vmlsq_f32(vpre1, vpre1, vu1); - - vst1q_f32(cell_state + i, vac0); - vst1q_f32(cell_state + i + 4, vac1); - - vh0 = vmlaq_f32(vh0, vu0, vac0); - vh1 = vmlaq_f32(vh1, vu1, vac1); - - vst1q_f32(hidden + i, vh0); - vst1q_f32(hidden + i + 4, vh1); - } - - for (; i < frame_size; ++i) { - cell_state[i] = lite::arm::math::active_f32(cell_state[i]); - if (hidden_prev) { - prev = hidden_prev[i]; - } - hidden[i] = - prev * (1.f - updata_gate[i]) + updata_gate[i] * cell_state[i]; - } - } - updata_gate += stride_update; - cell_state += stride_cell_state; - if (hidden_prev) { - hidden_prev += stride_hidden_prev; - } - hidden += stride_hidden; - } -} - -inline void gru_unit_reset_act(lite_api::ActivationType act_type, - GRUMetaValue value, - int frame_size, - int batch_size) { - auto updata_gate = value.gate_value; - auto reset_gate = value.gate_value + frame_size; - auto hidden_prev = value.prev_out_value; - auto reset_hidden_prev = value.reset_output_value; - int stride_update = 3 * frame_size; - int stride_reset = 3 * frame_size; - int stride_hidden_prev = frame_size; - int stride_reset_hidden_prev = frame_size; - - switch (act_type) { - case lite_api::ActivationType::kIndentity: - gru_unit_reset_act_impl( - updata_gate, - stride_update, - reset_gate, - stride_reset, - hidden_prev, - stride_hidden_prev, - reset_hidden_prev, - stride_reset_hidden_prev, - frame_size, - batch_size); - break; - case lite_api::ActivationType::kTanh: - 
gru_unit_reset_act_impl( - updata_gate, - stride_update, - reset_gate, - stride_reset, - hidden_prev, - stride_hidden_prev, - reset_hidden_prev, - stride_reset_hidden_prev, - frame_size, - batch_size); - break; - case lite_api::ActivationType::kSigmoid: - gru_unit_reset_act_impl( - updata_gate, - stride_update, - reset_gate, - stride_reset, - hidden_prev, - stride_hidden_prev, - reset_hidden_prev, - stride_reset_hidden_prev, - frame_size, - batch_size); - break; - case lite_api::ActivationType::kRelu: - gru_unit_reset_act_impl( - updata_gate, - stride_update, - reset_gate, - stride_reset, - hidden_prev, - stride_hidden_prev, - reset_hidden_prev, - stride_reset_hidden_prev, - frame_size, - batch_size); - break; - default: - break; - } -} - -inline void gru_unit_out_act(lite_api::ActivationType act_type, - bool origin_mode, - GRUMetaValue value, - int frame_size, - int batch_size) { - auto updata_gate = value.gate_value; - auto cell_state = value.gate_value + 2 * frame_size; - auto hidden_prev = value.prev_out_value; - auto hidden = value.output_value; - - int stride_update = 3 * frame_size; - int stride_cell_state = 3 * frame_size; - int stride_hidden_prev = frame_size; - int stride_hidden = frame_size; - - switch (act_type) { - case lite_api::ActivationType::kIndentity: - gru_unit_out_act_impl( - origin_mode, - updata_gate, - stride_update, - cell_state, - stride_cell_state, - hidden_prev, - stride_hidden_prev, - hidden, - stride_hidden, - frame_size, - batch_size); - break; - case lite_api::ActivationType::kTanh: - gru_unit_out_act_impl(origin_mode, - updata_gate, - stride_update, - cell_state, - stride_cell_state, - hidden_prev, - stride_hidden_prev, - hidden, - stride_hidden, - frame_size, - batch_size); - break; - case lite_api::ActivationType::kSigmoid: - gru_unit_out_act_impl( - origin_mode, - updata_gate, - stride_update, - cell_state, - stride_cell_state, - hidden_prev, - stride_hidden_prev, - hidden, - stride_hidden, - frame_size, - batch_size); - break; - case lite_api::ActivationType::kRelu: - gru_unit_out_act_impl(origin_mode, - updata_gate, - stride_update, - cell_state, - stride_cell_state, - hidden_prev, - stride_hidden_prev, - hidden, - stride_hidden, - frame_size, - batch_size); - break; - default: - break; - } -} - -template -struct GRUUnitFunctor { - static void compute(GRUMetaValue value, - int frame_size, - int batch_size, - const lite_api::ActivationType active_node, - const lite_api::ActivationType active_gate, - bool origin_mode, - ARMContext* ctx) { - if (value.prev_out_value) { - sgemm(false, - false, - batch_size, - frame_size * 2, - frame_size, - 1.f, - value.prev_out_value, - frame_size, - value.gate_weight, - frame_size * 2, - 1.f, - value.gate_value, - frame_size * 3, - nullptr, - false, - false, - ctx); - } - gru_unit_reset_act(active_gate, value, frame_size, batch_size); - - if (value.prev_out_value) { - sgemm(false, - false, - batch_size, - frame_size, - frame_size, - 1.f, - value.reset_output_value, - frame_size, - value.state_weight, - frame_size, - 1.f, - value.gate_value + frame_size * 2, - frame_size * 3, - nullptr, - false, - false, - ctx); - } - - gru_unit_out_act(active_node, origin_mode, value, frame_size, batch_size); - } -}; - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/im2sequence.cc b/lite/backends/arm/math/im2sequence.cc deleted file mode 100644 index 39fb9b477e..0000000000 --- a/lite/backends/arm/math/im2sequence.cc +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright (c) 
2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/arm/math/im2sequence.h" -#include -#include "lite/utils/cp_logging.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void im2sequence(const float* input, - const int input_c, - const int input_h, - const int input_w, - const int kernel_h, - const int kernel_w, - const int pad_top, - const int pad_bottom, - const int pad_left, - const int pad_right, - const int stride_h, - const int stride_w, - const int out_h, - const int out_w, - float* out, - Context* ctx) { - int window_size = kernel_h * kernel_w; - int out_rows = out_h * out_w; - int out_cols = input_c * window_size; - int H_pad = input_h + pad_top + pad_bottom; - int W_pad = input_w + pad_left + pad_right; - for (int h_id = 0; h_id < out_h; h_id++) { - for (int w_id = 0; w_id < out_w; w_id++) { - // consider dilation. - int start_h = h_id * stride_h - pad_top; - int start_w = w_id * stride_w - pad_left; - for (int c_id = 0; c_id < input_c; c_id++) { - for (int k_h_id = 0; k_h_id < kernel_h; k_h_id++) { - int in_h_id = start_h + k_h_id; - bool exceed_flag = (in_h_id < 0) || (in_h_id >= H_pad); - int out_start_id = - (h_id * out_w + w_id) * out_cols + c_id * window_size; - for (int k_w_id = 0; k_w_id < kernel_w; k_w_id++) { - int in_w_id = start_w + k_w_id; - exceed_flag = exceed_flag || (in_w_id < 0) || (in_w_id >= W_pad); - int input_id = (c_id * input_h + in_h_id) * input_w + in_w_id; - int out_id = out_start_id + k_h_id * kernel_w + k_w_id; - out[out_id] = exceed_flag ? 0.f : input[input_id]; - } - } - } - } - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/im2sequence.h b/lite/backends/arm/math/im2sequence.h deleted file mode 100644 index 5fd06c2608..0000000000 --- a/lite/backends/arm/math/im2sequence.h +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
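// Geometry of the im2sequence output above, for reference: each output row
// is one kernel_h x kernel_w window over the padded input, flattened
// channel-major, so `out` has out_h * out_w rows of input_c * kernel_h *
// kernel_w columns. A sketch of the dimension arithmetic a caller would use
// (the standard sliding-window size formula; im2seq_out_dim is a
// hypothetical helper, not part of this file):
inline int im2seq_out_dim(int in_dim, int pad0, int pad1, int kernel,
                          int stride) {
  return (in_dim + pad0 + pad1 - kernel) / stride + 1;
}
// e.g. out_h = im2seq_out_dim(input_h, pad_top, pad_bottom, kernel_h, stride_h);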
- -#pragma once - -#include -#include "lite/core/context.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { -void im2sequence(const float* input, - const int input_c, - const int input_h, - const int input_w, - const int kernel_h, - const int kernel_w, - const int pad_top, - const int pad_bottom, - const int pad_left, - const int pad_right, - const int stride_h, - const int stride_w, - const int out_h, - const int out_w, - float* out, - Context* ctx); - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/increment.cc b/lite/backends/arm/math/increment.cc deleted file mode 100644 index 094fe78de9..0000000000 --- a/lite/backends/arm/math/increment.cc +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/arm/math/increment.h" -#include -#include -#include "lite/utils/cp_logging.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { -void increment(const int* input, - const int n, - const float step, - int* out, - Context* ctx) { - for (int i = 0; i < n; i++) { - out[i] = input[i] + step; - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/increment.h b/lite/backends/arm/math/increment.h deleted file mode 100644 index 80aec62885..0000000000 --- a/lite/backends/arm/math/increment.h +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include "lite/core/context.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { -void increment(const int* input, - const int n, - const float step, - int* out, - Context* ctx); - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/interpolate.cc b/lite/backends/arm/math/interpolate.cc deleted file mode 100644 index c32494c2ba..0000000000 --- a/lite/backends/arm/math/interpolate.cc +++ /dev/null @@ -1,534 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
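// Note on the increment kernel deleted above: its input and output buffers
// are int while step is a float, so each per-element sum is truncated back
// to int on the store. A minimal sketch of that semantics (the values below
// are hypothetical, not from the patch):

#include <cstdio>

int main() {
  const int in[3] = {1, 2, 3};
  const float step = 1.5f;
  int out[3];
  for (int i = 0; i < 3; ++i) out[i] = in[i] + step;  // float sum truncated to int
  std::printf("%d %d %d\n", out[0], out[1], out[2]);  // prints: 2 3 4
  return 0;
}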
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/arm/math/interpolate.h" -#include -#include -#include "lite/backends/arm/math/funcs.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -// The following function bilinear_interp is partially base on -// https://github.com/Tencent/ncnn/blob/master/src/layer/arm/interp_arm.cpp -// Tencent is pleased to support the open source community by making ncnn -// available. -// -// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this -// file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software -// distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. -void bilinear_interp(const float* src, - int w_in, - int h_in, - float* dst, - int w_out, - int h_out, - float scale_x, - float scale_y, - bool with_align) { - int* buf = new int[w_out + h_out + w_out * 2 + h_out * 2]; - - int* xofs = buf; - int* yofs = buf + w_out; - - float* alpha = reinterpret_cast(buf + w_out + h_out); - float* beta = reinterpret_cast(buf + w_out + h_out + w_out * 2); - - float fx = 0.0f; - float fy = 0.0f; - int sx = 0; - int sy = 0; - if (with_align) { - scale_x = static_cast(w_in - 1) / (w_out - 1); - scale_y = static_cast(h_in - 1) / (h_out - 1); - // calculate x axis coordinate - for (int dx = 0; dx < w_out; dx++) { - fx = dx * scale_x; - sx = static_cast(fx); - fx -= sx; - xofs[dx] = sx; - alpha[dx * 2] = 1.f - fx; - alpha[dx * 2 + 1] = fx; - } - // calculate y axis coordinate - for (int dy = 0; dy < h_out; dy++) { - fy = dy * scale_y; - sy = static_cast(fy); - fy -= sy; - yofs[dy] = sy; - beta[dy * 2] = 1.f - fy; - beta[dy * 2 + 1] = fy; - } - } else { - scale_x = static_cast(w_in / w_out); - scale_y = static_cast(h_in / h_out); - // calculate x axis coordinate - for (int dx = 0; dx < w_out; dx++) { - fx = scale_x * (dx + 0.5f) - 0.5f; - fx = fx < 0 ? 0.f : fx; - sx = static_cast(fx); - fx -= sx; - xofs[dx] = sx; - alpha[dx * 2] = 1.f - fx; - alpha[dx * 2 + 1] = fx; - } - // calculate y axis coordinate - for (int dy = 0; dy < h_out; dy++) { - fy = scale_y * (dy + 0.5f) - 0.5f; - fy = fy < 0 ? 
0.f : fy; - sy = static_cast(fy); - fy -= sy; - yofs[dy] = sy; - beta[dy * 2] = 1.f - fy; - beta[dy * 2 + 1] = fy; - } - } - float* rowsbuf0 = new float[w_out]; - float* rowsbuf1 = new float[w_out]; - float* rows0 = rowsbuf0; - float* rows1 = rowsbuf1; - // output w , h boundary - int w_bound = w_out; - int h_bound = h_out; - if (with_align) { - w_bound = ceil((w_in - 1) / scale_x); - h_bound = ceil((h_in - 1) / scale_y); - } else { - w_bound = ceil((w_in - 0.5f) / scale_x - 0.5f); - h_bound = ceil((h_in - 0.5f) / scale_y - 0.5f); - } - // h_bound loop - for (int dy = 0; dy < h_bound; dy++) { - int sy = yofs[dy]; - - const float* s0 = src + sy * w_in; - const float* s1 = src + (sy + 1) * w_in; - - const float* alphap = alpha; - float* rows0p = rows0; - float* rows1p = rows1; - - int dx = 0; - // w_bound loop - for (; dx + 1 < w_bound; dx += 2) { - int sx = xofs[dx]; - int sxn = xofs[dx + 1]; - const float* s0p = s0 + sx; - const float* s1p = s1 + sx; - const float* s0np = s0 + sxn; - const float* s1np = s1 + sxn; - - float32x4_t _a = vld1q_f32(alphap); - float32x2_t _s0 = vld1_f32(s0p); - float32x2_t _s1 = vld1_f32(s1p); - float32x2_t _s0n = vld1_f32(s0np); - float32x2_t _s1n = vld1_f32(s1np); - - float32x4_t _s0s0n = vcombine_f32(_s0, _s0n); - float32x4_t _ms0 = vmulq_f32(_s0s0n, _a); - float32x4_t _s1s1n = vcombine_f32(_s1, _s1n); - float32x4_t _ms1 = vmulq_f32(_s1s1n, _a); - - float32x2_t _rows0 = vpadd_f32(vget_low_f32(_ms0), vget_high_f32(_ms0)); - vst1_f32(rows0p + dx, _rows0); - float32x2_t _rows1 = vpadd_f32(vget_low_f32(_ms1), vget_high_f32(_ms1)); - vst1_f32(rows1p + dx, _rows1); - - alphap += 4; - } - // w_bound remain loop - for (; dx < w_bound; dx++) { - int sx = xofs[dx]; - const float* s0p = s0 + sx; - const float* s1p = s1 + sx; - - float a0 = alphap[0]; - float a1 = alphap[1]; - rows0p[dx] = s0p[0] * a0 + s0p[1] * a1; - rows1p[dx] = s1p[0] * a0 + s1p[1] * a1; - - alphap += 2; - } - - const float buffer1[2] = {*(src + sy * w_in + w_in - 1), - *(src + sy * w_in + w_in - 1)}; - const float buffer2[2] = {*(src + (sy + 1) * w_in + w_in - 1), - *(src + (sy + 1) * w_in + w_in - 1)}; - // w_bound - w_out loop - for (; dx + 1 < w_out; dx += 2) { - const float* s0p = buffer1; - const float* s1p = buffer2; - const float* s0np = buffer1; - const float* s1np = buffer2; - - float32x4_t _a = vld1q_f32(alphap); - float32x2_t _s0 = vld1_f32(s0p); - float32x2_t _s1 = vld1_f32(s1p); - float32x2_t _s0n = vld1_f32(s0np); - float32x2_t _s1n = vld1_f32(s1np); - - float32x4_t _s0s0n = vcombine_f32(_s0, _s0n); - float32x4_t _ms0 = vmulq_f32(_s0s0n, _a); - float32x4_t _s1s1n = vcombine_f32(_s1, _s1n); - float32x4_t _ms1 = vmulq_f32(_s1s1n, _a); - - float32x2_t _rows0 = vpadd_f32(vget_low_f32(_ms0), vget_high_f32(_ms0)); - vst1_f32(rows0p + dx, _rows0); - float32x2_t _rows1 = vpadd_f32(vget_low_f32(_ms1), vget_high_f32(_ms1)); - vst1_f32(rows1p + dx, _rows1); - - alphap += 4; - } - // w_bound - w_out remain loop - for (; dx < w_out; dx++) { - const float* s0p = buffer1; - const float* s1p = buffer2; - - float a0 = alphap[0]; - float a1 = alphap[1]; - rows0p[dx] = s0p[0] * a0 + s0p[1] * a1; - rows1p[dx] = s1p[0] * a0 + s1p[1] * a1; - - alphap += 2; - } - - float b0 = beta[0]; - float b1 = beta[1]; - - float* dp = dst + dy * w_out; - - int nn = w_out >> 3; - int remain = w_out - (nn << 3); - -#ifdef __aarch64__ - float32x4_t _b0 = vdupq_n_f32(b0); - float32x4_t _b1 = vdupq_n_f32(b1); - // calculate and store results - for (; nn > 0; nn--) { - float32x4_t _rows0 = vld1q_f32(rows0p); - float32x4_t _d = 
vmulq_f32(_rows0, _b0); - float32x4_t _rows1 = vld1q_f32(rows1p); - _d = vmlaq_f32(_d, _rows1, _b1); - - float32x4_t _rows0n = vld1q_f32(rows0p + 4); - float32x4_t _rows1n = vld1q_f32(rows1p + 4); - - float32x4_t _dn = vmulq_f32(_rows0n, _b0); - vst1q_f32(dp, _d); - _dn = vmlaq_f32(_dn, _rows1n, _b1); - vst1q_f32(dp + 4, _dn); - - dp += 8; - rows0p += 8; - rows1p += 8; - } - -#else - if (nn > 0) { - asm volatile( - "vdup.32 q0, %[b0] @dup b0 to q1\n" - "vdup.32 q1, %[b1] @dup b1 to q0\n" - "1: \n" - "vld1.32 {d4-d5}, [%[rows0p]]! @loads rows0p to q2\n" - "vld1.32 {d6-d7}, [%[rows1p]]! @loads rows0p to q3\n" - "vmul.f32 q2, q2, q0 @mul\n" - "vmla.f32 q2, q3, q1 @mul add\n" - "vst1.32 {d4-d5}, [%[out]]! @store out to q2 \n" - "pld [%[rows0p]] @preload rows0p\n" - - "vld1.32 {d4-d5}, [%[rows0p]]! @loads rows0p to q2\n" - "vld1.32 {d6-d7}, [%[rows1p]]! @load rows1p to q3\n" - "vmul.f32 q2, q2, q0 @mul\n" - "vmla.f32 q2, q3, q1 @mul add\n" - "vst1.32 {d4-d5}, [%[out]]! @store out to q2 \n" - "pld [%[rows1p]] @preload rows1p\n" - "subs %[loopc], #1 @loop count minus #1\n" - "bne 1b @jump to 1\n" - : [rows0p] "+r"(rows0p), - [rows1p] "+r"(rows1p), - [out] "+r"(dp), - [loopc] "+r"(nn) - : [b0] "r"(b0), [b1] "r"(b1) - : "cc", "memory", "q0", "q1", "q2", "q3"); - } -#endif - // calculate and store remain resluts - for (; remain; --remain) { - *dp++ = *rows0p++ * b0 + *rows1p++ * b1; - } - beta += 2; - } - - // h_bound - h_out loop - for (int dy = h_bound; dy < h_out; dy++) { - int sy = h_in - 1; - const float* s0 = src + sy * w_in; - const float* s1 = s0; - const float* alphap = alpha; - float* rows0p = rows0; - float* rows1p = rows1; - - int dx = 0; - // w_bound loop - for (; dx + 1 < w_bound; dx += 2) { - int sx = xofs[dx]; - int sxn = xofs[dx + 1]; - const float* s0p = s0 + sx; - const float* s1p = s1 + sx; - const float* s0np = s0 + sxn; - const float* s1np = s1 + sxn; - - float32x4_t _a = vld1q_f32(alphap); - float32x2_t _s0 = vld1_f32(s0p); - float32x2_t _s1 = vld1_f32(s1p); - float32x2_t _s0n = vld1_f32(s0np); - float32x2_t _s1n = vld1_f32(s1np); - - float32x4_t _s0s0n = vcombine_f32(_s0, _s0n); - float32x4_t _ms0 = vmulq_f32(_s0s0n, _a); - float32x4_t _s1s1n = vcombine_f32(_s1, _s1n); - float32x4_t _ms1 = vmulq_f32(_s1s1n, _a); - - float32x2_t _rows0 = vpadd_f32(vget_low_f32(_ms0), vget_high_f32(_ms0)); - vst1_f32(rows0p + dx, _rows0); - float32x2_t _rows1 = vpadd_f32(vget_low_f32(_ms1), vget_high_f32(_ms1)); - vst1_f32(rows1p + dx, _rows1); - - alphap += 4; - } - // w_bound remain loop - for (; dx < w_bound; dx++) { - int sx = xofs[dx]; - const float* s0p = s0 + sx; - float a0 = alphap[0]; - float a1 = alphap[1]; - rows0p[dx] = s0p[0] * a0 + s0p[1] * a1; - rows1p[dx] = rows0p[dx]; - - alphap += 2; - } - - const float buffer1[2] = {*(src + sy * w_in + w_in - 1), - *(src + sy * w_in + w_in - 1)}; - // w_bound - w_out loop - for (; dx + 1 < w_out; dx += 2) { - const float* s0p = buffer1; - const float* s1p = buffer1; - const float* s0np = buffer1; - const float* s1np = buffer1; - - float32x4_t _a = vld1q_f32(alphap); - float32x2_t _s0 = vld1_f32(s0p); - float32x2_t _s1 = vld1_f32(s1p); - float32x2_t _s0n = vld1_f32(s0np); - float32x2_t _s1n = vld1_f32(s1np); - - float32x4_t _s0s0n = vcombine_f32(_s0, _s0n); - float32x4_t _ms0 = vmulq_f32(_s0s0n, _a); - float32x4_t _s1s1n = vcombine_f32(_s1, _s1n); - float32x4_t _ms1 = vmulq_f32(_s1s1n, _a); - - float32x2_t _rows0 = vpadd_f32(vget_low_f32(_ms0), vget_high_f32(_ms0)); - vst1_f32(rows0p + dx, _rows0); - float32x2_t _rows1 = 
vpadd_f32(vget_low_f32(_ms1), vget_high_f32(_ms1)); - vst1_f32(rows1p + dx, _rows1); - - alphap += 4; - } - // w_bound - wout remain loop - for (; dx < w_out; dx++) { - const float* s0p = buffer1; - float a0 = alphap[0]; - float a1 = alphap[1]; - rows0p[dx] = s0p[0] * a0 + s0p[1] * a1; - rows1p[dx] = rows0p[dx]; - alphap += 2; - } - - float b0 = beta[0]; - float b1 = beta[1]; - - float* dp = dst + dy * w_out; - - int nn = w_out >> 3; - int remain = w_out - (nn << 3); - -#ifdef __aarch64__ - float32x4_t _b0 = vdupq_n_f32(b0); - float32x4_t _b1 = vdupq_n_f32(b1); - // calculate and store results - for (; nn > 0; nn--) { - float32x4_t _rows0 = vld1q_f32(rows0p); - float32x4_t _d = vmulq_f32(_rows0, _b0); - float32x4_t _rows1 = vld1q_f32(rows1p); - _d = vmlaq_f32(_d, _rows1, _b1); - - float32x4_t _rows0n = vld1q_f32(rows0p + 4); - float32x4_t _rows1n = vld1q_f32(rows1p + 4); - - float32x4_t _dn = vmulq_f32(_rows0n, _b0); - vst1q_f32(dp, _d); - _dn = vmlaq_f32(_dn, _rows1n, _b1); - vst1q_f32(dp + 4, _dn); - - dp += 8; - rows0p += 8; - rows1p += 8; - } - -#else - if (nn > 0) { - asm volatile( - "vdup.32 q0, %[b0] @dup b0 to q1\n" - "vdup.32 q1, %[b1] @dup b1 to q0\n" - "1: \n" - "vld1.32 {d4-d5}, [%[rows0p]]! @loads rows0p to q2\n" - "vld1.32 {d6-d7}, [%[rows1p]]! @loads rows0p to q3\n" - "vmul.f32 q2, q2, q0 @mul\n" - "vmla.f32 q2, q3, q1 @mul add\n" - "vst1.32 {d4-d5}, [%[out]]! @store out to q2 \n" - "pld [%[rows0p]] @preload rows0p\n" - - "vld1.32 {d4-d5}, [%[rows0p]]! @loads rows0p to q2\n" - "vld1.32 {d6-d7}, [%[rows1p]]! @load rows1p to q3\n" - "vmul.f32 q2, q2, q0 @mul\n" - "vmla.f32 q2, q3, q1 @mul add\n" - "vst1.32 {d4-d5}, [%[out]]! @store out to q2 \n" - "pld [%[rows1p]] @preload rows1p\n" - "subs %[loopc], #1 @loop count minus #1\n" - "bne 1b @jump to 1\n" - : [rows0p] "+r"(rows0p), - [rows1p] "+r"(rows1p), - [out] "+r"(dp), - [loopc] "+r"(nn) - : [b0] "r"(b0), [b1] "r"(b1) - : "cc", "memory", "q0", "q1", "q2", "q3"); - } -#endif - // calculate and store remain results - for (; remain; --remain) { - *dp++ = *rows0p++ * b0 + *rows1p++ * b1; - } - - beta += 2; - } - delete[] buf; - delete[] rowsbuf0; - delete[] rowsbuf1; -} - -void nearest_interp(const float* src, - int w_in, - int h_in, - float* dst, - int w_out, - int h_out, - float scale_x, - float scale_y, - bool with_align) { - float scale_w_new = (with_align) - ? (static_cast(w_in - 1) / (w_out - 1)) - : (static_cast(w_in) / (w_out)); - float scale_h_new = (with_align) - ? (static_cast(h_in - 1) / (h_out - 1)) - : (static_cast(h_in) / (h_out)); - -#pragma omp parallel for collapse(2) schedule(static) - for (int h = 0; h < h_out; ++h) { - for (int w = 0; w < w_out; ++w) { - int near_x = (with_align) ? static_cast(scale_w_new * w + 0.5) - : static_cast(scale_w_new * w); - int near_y = (with_align) ? static_cast(scale_h_new * h + 0.5) - : static_cast(scale_h_new * h); - near_x = near_x < 0 ? 0 : near_x; - near_y = near_y < 0 ? 
0 : near_y; - dst[h * w_out + w] = src[near_y * w_in + near_x]; - } - } -} - -void interpolate(lite::Tensor* X, - lite::Tensor* OutSize, - lite::Tensor* Out, - int out_height, - int out_width, - float height_scale, - float width_scale, - bool with_align, - std::string interpolate_type) { - if (out_width > 0 && out_height > 0) { - height_scale = static_cast(out_height / X->dims()[2]); - width_scale = static_cast(out_width / X->dims()[3]); - } - if (OutSize != nullptr) { - auto OutSize_data = OutSize->data(); - int h_out = OutSize_data[0]; // HW - int w_out = OutSize_data[1]; // HW - int num_cout = Out->dims()[0]; - int c_cout = Out->dims()[1]; - Out->Resize({num_cout, c_cout, h_out, w_out}); - } - - float* dout = Out->mutable_data(); - const float* din = X->data(); - int out_num = Out->dims()[0]; - int out_c = Out->dims()[1]; - int count = out_num * out_c; - int in_h = X->dims()[2]; - int in_w = X->dims()[3]; - int out_h = Out->dims()[2]; - int out_w = Out->dims()[3]; - int spatial_in = in_h * in_w; - int spatial_out = out_h * out_w; - for (int i = 0; i < count; ++i) { - if ("Bilinear" == interpolate_type) { - bilinear_interp(din + spatial_in * i, - in_w, - in_h, - dout + spatial_out * i, - out_w, - out_h, - 1.f / width_scale, - 1.f / height_scale, - with_align); - } else if ("Nearest" == interpolate_type) { - nearest_interp(din + spatial_in * i, - in_w, - in_h, - dout + spatial_out * i, - out_w, - out_h, - 1.f / width_scale, - 1.f / height_scale, - with_align); - } - } -} - -} /* namespace math */ -} /* namespace arm */ -} /* namespace lite */ -} /* namespace paddle */ diff --git a/lite/backends/arm/math/interpolate.h b/lite/backends/arm/math/interpolate.h deleted file mode 100644 index be250f6a5e..0000000000 --- a/lite/backends/arm/math/interpolate.h +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include "lite/core/tensor.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void bilinear_interp(const float* src, - int w_in, - int h_in, - float* dst, - int w_out, - int h_out, - float scale_x, - float scale_y, - bool with_align); - -void nearest_interp(const float* src, - int w_in, - int h_in, - float* dst, - int w_out, - int h_out, - float scale_x, - float scale_y, - bool with_align); - -void interpolate(lite::Tensor* X, - lite::Tensor* OutSize, - lite::Tensor* Out, - int out_height, - int out_width, - float height_scale, - float width_scale, - bool with_align, - std::string interpolate_type); - -} /* namespace math */ -} /* namespace arm */ -} /* namespace lite */ -} /* namespace paddle */ diff --git a/lite/backends/arm/math/lrn.cc b/lite/backends/arm/math/lrn.cc deleted file mode 100644 index 7c89e9fed3..0000000000 --- a/lite/backends/arm/math/lrn.cc +++ /dev/null @@ -1,101 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
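// The bilinear and nearest kernels deleted above differ mainly in how a
// destination index maps to a source coordinate, controlled by with_align.
// A minimal sketch of the two mappings as read from the code above
// (illustrative sizes, not part of the patch):

#include <cstdio>

int main() {
  const int in = 4, out = 8;
  for (int d = 0; d < out; ++d) {
    // with_align == true: the first and last samples of both axes coincide
    float aligned = d * static_cast<float>(in - 1) / (out - 1);
    // with_align == false: pixel centers are mapped, then clamped at 0
    float centered = (d + 0.5f) * static_cast<float>(in) / out - 0.5f;
    if (centered < 0.f) centered = 0.f;
    std::printf("dst %d -> src %.3f (aligned) / %.3f (centered)\n",
                d, aligned, centered);
  }
  return 0;
}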
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/arm/math/lrn.h" -#include "lite/backends/arm/math/funcs.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template <> -void compute_across_channels(const float* din, - float* dout, - int num, - int channel, - int h, - int w, - int local_size, - float alpha, - float beta, - float k) { - int channel_size = h * w; - int cnt = channel_size / 4; - int remain = channel_size % 4; - int pre_pad = (local_size - 1) / 2; - int post_pad = local_size - pre_pad - 1; - float32x4_t k_val = vdupq_n_f32(k); - float32x4_t alpha_val = vdupq_n_f32(alpha); - float32x4_t beta_val = vdupq_n_f32(-beta); - for (int n = 0; n < num; ++n) { - const float* din_ptr = din + n * channel * channel_size; - float* dout_ptr = dout + n * channel * channel_size; - for (int c = 0; c < channel; ++c) { - const float* din_ch_ptr = din_ptr + c * channel_size; - float* dout_ch_ptr = dout_ptr + c * channel_size; - int cs = (c - pre_pad) < 0 ? 0 : (c - pre_pad); - int ce = (c + post_pad) >= channel ? channel : (c + pre_pad + 1); - for (int i = 0; i < cnt; ++i) { - int idx = i * 4; - float32x4_t sum = vdupq_n_f32(0.f); - float32x4_t din = vld1q_f32(din_ch_ptr); - for (int k = cs; k < ce; ++k) { - float32x4_t v0 = vld1q_f32(&din_ptr[k * channel_size + idx]); - sum = vmlaq_f32(sum, v0, v0); - } - sum = vmulq_f32(sum, alpha_val); - sum = vaddq_f32(sum, k_val); - float32x4_t res0 = pow_ps(sum, beta_val); - float32x4_t res1 = vmulq_f32(din, res0); - vst1q_f32(dout_ch_ptr, res1); - dout_ch_ptr += 4; - din_ch_ptr += 4; - } - int idx = cnt * 4; - for (int i = 0; i < remain; ++i) { - float sum = 0.0; - for (int k = cs; k < ce; ++k) { - sum += - din_ptr[k * channel_size + idx] * din_ptr[k * channel_size + idx]; - } - sum = k + sum * alpha; - dout_ch_ptr[0] = din_ch_ptr[0] * pow(sum, -beta); - dout_ch_ptr++; - din_ch_ptr++; - idx++; - } - } - } -} - -template <> -void compute_within_channels(const float* din, - float* dout, - int num, - int channel, - int h, - int w, - int local_size, - float alpha, - float beta, - float k) { - LOG(ERROR) << "unsupported method!!"; - return; -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/lrn.h b/lite/backends/arm/math/lrn.h deleted file mode 100644 index 0355123189..0000000000 --- a/lite/backends/arm/math/lrn.h +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
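// The across-channels LRN kernel deleted above computes, per element,
//   out = in * (k + alpha * sum over the channel window of in^2) ^ (-beta)
// with a window of local_size channels centered on c. A scalar reference
// sketch of that definition, which the NEON path vectorizes over the spatial
// dimension (function name and shapes are illustrative, not from the patch):

#include <cmath>

// din/dout: one batch in CHW layout with spatial size hw.
void lrn_across_channels_ref(const float* din, float* dout, int channel,
                             int hw, int local_size, float alpha, float beta,
                             float k) {
  int pre = (local_size - 1) / 2;
  int post = local_size - pre - 1;
  for (int c = 0; c < channel; ++c) {
    int cs = (c - pre) < 0 ? 0 : (c - pre);
    int ce = (c + post + 1) > channel ? channel : (c + post + 1);
    for (int i = 0; i < hw; ++i) {
      float sum = 0.f;
      for (int cc = cs; cc < ce; ++cc) {
        float v = din[cc * hw + i];
        sum += v * v;  // accumulate squares across the channel window
      }
      dout[c * hw + i] = din[c * hw + i] * std::pow(k + alpha * sum, -beta);
    }
  }
}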
-// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template -void compute_across_channels(const T* din, - T* dout, - int num, - int channel, - int h, - int w, - int local_size, - float alpha, - float beta, - float k); - -template -void compute_within_channels(const T* din, - T* dout, - int num, - int channel, - int h, - int w, - int local_size, - float alpha, - float beta, - float k); - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/negative.cc b/lite/backends/arm/math/negative.cc deleted file mode 100644 index 30eba11e35..0000000000 --- a/lite/backends/arm/math/negative.cc +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/arm/math/negative.h" -#include -#include -#include -#include "lite/backends/arm/math/funcs.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template <> -void negative_func(const float* din, float* dout, int num) { - for (int i = 0; i < num; i++) { - dout[i] = -din[i]; - LOG(INFO) << "arm i:" << i; - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/negative.h b/lite/backends/arm/math/negative.h deleted file mode 100644 index 9a5648743d..0000000000 --- a/lite/backends/arm/math/negative.h +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include "lite/operators/op_params.h" -#include "lite/utils/cp_logging.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template -void negative_func(const T* din, T* dout, int num); -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/norm.cc b/lite/backends/arm/math/norm.cc deleted file mode 100644 index 4780ef68c1..0000000000 --- a/lite/backends/arm/math/norm.cc +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
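// The scalar negative kernel deleted above also logs every element from the
// inner loop, which would dominate its runtime. One NEON variant that could
// be substituted, consistent with the intrinsics used elsewhere in these
// files (a sketch, not code from this patch):

#include <arm_neon.h>

void negative_neon(const float* din, float* dout, int num) {
  int i = 0;
  for (; i + 4 <= num; i += 4) {
    vst1q_f32(dout + i, vnegq_f32(vld1q_f32(din + i)));  // negate 4 lanes
  }
  for (; i < num; ++i) {
    dout[i] = -din[i];  // scalar tail
  }
}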
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/arm/math/norm.h" -#include -#include -#include "lite/utils/cp_logging.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void norm(const float* input, - const int pre_n, - const int n, - const int post_n, - const float epsilon, - float* out, - Context* ctx) { - for (int i = 0; i < pre_n; i++) { - for (int k = 0; k < post_n; k++) { - float sum = epsilon; - const float* in_tmp = input + i * n * post_n + k; - for (int j = 0; j < n; j++) { - sum += in_tmp[j * post_n] * in_tmp[j * post_n]; - } - sum = std::sqrt(sum); - float* out_tmp = out + i * n * post_n + k; - for (int j = 0; j < n; j++) { - out_tmp[j * post_n] = in_tmp[j * post_n] / sum; - } - } - } - LOG(INFO) << "norm math finished"; -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/norm.h b/lite/backends/arm/math/norm.h deleted file mode 100644 index 503d2c5af4..0000000000 --- a/lite/backends/arm/math/norm.h +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include "lite/core/context.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { -void norm(const float* input, - const int pre_n, - const int n, - const int post_n, - const float epsilon, - float* out, - Context* ctx); - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/packed_sgemm.cc b/lite/backends/arm/math/packed_sgemm.cc deleted file mode 100644 index 77b3beae80..0000000000 --- a/lite/backends/arm/math/packed_sgemm.cc +++ /dev/null @@ -1,3481 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
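// The norm kernel deleted above L2-normalizes the middle axis of a tensor
// viewed as (pre_n, n, post_n): each length-n vector x becomes
// x / sqrt(epsilon + sum(x^2)), read with stride post_n. A scalar usage
// sketch with illustrative sizes (not part of the patch):

#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  const int pre_n = 1, n = 3, post_n = 2;
  const float epsilon = 1e-10f;
  std::vector<float> in = {3.f, 1.f, 4.f, 0.f, 0.f, 0.f};
  std::vector<float> out(in.size());
  for (int i = 0; i < pre_n; ++i) {
    for (int k = 0; k < post_n; ++k) {
      float sum = epsilon;
      for (int j = 0; j < n; ++j) {
        float v = in[(i * n + j) * post_n + k];
        sum += v * v;
      }
      float inv = 1.f / std::sqrt(sum);
      for (int j = 0; j < n; ++j) {
        out[(i * n + j) * post_n + k] = in[(i * n + j) * post_n + k] * inv;
      }
    }
  }
  // the strided vector at k=0 is {3, 4, 0} -> {0.6, 0.8, 0};
  // the one at k=1 is {1, 0, 0} -> {1, 0, 0}
  for (float v : out) std::printf("%.2f ", v);
  std::printf("\n");
  return 0;
}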
- -#include "lite/backends/arm/math/packed_sgemm.h" -#include - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -#ifdef __aarch64__ -void prepackA_8x12(float *out, - const float *in, - float alpha, - int ldin, - int m0, - int mmax, - int k0, - int kmax); - -void prepackA_trans_8x12(float *out, - const float *in, - float alpha, - int ldin, - int m0, - int mmax, - int k0, - int kmax); - -void sgemm_prepacked_8x12(bool is_transB, - int M, - int N, - int K, - const float *A_packed, - const float *B, - int ldb, - float beta, - float *C, - int ldc, - const float *bias, - bool has_bias, - bool has_relu, - ARMContext *ctx); -#else -// for kA72 -void prepackA_6x8(float *out, - const float *in, - float alpha, - int ldin, - int m0, - int mmax, - int k0, - int kmax); - -void prepackA_trans_6x8(float *out, - const float *in, - float alpha, - int ldin, - int m0, - int mmax, - int k0, - int kmax); -// for kA73 -void prepackA_4x8(float *out, - const float *in, - float alpha, - int ldin, - int m0, - int mmax, - int k0, - int kmax); - -void prepackA_trans_4x8(float *out, - const float *in, - float alpha, - int ldin, - int m0, - int mmax, - int k0, - int kmax); - -// for kA72, 6x8 -void sgemm_prepacked_6x8(bool is_transB, - int M, - int N, - int K, - const float *A_packed, - const float *B, - int ldb, - float beta, - float *C, - int ldc, - const float *bias, - bool has_bias, - bool has_relu, - ARMContext *ctx); -// for kA73, 4x8 -void sgemm_prepacked_4x8(bool is_transB, - int M, - int N, - int K, - const float *A_packed, - const float *B, - int ldb, - float beta, - float *C, - int ldc, - const float *bias, - bool has_bias, - bool has_relu, - ARMContext *ctx); -#endif // __aarch64__ - -/** - * \brief input data is not transpose - * for arm-v7a, transform data to block x k x 6 layout - * for arm-v8a, transform data to block x k x 8 layout - */ -void prepackA(float *out, - const float *in, - float alpha, - int ldin, - int m0, - int mmax, - int k0, - int kmax, - bool is_trans, - ARMContext *ctx) { -#ifdef __aarch64__ - if (is_trans) { - prepackA_trans_8x12(out, in, alpha, ldin, m0, mmax, k0, kmax); - } else { - prepackA_8x12(out, in, alpha, ldin, m0, mmax, k0, kmax); - } -#else - if (ctx->arch() == kA73) { - if (is_trans) { - prepackA_trans_4x8(out, in, alpha, ldin, m0, mmax, k0, kmax); - } else { - prepackA_4x8(out, in, alpha, ldin, m0, mmax, k0, kmax); - } - } else { - if (is_trans) { - prepackA_trans_6x8(out, in, alpha, ldin, m0, mmax, k0, kmax); - } else { - prepackA_6x8(out, in, alpha, ldin, m0, mmax, k0, kmax); - } - } -#endif -} - -void prepackA(TensorLite *tout, - const TensorLite &tin, - float alpha, - int m, - int k, - int group, - bool is_trans, - ARMContext *ctx) { - int hblock = get_hblock(ctx->arch()); - int m_roundup = hblock * ((m + hblock - 1) / hblock); - int group_size_round_up = ((m_roundup * k + 15) / 16) * 16; - if (tout->numel() < group_size_round_up * group) { - tout->Resize({group_size_round_up * group}); - } - int lda = k; - if (is_trans) { - lda = m; - } - for (int g = 0; g < group; ++g) { - const float *weights_group = tin.data() + g * m * k; - float *weights_trans_ptr = - tout->mutable_data() + g * group_size_round_up; - prepackA(weights_trans_ptr, - weights_group, - alpha, - lda, - 0, - m, - 0, - k, - is_trans, - ctx); - } -} - -/// a: m*k b: k*n c: m*n -void sgemm_prepack(bool is_transB, - int M, - int N, - int K, - const float *A_packed, - const float *B, - int ldb, - float beta, - float *C, - int ldc, - const float *bias, - bool has_bias, - bool 
has_relu, - ARMContext *ctx) { -#ifdef __aarch64__ - sgemm_prepacked_8x12(is_transB, - M, - N, - K, - A_packed, - B, - ldb, - beta, - C, - ldc, - bias, - has_bias, - has_relu, - ctx); -#else // armv7 - if (ctx->arch() == kA73) { - sgemm_prepacked_4x8(is_transB, - M, - N, - K, - A_packed, - B, - ldb, - beta, - C, - ldc, - bias, - has_bias, - has_relu, - ctx); - } else { - sgemm_prepacked_6x8(is_transB, - M, - N, - K, - A_packed, - B, - ldb, - beta, - C, - ldc, - bias, - has_bias, - has_relu, - ctx); - } -#endif // arm64 -} - -#ifdef __aarch64__ -/* - * The following function prepackA_8x12 is base on - * https://github.com/ARM-software/ComputeLibrary/ - * - * Copyright (c) 2017-2019 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -void prepackA_8x12(float *dout, - const float *inptr, - float alpha, - int ldin, - int m0, - int mmax, - int k0, - int kmax) { - int x_len = kmax - k0; - int stride = x_len * 8; - float zerobuff[x_len]; // NOLINT - memset(zerobuff, 0, sizeof(float) * x_len); - bool has_alpha = fabsf(alpha - 1.f) > 1e-8f; - -#pragma omp parallel for - for (int y = m0; y < mmax; y += 8) { - float *outptr = dout + stride * (y - m0) / 8; - - const float *inptr0 = inptr + y * ldin + k0; - const float *inptr1 = inptr0 + ldin; - const float *inptr2 = inptr1 + ldin; - const float *inptr3 = inptr2 + ldin; - const float *inptr4 = inptr3 + ldin; - const float *inptr5 = inptr4 + ldin; - const float *inptr6 = inptr5 + ldin; - const float *inptr7 = inptr6 + ldin; - - asm volatile( - "prfm pldl1keep, [%[ptr0]] \n" - "prfm pldl1keep, [%[ptr0], #64] \n" - "prfm pldl1keep, [%[ptr1]] \n" - "prfm pldl1keep, [%[ptr1], #64] \n" - "prfm pldl1keep, [%[ptr2]] \n" - "prfm pldl1keep, [%[ptr2], #64] \n" - "prfm pldl1keep, [%[ptr3]] \n" - "prfm pldl1keep, [%[ptr3], #64] \n" - "prfm pldl1keep, [%[ptr4]] \n" - "prfm pldl1keep, [%[ptr4], #64] \n" - "prfm pldl1keep, [%[ptr5]] \n" - "prfm pldl1keep, [%[ptr5], #64] \n" - "prfm pldl1keep, [%[ptr6]] \n" - "prfm pldl1keep, [%[ptr6], #64] \n" - "prfm pldl1keep, [%[ptr7]] \n" - "prfm pldl1keep, [%[ptr7], #64] \n" - : - : [ptr0] "r"(inptr0), - [ptr1] "r"(inptr1), - [ptr2] "r"(inptr2), - [ptr3] "r"(inptr3), - [ptr4] "r"(inptr4), - [ptr5] "r"(inptr5), - [ptr6] "r"(inptr6), - [ptr7] "r"(inptr7) - : "memory"); - - int x = x_len; - //! 
cope with row index exceed real size, set to zero buffer - if ((y + 7) >= mmax) { - switch ((y + 7) - mmax) { - case 6: - inptr1 = zerobuff; - case 5: - inptr2 = zerobuff; - case 4: - inptr3 = zerobuff; - case 3: - inptr4 = zerobuff; - case 2: - inptr5 = zerobuff; - case 1: - inptr6 = zerobuff; - case 0: - inptr7 = zerobuff; - default: - break; - } - } - for (; x > 7; x -= 8) { - asm volatile( - "cbz %w[has_alpha], 0f\n" /* check alpha == 1.f? */ - "dup v31.4s, %w[alpha]\n" /* alpha to vector */ - "ldp q0, q1, [%[inptr0]], #32\n" /* load r0, a0~a7 */ - "ldp q2, q3, [%[inptr1]], #32\n" /* load r1, b0~b7 */ - "fmul v0.4s, v31.4s, v0.4s\n" /* mul alpha */ - "fmul v1.4s, v31.4s, v1.4s\n" /* mul alpha */ - "ldp q4, q5, [%[inptr2]], #32\n" /* load r2, c0~c7 */ - "fmul v2.4s, v31.4s, v2.4s\n" /* mul alpha */ - "fmul v3.4s, v31.4s, v3.4s\n" /* mul alpha */ - "ldp q6, q7, [%[inptr3]], #32\n" /* load r3, d0~d7 */ - "fmul v4.4s, v31.4s, v4.4s\n" /* mul alpha */ - "fmul v5.4s, v31.4s, v5.4s\n" /* mul alpha */ - "ldp q8, q9, [%[inptr4]], #32\n" /* load r4, e0~e7 */ - "fmul v6.4s, v31.4s, v6.4s\n" /* mul alpha */ - "fmul v7.4s, v31.4s, v7.4s\n" /* mul alpha */ - "ldp q10, q11, [%[inptr5]], #32\n" /* load r5, f0~f7 */ - "fmul v8.4s, v31.4s, v8.4s\n" /* mul alpha */ - "fmul v9.4s, v31.4s, v9.4s\n" /* mul alpha */ - "ldp q12, q13, [%[inptr6]], #32\n" /* load r6, g0~g7 */ - "fmul v10.4s, v31.4s, v10.4s\n" /* mul alpha */ - "fmul v11.4s, v31.4s, v11.4s\n" /* mul alpha */ - "ldp q14, q15, [%[inptr7]], #32\n" /* load r7, h0~h7 */ - "fmul v12.4s, v31.4s, v12.4s\n" /* mul alpha */ - "fmul v13.4s, v31.4s, v13.4s\n" /* mul alpha */ - "fmul v14.4s, v31.4s, v14.4s\n" /* mul alpha */ - "fmul v15.4s, v31.4s, v15.4s\n" /* mul alpha */ - "b 1f\n" /* to main process */ - "0: \n" /* alpha == 1 */ - "ldp q0, q1, [%[inptr0]], #32\n" /* load r0, a0~a7 */ - "ldp q2, q3, [%[inptr1]], #32\n" /* load r1, b0~b7 */ - "ldp q4, q5, [%[inptr2]], #32\n" /* load r2, c0~c7 */ - "ldp q6, q7, [%[inptr3]], #32\n" /* load r3, d0~d7 */ - "ldp q8, q9, [%[inptr4]], #32\n" /* load r4, e0~e7 */ - "ldp q10, q11, [%[inptr5]], #32\n" /* load r5, f0~f7 */ - "ldp q12, q13, [%[inptr6]], #32\n" /* load r6, g0~g7 */ - "ldp q14, q15, [%[inptr7]], #32\n" /* load r7, h0~h7 */ - "1: \n" /* main process */ - "trn1 v16.4s, v0.4s, v2.4s\n" /* a0b0a2b2*/ - "trn2 v17.4s, v0.4s, v2.4s\n" /* a1b1a3b3*/ - "trn1 v18.4s, v1.4s, v3.4s\n" /* a4b4a6b6*/ - "trn2 v19.4s, v1.4s, v3.4s\n" /* a5b5a7b7*/ - - "trn1 v20.4s, v4.4s, v6.4s\n" /* c0d0c2d2*/ - "trn2 v21.4s, v4.4s, v6.4s\n" /* c1d1c3d3*/ - "trn1 v22.4s, v5.4s, v7.4s\n" /* c4d4c6d6*/ - "trn2 v23.4s, v5.4s, v7.4s\n" /* c5d5c7d7*/ - - "trn1 v24.4s, v8.4s, v10.4s\n" /* e0f0e2f2*/ - "trn2 v25.4s, v8.4s, v10.4s\n" /* e1f1e3f3*/ - "trn1 v26.4s, v9.4s, v11.4s\n" /* e4f4e6f6*/ - "trn2 v27.4s, v9.4s, v11.4s\n" /* e5f5e7f7*/ - - "trn1 v28.4s, v12.4s, v14.4s\n" /* g0h0g2h2*/ - "trn2 v29.4s, v12.4s, v14.4s\n" /* g1h1g3h3*/ - "trn1 v30.4s, v13.4s, v15.4s\n" /* g4h4g6h6*/ - "trn2 v31.4s, v13.4s, v15.4s\n" /* g5h5g7h7*/ - - "trn1 v0.2d, v16.2d, v20.2d\n" /* a0b0c0d0 */ - "trn1 v1.2d, v24.2d, v28.2d\n" /* e0f0g0h0 */ - "trn1 v2.2d, v17.2d, v21.2d\n" /* a1b1c1d1 */ - "trn1 v3.2d, v25.2d, v29.2d\n" /* e1b1c1d1 */ - - "trn2 v4.2d, v16.2d, v20.2d\n" /* a2b2c2d2 */ - "trn2 v5.2d, v24.2d, v28.2d\n" /* e2f2g2h2 */ - "stp q0, q1, [%[outptr]], #32\n" /* save q0, q1, a0~h0*/ - "trn2 v6.2d, v17.2d, v21.2d\n" /* a3b3c3d3 */ - "trn2 v7.2d, v25.2d, v29.2d\n" /* e3f3g3h3 */ - "stp q2, q3, [%[outptr]], #32\n" /* save q2, q3, a1~h1*/ - - "trn1 v8.2d, 
v18.2d, v22.2d\n" /* a4b4c4d4 */ - "trn1 v9.2d, v26.2d, v30.2d\n" /* e4f4g4h4 */ - "stp q4, q5, [%[outptr]], #32\n" /* save q4, q5, a2~h2*/ - "trn1 v10.2d, v19.2d, v23.2d\n" /* a5b5c5d5 */ - "trn1 v11.2d, v27.2d, v31.2d\n" /* e5f5g5h5 */ - "stp q6, q7, [%[outptr]], #32\n" /* save q6, q7, a3~h3*/ - - "trn2 v12.2d, v18.2d, v22.2d\n" /* a6b6c6d6 */ - "trn2 v13.2d, v26.2d, v30.2d\n" /* e6f6g6h6 */ - "stp q8, q9, [%[outptr]], #32\n" /* save q8, q9, a4~h4*/ - "trn2 v14.2d, v19.2d, v23.2d\n" /* a7b7c7d7 */ - "trn2 v15.2d, v27.2d, v31.2d\n" /* e7f7g7h7 */ - "stp q10, q11, [%[outptr]], #32\n" /* save q10, q11, a5~h5*/ - - "stp q12, q13, [%[outptr]], #32\n" /* save q12, q13, a6~h6*/ - "stp q14, q15, [%[outptr]], #32\n" /* save q14, q15, a7~h7*/ - : [inptr0] "+r"(inptr0), - [inptr1] "+r"(inptr1), - [inptr2] "+r"(inptr2), - [inptr3] "+r"(inptr3), - [inptr4] "+r"(inptr4), - [inptr5] "+r"(inptr5), - [inptr6] "+r"(inptr6), - [inptr7] "+r"(inptr7), - [outptr] "+r"(outptr) - : [alpha] "r"(alpha), [has_alpha] "r"(has_alpha) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "v26", - "v27", - "v28", - "v29", - "v30", - "v31", - "cc", - "memory"); - } - - for (; x > 0; x--) { - if (has_alpha) { - *outptr++ = *inptr0++ * alpha; - *outptr++ = *inptr1++ * alpha; - *outptr++ = *inptr2++ * alpha; - *outptr++ = *inptr3++ * alpha; - *outptr++ = *inptr4++ * alpha; - *outptr++ = *inptr5++ * alpha; - *outptr++ = *inptr6++ * alpha; - *outptr++ = *inptr7++ * alpha; - } else { - *outptr++ = *inptr0++; - *outptr++ = *inptr1++; - *outptr++ = *inptr2++; - *outptr++ = *inptr3++; - *outptr++ = *inptr4++; - *outptr++ = *inptr5++; - *outptr++ = *inptr6++; - *outptr++ = *inptr7++; - } - } - } -} - -void prepackA_trans_8x12(float *outptr, - const float *in, - float alpha, - int ldin, - int m0, - int mmax, - int k0, - int kmax) { - auto inptr = in + k0 * ldin + m0; - uint32_t mask_buffer[8] = {0, 1, 2, 3, 4, 5, 6, 7}; - int x_len = mmax - m0; - int y_len = kmax - k0; - int right_remain = x_len - 8 * (x_len / 8); - int stride_out = 8 * y_len; - - float32x4_t vzero = vdupq_n_f32(0.f); - uint32x4_t vmask1 = - vcltq_u32(vld1q_u32(mask_buffer), vdupq_n_u32(right_remain)); - uint32x4_t vmask2 = - vcltq_u32(vld1q_u32(mask_buffer + 4), vdupq_n_u32(right_remain)); - - bool has_alpha = fabsf(alpha - 1.f) > 1e-8f; - float32x4_t valpha = vdupq_n_f32(alpha); - -#pragma omp parallel for - for (int y = 0; y < y_len - 3; y += 4) { - const float *ptr0 = inptr + y * ldin; - const float *ptr1 = ptr0 + ldin; - const float *ptr2 = ptr1 + ldin; - const float *ptr3 = ptr2 + ldin; - - asm volatile( - "prfm pldl1keep, [%[ptr0]] \n" - "prfm pldl1keep, [%[ptr0], #64] \n" - "prfm pldl1keep, [%[ptr1]] \n" - "prfm pldl1keep, [%[ptr1], #64] \n" - "prfm pldl1keep, [%[ptr2]] \n" - "prfm pldl1keep, [%[ptr2], #64] \n" - "prfm pldl1keep, [%[ptr3]] \n" - "prfm pldl1keep, [%[ptr3], #64] \n" - : - : [ptr0] "r"(ptr0), [ptr1] "r"(ptr1), [ptr2] "r"(ptr2), [ptr3] "r"(ptr3) - : "memory"); - - float *outptr_row_col = outptr + y * 8; - int i = 0; - for (; i < x_len - 7; i += 8) { - float32x4_t vr00 = vld1q_f32(ptr0); - float32x4_t vr01 = vld1q_f32(ptr0 + 4); - float32x4_t vr10 = vld1q_f32(ptr1); - float32x4_t vr11 = vld1q_f32(ptr1 + 4); - float32x4_t vr20 = vld1q_f32(ptr2); - float32x4_t vr21 = vld1q_f32(ptr2 + 4); - float32x4_t vr30 = vld1q_f32(ptr3); - float32x4_t vr31 = vld1q_f32(ptr3 + 4); - if 
(has_alpha) { - vr00 = vmulq_f32(vr00, valpha); - vr01 = vmulq_f32(vr01, valpha); - vr10 = vmulq_f32(vr10, valpha); - vr11 = vmulq_f32(vr11, valpha); - vr20 = vmulq_f32(vr20, valpha); - vr21 = vmulq_f32(vr21, valpha); - vr30 = vmulq_f32(vr30, valpha); - vr31 = vmulq_f32(vr31, valpha); - } - - vst1q_f32(outptr_row_col, vr00); - vst1q_f32(outptr_row_col + 4, vr01); - vst1q_f32(outptr_row_col + 8, vr10); - vst1q_f32(outptr_row_col + 12, vr11); - vst1q_f32(outptr_row_col + 16, vr20); - vst1q_f32(outptr_row_col + 20, vr21); - vst1q_f32(outptr_row_col + 24, vr30); - vst1q_f32(outptr_row_col + 28, vr31); - - ptr0 += 8; - ptr1 += 8; - ptr2 += 8; - ptr3 += 8; - - outptr_row_col += stride_out; - } - if (right_remain > 0) { - float32x4_t vr00 = vld1q_f32(ptr0); - float32x4_t vr01 = vld1q_f32(ptr0 + 4); - float32x4_t vr10 = vld1q_f32(ptr1); - float32x4_t vr11 = vld1q_f32(ptr1 + 4); - float32x4_t vr20 = vld1q_f32(ptr2); - float32x4_t vr21 = vld1q_f32(ptr2 + 4); - float32x4_t vr30 = vld1q_f32(ptr3); - float32x4_t vr31 = vld1q_f32(ptr3 + 4); - - if (has_alpha) { - vr00 = vmulq_f32(vr00, valpha); - vr01 = vmulq_f32(vr01, valpha); - vr10 = vmulq_f32(vr10, valpha); - vr11 = vmulq_f32(vr11, valpha); - vr20 = vmulq_f32(vr20, valpha); - vr21 = vmulq_f32(vr21, valpha); - vr30 = vmulq_f32(vr30, valpha); - vr31 = vmulq_f32(vr31, valpha); - } - - float32x4_t vr00_1 = vbslq_f32(vmask1, vr00, vzero); - float32x4_t vr01_1 = vbslq_f32(vmask2, vr01, vzero); - float32x4_t vr10_1 = vbslq_f32(vmask1, vr10, vzero); - float32x4_t vr11_1 = vbslq_f32(vmask2, vr11, vzero); - float32x4_t vr20_1 = vbslq_f32(vmask1, vr20, vzero); - float32x4_t vr21_1 = vbslq_f32(vmask2, vr21, vzero); - float32x4_t vr30_1 = vbslq_f32(vmask1, vr30, vzero); - float32x4_t vr31_1 = vbslq_f32(vmask2, vr31, vzero); - - vst1q_f32(outptr_row_col, vr00_1); - vst1q_f32(outptr_row_col + 4, vr01_1); - vst1q_f32(outptr_row_col + 8, vr10_1); - vst1q_f32(outptr_row_col + 12, vr11_1); - vst1q_f32(outptr_row_col + 16, vr20_1); - vst1q_f32(outptr_row_col + 20, vr21_1); - vst1q_f32(outptr_row_col + 24, vr30_1); - vst1q_f32(outptr_row_col + 28, vr31_1); - } - } - -#pragma omp parallel for - for (int y = 4 * (y_len / 4); y < y_len; ++y) { - const float *ptr0 = inptr + y * ldin; - float *outptr_row_col = outptr + y * 8; - int i = 0; - for (; i < x_len - 7; i += 8) { - float32x4_t vr0 = vld1q_f32(ptr0); - float32x4_t vr1 = vld1q_f32(ptr0 + 4); - if (has_alpha) { - vr0 = vmulq_f32(vr0, valpha); - vr1 = vmulq_f32(vr1, valpha); - } - vst1q_f32(outptr_row_col, vr0); - vst1q_f32(outptr_row_col + 4, vr1); - - ptr0 += 8; - - outptr_row_col += stride_out; - } - if (right_remain > 0) { - float32x4_t vr0 = vld1q_f32(ptr0); - float32x4_t vr1 = vld1q_f32(ptr0 + 4); - - if (has_alpha) { - vr0 = vmulq_f32(vr0, valpha); - vr1 = vmulq_f32(vr1, valpha); - } - - float32x4_t vr0_1 = vbslq_f32(vmask1, vr0, vzero); - float32x4_t vr1_1 = vbslq_f32(vmask2, vr1, vzero); - - vst1q_f32(outptr_row_col, vr0_1); - vst1q_f32(outptr_row_col + 4, vr1_1); - } - } -} - -#else // __aarch64__ -void prepackA_6x8(float* outptr, - const float* inptr, - float alpha, - int ldin, - int m0, - int mmax, - int k0, - int kmax) { - int x_len = kmax - k0; - float zerobuff[x_len]; // NOLINT - memset(zerobuff, 0, sizeof(float) * x_len); - - bool has_alpha = fabsf(alpha - 1.f) > 1e-8f; - float32x4_t valpha = vdupq_n_f32(alpha); - - for (int y = m0; y < mmax; y += 6) { - const float* inptr0 = inptr + y * ldin + k0; - const float* inptr1 = inptr0 + ldin; - const float* inptr2 = inptr1 + ldin; - const float* inptr3 = 
inptr2 + ldin; - const float* inptr4 = inptr3 + ldin; - const float* inptr5 = inptr4 + ldin; - - int x = x_len; - if ((y + 5) >= mmax) { - switch ((y + 5) - mmax) { - case 4: - inptr1 = zerobuff; - case 3: - inptr2 = zerobuff; - case 2: - inptr3 = zerobuff; - case 1: - inptr4 = zerobuff; - case 0: - inptr5 = zerobuff; - default: - break; - } - } - - for (; x > 7; x -= 8) { - asm volatile( - "vld4.32 {d0-d3}, [%[inptr0]]! @ zip load r0, " - "q0,q1=r00,r04,r01,r05,r02,r06,r03,r07\n" - "vld4.32 {d4-d7}, [%[inptr1]]! @ zip load r1, " - "q2,q3=r10,r14,r11,r15,r12,r16,r13,r17\n" - "vld4.32 {d8-d11}, [%[inptr2]]! @ zip load r2, " - "q4,q5=r20,r24,r21,r25,r22,r26,r23,r27\n" - "vld4.32 {d12-d15}, [%[inptr3]]! @ zip load r3, " - "q6,q7=r30,r34,r31,r35,r32,r36,r33,r37\n" - "vld4.32 {d16-d19}, [%[inptr4]]! @ zip load r4, " - "q8,q9=r40,r44,r41,r45,r42,r46,r43,r47\n" - "vld4.32 {d20-d23}, [%[inptr5]]! @ zip load r5, " - "q10,q11=r50,r54,r51,r55,r52,r56,r53,r57\n" - "cmp %[has_alpha], #0\n" - "beq 0f\n" /* check whether alpha == 1? */ - "vmul.f32 q0, q0, %q[alpha]\n" /* mul alpha */ - "vmul.f32 q1, q1, %q[alpha]\n" /* mul alpha */ - "vmul.f32 q2, q2, %q[alpha]\n" /* mul alpha */ - "vmul.f32 q3, q3, %q[alpha]\n" /* mul alpha */ - "vmul.f32 q4, q4, %q[alpha]\n" /* mul alpha */ - "vmul.f32 q5, q5, %q[alpha]\n" /* mul alpha */ - "vmul.f32 q6, q6, %q[alpha]\n" /* mul alpha */ - "vmul.f32 q7, q7, %q[alpha]\n" /* mul alpha */ - "vmul.f32 q8, q8, %q[alpha]\n" /* mul alpha */ - "vmul.f32 q9, q9, %q[alpha]\n" /* mul alpha */ - "vmul.f32 q10, q10, %q[alpha]\n" /* mul alpha */ - "vmul.f32 q11, q11, %q[alpha]\n" /* mul alpha */ - "0: \n" - "vtrn.32 q0, q2 @ trans data: q0=r00,r10,r01,r11; " - "q2=r04,r14,r05,r15\n" - "vtrn.32 q4, q6 @ trans data: q4=r20,r30,r21,r31; " - "q6=r24,r34,r25,r35\n" - "vtrn.32 q8, q10 @ trans data: q8=r40,r50,r41,r51; " - "q10=r44,r54,r45,r55\n" - - "vswp d1, d8 @ swap d1, d8, q0=r00,r10,r20,r30; " - "q4=r01,r11,r21,r31\n" - "vst1.32 {d0-d1}, [%[outptr]]! @ write q0:r00,r10,r20,r30\n" - "vst1.32 {d16}, [%[outptr]]! @ write d16(q8,low),r40,r50\n" - "vst1.32 {d8-d9}, [%[outptr]]! @ write q4:r01,r11,r21,r31\n" - "vst1.32 {d17}, [%[outptr]]! @ write d16(q8,high),r41,r51\n" - - "vtrn.32 q1, q3 @ trans data: q1=r02,r12,r03,r13; " - "q3=r06,r16,r07,r17\n" - "vtrn.32 q5, q7 @ trans data: q5=r22,r32,r23,r33; " - "q7=r26,r36,r27,r37\n" - "vtrn.32 q9, q11 @ trans data: q9=r42,r52,r43,r53; " - "q11=r46,r56,r47,r57\n" - - "vswp d3, d10 @ swap d3, d10, " - "q1=r02,r12,r22,r32; q5=r03,r13,r23,r33\n" - "vst1.32 {d2-d3}, [%[outptr]]! @ write q1:r02,r12,r22,r32\n" - "vst1.32 {d18}, [%[outptr]]! @ write d18(q9,low),r42,r52\n" - "vst1.32 {d10-d11},[%[outptr]]! @ write q5:r03,r13,r23,r33\n" - "vst1.32 {d19}, [%[outptr]]! @ write d19(q9,high),r43,r53\n" - - "vswp d5, d12 @ swap d5, d12,q2=r04,r14,r24,r34; " - "q6=r05,r15,r25,r35\n" - "vst1.32 {d4-d5}, [%[outptr]]! @ write q2:r04,r14,r24,r34\n" - "vst1.32 {d20}, [%[outptr]]! @ write d20(q10,low),r44,r54\n" - "vst1.32 {d12-d13},[%[outptr]]! @ write q6:r05,r15,r25,r35\n" - "vst1.32 {d21}, [%[outptr]]! @ write d21(q10,high),r45,r55\n" - - "vswp d7, d14 @ swap d7, d14, " - "q3=r06,r16,r26,r36; q7=r07,r17,r27,r37\n" - "vst1.32 {d6-d7}, [%[outptr]]! @ write q3:r06,r16,r26,r36\n" - "vst1.32 {d22}, [%[outptr]]! @ write d22(q11,low),r46,r56\n" - "vst1.32 {d14-d15},[%[outptr]]! @ write q7:r07,r17,r27,r37\n" - "vst1.32 {d23}, [%[outptr]]! 
@ write d23(q11,high),r47,r57\n" - : [inptr0] "+r"(inptr0), - [inptr1] "+r"(inptr1), - [inptr2] "+r"(inptr2), - [inptr3] "+r"(inptr3), - [inptr4] "+r"(inptr4), - [inptr5] "+r"(inptr5), - [outptr] "+r"(outptr) - : [has_alpha] "r"(has_alpha), [alpha] "w"(valpha) - : "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q15", - "cc", - "memory"); - } - - for (; x > 0; x--) { - if (has_alpha) { - *outptr++ = *inptr0++ * alpha; - *outptr++ = *inptr1++ * alpha; - *outptr++ = *inptr2++ * alpha; - *outptr++ = *inptr3++ * alpha; - *outptr++ = *inptr4++ * alpha; - *outptr++ = *inptr5++ * alpha; - } else { - *outptr++ = *inptr0++; - *outptr++ = *inptr1++; - *outptr++ = *inptr2++; - *outptr++ = *inptr3++; - *outptr++ = *inptr4++; - *outptr++ = *inptr5++; - } - } - } -} - -void prepackA_trans_6x8(float* outptr, - const float* in, - float alpha, - int ldin, - int m0, - int mmax, - int k0, - int kmax) { - auto inptr = in + k0 * ldin + m0; - - bool has_alpha = fabsf(alpha - 1.f) > 1e-8f; - float32x4_t valpha = vdupq_n_f32(alpha); - - uint32_t mask_buffer[8] = {0, 1, 2, 3, 4, 5, 6, 7}; - int x_len = mmax - m0; - int y_len = kmax - k0; - int right_remain = x_len - 6 * (x_len / 6); - int right_pad = 6 - right_remain; - if (right_remain == 0) { - right_pad = 0; - } - - float* outptr_row = outptr; - int stride_out = 6 * y_len; - - float32x4_t vzero = vdupq_n_f32(0.f); - uint32x4_t vmask1 = - vcltq_u32(vld1q_u32(mask_buffer), vdupq_n_u32(right_remain)); - uint32x4_t vmask2 = - vcltq_u32(vld1q_u32(mask_buffer + 4), vdupq_n_u32(right_remain)); - -#pragma omp parallel for - for (int y = 0; y < y_len - 3; y += 4) { - const float* ptr0 = inptr + y * ldin; - const float* ptr1 = ptr0 + ldin; - const float* ptr2 = ptr1 + ldin; - const float* ptr3 = ptr2 + ldin; - - float* outptr_row_col = outptr_row + y * 6; - int i = 0; - for (; i < x_len - 5; i += 6) { - float* ptr_out = outptr_row_col; - asm volatile( - "vld1.32 {d0-d2}, [%[ptr0]]! @ load r0, 6 elements\n" - "vld1.32 {d4-d6}, [%[ptr1]]! @ load r1, 6 elements\n" - "vld1.32 {d8-d10}, [%[ptr2]]! @ load r2, 6 elements\n" - "vld1.32 {d12-d14}, [%[ptr3]]! @ load r3, 6 elements\n" - "cmp %[has_alpha], #0\n" - "beq 0f\n" /* check whether alpha == 1? */ - "vmul.f32 q0, q0, %q[alpha]\n" /* mul alpha */ - "vmul.f32 d2, d2, %e[alpha]\n" /* mul alpha */ - "vmul.f32 q2, q2, %q[alpha]\n" /* mul alpha */ - "vmul.f32 d6, d6, %e[alpha]\n" /* mul alpha */ - "vmul.f32 q4, q4, %q[alpha]\n" /* mul alpha */ - "vmul.f32 d10, d10, %e[alpha]\n" /* mul alpha */ - "vmul.f32 q6, q6, %q[alpha]\n" /* mul alpha */ - "vmul.f32 d14, d14, %e[alpha]\n" /* mul alpha */ - "0: \n" - "vst1.32 {d0-d2}, [%[outptr]]! @ write to output ptr\n" - "vst1.32 {d4-d6}, [%[outptr]]! @ write to output ptr\n" - "vst1.32 {d8-d10}, [%[outptr]]! @ write to output ptr\n" - "vst1.32 {d12-d14}, [%[outptr]]! @ write to output ptr\n" - : [outptr] "+r"(ptr_out), - [ptr0] "+r"(ptr0), - [ptr1] "+r"(ptr1), - [ptr2] "+r"(ptr2), - [ptr3] "+r"(ptr3) - : [has_alpha] "r"(has_alpha), [alpha] "w"(valpha) - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "cc", "memory"); - outptr_row_col += stride_out; - } - if (right_pad > 0) { - float* ptr_out = outptr_row_col; - asm volatile( - "vld1.32 {d0-d2}, [%[ptr0]]! @ load r0, 6 elements\n" - "vld1.32 {d4-d6}, [%[ptr1]]! @ load r1, 6 elements\n" - "vld1.32 {d8-d10}, [%[ptr2]]! @ load r2, 8 elements\n" - "vld1.32 {d12-d14}, [%[ptr3]]! @ load r3, 8 elements\n" - "cmp %[has_alpha], #0\n" - "beq 0f\n" /* check whether alpha == 1? 
*/ - "vmul.f32 q0, q0, %q[alpha]\n" /* mul alpha */ - "vmul.f32 d2, d2, %e[alpha]\n" /* mul alpha */ - "vmul.f32 q2, q2, %q[alpha]\n" /* mul alpha */ - "vmul.f32 d6, d6, %e[alpha]\n" /* mul alpha */ - "vmul.f32 q4, q4, %q[alpha]\n" /* mul alpha */ - "vmul.f32 d10, d10, %e[alpha]\n" /* mul alpha */ - "vmul.f32 q6, q6, %q[alpha]\n" /* mul alpha */ - "vmul.f32 d14, d14, %e[alpha]\n" /* mul alpha */ - "0: \n" - "vbif q0, %q[vzero], %q[vmask1] @ bit select, pad zero\n" - "vbif d2, %e[vzero], %e[vmask2] @ bit select, pad zero\n" - "vbif q2, %q[vzero], %q[vmask1] @ bit select, pad zero\n" - "vbif d6, %e[vzero], %e[vmask2] @ bit select, pad zero\n" - "vst1.32 {d0-d2}, [%[outptr]]! @ write to output ptr\n" - "vst1.32 {d4-d6}, [%[outptr]]! @ write to output ptr\n" - "vbif q4, %q[vzero], %q[vmask1] @ bit select, pad zero\n" - "vbif d10, %e[vzero], %e[vmask2] @ bit select, pad zero\n" - "vbif q6, %q[vzero], %q[vmask1] @ bit select, pad zero\n" - "vbif d14, %e[vzero], %e[vmask2] @ bit select, pad zero\n" - "vst1.32 {d8-d10}, [%[outptr]]! @ write to output ptr\n" - "vst1.32 {d12-d14}, [%[outptr]]! @ write to output ptr\n" - : [outptr] "+r"(ptr_out), - [ptr0] "+r"(ptr0), - [ptr1] "+r"(ptr1), - [ptr2] "+r"(ptr2), - [ptr3] "+r"(ptr3) - : [vmask1] "w"(vmask1), - [vmask2] "w"(vmask2), - [vzero] "w"(vzero), - [has_alpha] "r"(has_alpha), - [alpha] "w"(valpha) - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "cc", "memory"); - } - } - -#pragma omp parallel for - for (int y = 4 * (y_len / 4); y < y_len; ++y) { - const float* ptr0 = inptr + y * ldin; - float* outptr_row_col = outptr_row + y * 6; - int i = 0; - for (; i < x_len - 5; i += 6) { - float* ptr_out = outptr_row_col; - asm volatile( - "vld1.32 {d0-d2}, [%[ptr0]]! @ load r0, 6 elements\n" - "cmp %[has_alpha], #0\n" - "beq 0f\n" /* check whether alpha == 1? */ - "vmul.f32 q0, q0, %q[alpha]\n" /* mul alpha */ - "vmul.f32 d2, d2, %e[alpha]\n" /* mul alpha */ - "0: \n" - "vst1.32 {d0-d2}, [%[outptr]]! @ write to output ptr\n" - : [ptr0] "+r"(ptr0), [outptr] "+r"(ptr_out) - : [has_alpha] "r"(has_alpha), [alpha] "w"(valpha) - : "q0", "q1", "cc", "memory"); - outptr_row_col += stride_out; - } - if (right_pad > 0) { - float* ptr_out = outptr_row_col; - asm volatile( - "vld1.32 {d0-d2}, [%[ptr0]]! @ load r0, 6 elements\n" - "cmp %[has_alpha], #0\n" - "beq 0f\n" /* check whether alpha == 1? */ - "vmul.f32 q0, q0, %q[alpha]\n" /* mul alpha */ - "vmul.f32 d2, d2, %e[alpha]\n" /* mul alpha */ - "0: \n" - "vbif q0, %q[vzero], %q[vmask1] @ bit select, pad zero\n" - "vbif d2, %e[vzero], %e[vmask2] @ bit select, pad zero\n" - "vst1.32 {d0-d2}, [%[outptr]]! 
@ write to output ptr\n" - : [ptr0] "+r"(ptr0), [outptr] "+r"(ptr_out) - : [vmask1] "w"(vmask1), - [vmask2] "w"(vmask2), - [vzero] "w"(vzero), - [has_alpha] "r"(has_alpha), - [alpha] "w"(valpha) - : "q0", "q1", "cc", "memory"); - } - } -} - -void prepackA_4x8(float* outptr, - const float* inptr, - float alpha, - int ldin, - int m0, - int mmax, - int k0, - int kmax) { - int x_len = kmax - k0; - float zerobuff[x_len]; // NOLINT - memset(zerobuff, 0, sizeof(float) * x_len); - - bool has_alpha = fabsf(alpha - 1.f) > 1e-8f; - float32x4_t valpha = vdupq_n_f32(alpha); - - for (int y = m0; y < mmax; y += 4) { - const float* inptr0 = inptr + y * ldin + k0; - const float* inptr1 = inptr0 + ldin; - const float* inptr2 = inptr1 + ldin; - const float* inptr3 = inptr2 + ldin; - - int x = x_len; - if ((y + 3) >= mmax) { - switch ((y + 3) - mmax) { - case 2: - inptr1 = zerobuff; - case 1: - inptr2 = zerobuff; - case 0: - inptr3 = zerobuff; - default: - break; - } - } - - for (; x > 7; x -= 8) { - asm volatile( - "vld4.32 {d0-d3}, [%[inptr0]]! @ zip load r0, " - "q0,q1=r00,r04,r01,r05,r02,r06,r03,r07\n" - "vld4.32 {d4-d7}, [%[inptr1]]! @ zip load r1, " - "q2,q3=r10,r14,r11,r15,r12,r16,r13,r17\n" - "vld4.32 {d8-d11}, [%[inptr2]]! @ zip load r2, " - "q4,q5=r20,r24,r21,r25,r22,r26,r23,r27\n" - "vld4.32 {d12-d15}, [%[inptr3]]! @ zip load r3, " - "q6,q7=r30,r34,r31,r35,r32,r36,r33,r37\n" - "cmp %[has_alpha], #0\n" - "beq 0f\n" /* check whether alpha == 1? */ - "vmul.f32 q0, q0, %q[alpha]\n" /* mul alpha */ - "vmul.f32 q1, q1, %q[alpha]\n" /* mul alpha */ - "vmul.f32 q2, q2, %q[alpha]\n" /* mul alpha */ - "vmul.f32 q3, q3, %q[alpha]\n" /* mul alpha */ - "vmul.f32 q4, q4, %q[alpha]\n" /* mul alpha */ - "vmul.f32 q5, q5, %q[alpha]\n" /* mul alpha */ - "vmul.f32 q6, q6, %q[alpha]\n" /* mul alpha */ - "vmul.f32 q7, q7, %q[alpha]\n" /* mul alpha */ - "0: \n" - "vtrn.32 q0, q2 @ trans data: q0=r00,r10,r01,r11; " - "q2=r04,r14,r05,r15\n" - "vtrn.32 q4, q6 @ trans data: q4=r20,r30,r21,r31; " - "q6=r24,r34,r25,r35\n" - - "vswp d1, d8 @ swap d1, d8, q0=r00,r10,r20,r30; " - "q4=r01,r11,r21,r31\n" - "vst1.32 {d0-d1}, [%[outptr]]! @ write q0:r00,r10,r20,r30\n" - "vst1.32 {d8-d9}, [%[outptr]]! @ write q4:r01,r11,r21,r31\n" - - "vtrn.32 q1, q3 @ trans data: q1=r02,r12,r03,r13; " - "q3=r06,r16,r07,r17\n" - "vtrn.32 q5, q7 @ trans data: q5=r22,r32,r23,r33; " - "q7=r26,r36,r27,r37\n" - - "vswp d3, d10 @ swap d3, d10, " - "q1=r02,r12,r22,r32; q5=r03,r13,r23,r33\n" - "vst1.32 {d2-d3}, [%[outptr]]! @ write q1:r02,r12,r22,r32\n" - "vst1.32 {d10-d11},[%[outptr]]! @ write q5:r03,r13,r23,r33\n" - - "vswp d5, d12 @ swap d5, d12,q2=r04,r14,r24,r34; " - "q6=r05,r15,r25,r35\n" - "vst1.32 {d4-d5}, [%[outptr]]! @ write q2:r04,r14,r24,r34\n" - "vst1.32 {d12-d13},[%[outptr]]! @ write q6:r05,r15,r25,r35\n" - - "vswp d7, d14 @ swap d7, d14, " - "q3=r06,r16,r26,r36; q7=r07,r17,r27,r37\n" - "vst1.32 {d6-d7}, [%[outptr]]! @ write q3:r06,r16,r26,r36\n" - "vst1.32 {d14-d15},[%[outptr]]! 
@ write q7:r07,r17,r27,r37\n" - : [inptr0] "+r"(inptr0), - [inptr1] "+r"(inptr1), - [inptr2] "+r"(inptr2), - [inptr3] "+r"(inptr3), - [outptr] "+r"(outptr) - : [has_alpha] "r"(has_alpha), [alpha] "w"(valpha) - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "cc", "memory"); - } - - for (; x > 0; x--) { - if (has_alpha) { - *outptr++ = *inptr0++ * alpha; - *outptr++ = *inptr1++ * alpha; - *outptr++ = *inptr2++ * alpha; - *outptr++ = *inptr3++ * alpha; - } else { - *outptr++ = *inptr0++; - *outptr++ = *inptr1++; - *outptr++ = *inptr2++; - *outptr++ = *inptr3++; - } - } - } -} - -void prepackA_trans_4x8(float* outptr, - const float* in, - float alpha, - int ldin, - int m0, - int mmax, - int k0, - int kmax) { - auto inptr = in + k0 * ldin + m0; - bool has_alpha = fabsf(alpha - 1.f) > 1e-8f; - float32x4_t valpha = vdupq_n_f32(alpha); - - uint32_t mask_buffer[8] = {0, 1, 2, 3, 4, 5, 6, 7}; - int x_len = mmax - m0; - int y_len = kmax - k0; - int right_remain = x_len - 4 * (x_len / 4); - int right_pad = 4 - right_remain; - if (right_remain == 0) { - right_pad = 0; - } - - int stride_out = 4 * y_len; - - float32x4_t vzero = vdupq_n_f32(0.f); - uint32x4_t vmask1 = - vcltq_u32(vld1q_u32(mask_buffer), vdupq_n_u32(right_remain)); - -#pragma omp parallel for - for (int y = 0; y < y_len - 3; y += 4) { - const float* ptr0 = inptr + y * ldin; - const float* ptr1 = ptr0 + ldin; - const float* ptr2 = ptr1 + ldin; - const float* ptr3 = ptr2 + ldin; - - float* outptr_row_col = outptr + y * 4; - int i = 0; - for (; i < x_len - 3; i += 4) { - float* ptr_out = outptr_row_col; - asm volatile( - "vld1.32 {d0-d1}, [%[ptr0]]! @ load r0, 4 elements\n" - "vld1.32 {d2-d3}, [%[ptr1]]! @ load r1, 4 elements\n" - "vld1.32 {d4-d5}, [%[ptr2]]! @ load r2, 4 elements\n" - "vld1.32 {d6-d7}, [%[ptr3]]! @ load r3, 4 elements\n" - "cmp %[has_alpha], #0\n" - "beq 0f\n" /* check whether alpha == 1? */ - "vmul.f32 q0, q0, %q[alpha]\n" /* mul alpha */ - "vmul.f32 q1, q1, %q[alpha]\n" /* mul alpha */ - "vmul.f32 q2, q2, %q[alpha]\n" /* mul alpha */ - "vmul.f32 q3, q3, %q[alpha]\n" /* mul alpha */ - "0: \n" - "vst1.32 {d0-d1}, [%[outptr]]! @ write to output ptr\n" - "vst1.32 {d2-d3}, [%[outptr]]! @ write to output ptr\n" - "vst1.32 {d4-d5}, [%[outptr]]! @ write to output ptr\n" - "vst1.32 {d6-d7}, [%[outptr]]! @ write to output ptr\n" - : [outptr] "+r"(ptr_out), - [ptr0] "+r"(ptr0), - [ptr1] "+r"(ptr1), - [ptr2] "+r"(ptr2), - [ptr3] "+r"(ptr3) - : [has_alpha] "r"(has_alpha), [alpha] "w"(valpha) - : "q0", "q1", "q2", "q3", "cc", "memory"); - outptr_row_col += stride_out; - } - if (right_pad > 0) { - float* ptr_out = outptr_row_col; - asm volatile( - "vld1.32 {d0-d1}, [%[ptr0]]! @ load r0, 4 elements\n" - "vld1.32 {d2-d3}, [%[ptr1]]! @ load r1, 4 elements\n" - "vld1.32 {d4-d5}, [%[ptr2]]! @ load r2, 4 elements\n" - "vld1.32 {d6-d7}, [%[ptr3]]! @ load r3, 4 elements\n" - "cmp %[has_alpha], #0\n" - "beq 0f\n" /* check whether alpha == 1? */ - "vmul.f32 q0, q0, %q[alpha]\n" /* mul alpha */ - "vmul.f32 q1, q1, %q[alpha]\n" /* mul alpha */ - "vmul.f32 q2, q2, %q[alpha]\n" /* mul alpha */ - "vmul.f32 q3, q3, %q[alpha]\n" /* mul alpha */ - "0: \n" - "vbif q0, %q[vzero], %q[vmask1] @ bit select, pad zero\n" - "vbif q1, %q[vzero], %q[vmask1] @ bit select, pad zero\n" - "vbif q2, %q[vzero], %q[vmask1] @ bit select, pad zero\n" - "vbif q3, %q[vzero], %q[vmask1] @ bit select, pad zero\n" - "vst1.32 {d0-d1}, [%[outptr]]! @ write to output ptr\n" - "vst1.32 {d2-d3}, [%[outptr]]! @ write to output ptr\n" - "vst1.32 {d4-d5}, [%[outptr]]! 
@ write to output ptr\n" - "vst1.32 {d6-d7}, [%[outptr]]! @ write to output ptr\n" - : [outptr] "+r"(ptr_out), - [ptr0] "+r"(ptr0), - [ptr1] "+r"(ptr1), - [ptr2] "+r"(ptr2), - [ptr3] "+r"(ptr3) - : [vmask1] "w"(vmask1), - [vzero] "w"(vzero), - [has_alpha] "r"(has_alpha), - [alpha] "w"(valpha) - : "q0", "q1", "q2", "q3", "cc", "memory"); - } - } - -#pragma omp parallel for - for (int y = 4 * (y_len / 4); y < y_len; ++y) { - const float* ptr0 = inptr + y * ldin; - float* outptr_row_col = outptr + y * 4; - int i = 0; - for (; i < x_len - 3; i += 4) { - float* ptr_out = outptr_row_col; - asm volatile( - "vld1.32 {d0-d1}, [%[ptr0]]! @ load r0, 4 elements\n" - "cmp %[has_alpha], #0\n" - "beq 0f\n" /* check whether alpha == 1? */ - "vmul.f32 q0, q0, %q[alpha]\n" /* mul alpha */ - "0: \n" - "vst1.32 {d0-d1}, [%[outptr]]! @ write to output ptr\n" - : [ptr0] "+r"(ptr0), [outptr] "+r"(ptr_out) - : [has_alpha] "r"(has_alpha), [alpha] "w"(valpha) - : "q0", "q1", "cc", "memory"); - outptr_row_col += stride_out; - } - if (right_pad > 0) { - float* ptr_out = outptr_row_col; - asm volatile( - "vld1.32 {d0-d1}, [%[ptr0]]! @ load r0, 4 elements\n" - "cmp %[has_alpha], #0\n" - "beq 0f\n" /* check whether alpha == 1? */ - "vmul.f32 q0, q0, %q[alpha]\n" /* mul alpha */ - "0: \n" - "vbif q0, %q[vzero], %q[vmask1] @ bit select, pad zero\n" - "vst1.32 {d0-d1}, [%[outptr]]! @ write to output ptr\n" - : [ptr0] "+r"(ptr0), [outptr] "+r"(ptr_out) - : [vmask1] "w"(vmask1), - [vzero] "w"(vzero), - [has_alpha] "r"(has_alpha), - [alpha] "w"(valpha) - : "q0", "q1", "cc", "memory"); - } - } -} - -#endif // __aarch64__ - -/** -* \brief input data is transpose -* for arm-v7a, transform data to block x k x 8 layout -* for arm-v8a, transform data to block x k x 12 layout -*/ -#ifdef __aarch64__ -void loadb( - float *out, const float *in, int ldin, int k0, int kmax, int n0, int nmax) { - auto outptr = reinterpret_cast(out); - auto inptr = reinterpret_cast(in) + k0 * ldin + n0; - uint32_t mask_buffer[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; - int x_len = nmax - n0; - int y_len = kmax - k0; - int right_remain = x_len - 12 * (x_len / 12); - int right_pad = 12 - right_remain; - - uint32_t *outptr_row = outptr; - int stride_out = 12 * y_len; - - uint32x4_t vzero = vdupq_n_u32(0); - uint32x4_t vmask1 = - vcltq_u32(vld1q_u32(mask_buffer), vdupq_n_u32(right_remain)); - uint32x4_t vmask2 = - vcltq_u32(vld1q_u32(mask_buffer + 4), vdupq_n_u32(right_remain)); - uint32x4_t vmask3 = - vcltq_u32(vld1q_u32(mask_buffer + 8), vdupq_n_u32(right_remain)); - -#pragma omp parallel for - for (int y = 0; y < y_len - 3; y += 4) { - const uint32_t *ptr0 = inptr + y * ldin; - const uint32_t *ptr1 = ptr0 + ldin; - const uint32_t *ptr2 = ptr1 + ldin; - const uint32_t *ptr3 = ptr2 + ldin; - asm volatile( - "prfm pldl1keep, [%[ptr0]] \n" - "prfm pldl1keep, [%[ptr0], #64] \n" - "prfm pldl1keep, [%[ptr1]] \n" - "prfm pldl1keep, [%[ptr1], #64] \n" - "prfm pldl1keep, [%[ptr2]] \n" - "prfm pldl1keep, [%[ptr2], #64] \n" - "prfm pldl1keep, [%[ptr3]] \n" - "prfm pldl1keep, [%[ptr3], #64] \n" - : - : [ptr0] "r"(ptr0), [ptr1] "r"(ptr1), [ptr2] "r"(ptr2), [ptr3] "r"(ptr3) - : "memory"); - - uint32_t *outptr_row_col = outptr_row + y * 12; - - int i = 0; - for (; i < x_len - 11; i += 12) { - uint32x4_t vr00 = vld1q_u32(ptr0); - uint32x4_t vr01 = vld1q_u32(ptr0 + 4); - uint32x4_t vr02 = vld1q_u32(ptr0 + 8); - - uint32x4_t vr10 = vld1q_u32(ptr1); - uint32x4_t vr11 = vld1q_u32(ptr1 + 4); - uint32x4_t vr12 = vld1q_u32(ptr1 + 8); - - vst1q_u32(outptr_row_col, vr00); - 
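// Note the scheduling in this loop: the loads for each following row
// (ptr1..ptr3) are issued in between the stores of the rows already held in
// registers, so an out-of-order core can overlap load and store latency.
// Each iteration copies a 4-row x 12-column tile of B into the packed
// block x k x 12 layout that the aarch64 sgemm kernel expects.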
vst1q_u32(outptr_row_col + 4, vr01); - vst1q_u32(outptr_row_col + 8, vr02); - - uint32x4_t vr20 = vld1q_u32(ptr2); - uint32x4_t vr21 = vld1q_u32(ptr2 + 4); - uint32x4_t vr22 = vld1q_u32(ptr2 + 8); - - vst1q_u32(outptr_row_col + 12, vr10); - vst1q_u32(outptr_row_col + 16, vr11); - vst1q_u32(outptr_row_col + 20, vr12); - - uint32x4_t vr30 = vld1q_u32(ptr3); - uint32x4_t vr31 = vld1q_u32(ptr3 + 4); - uint32x4_t vr32 = vld1q_u32(ptr3 + 8); - - vst1q_u32(outptr_row_col + 24, vr20); - vst1q_u32(outptr_row_col + 28, vr21); - vst1q_u32(outptr_row_col + 32, vr22); - - vst1q_u32(outptr_row_col + 36, vr30); - vst1q_u32(outptr_row_col + 40, vr31); - vst1q_u32(outptr_row_col + 44, vr32); - - ptr0 += 12; - ptr1 += 12; - ptr2 += 12; - ptr3 += 12; - - outptr_row_col += stride_out; - } - if (right_remain > 0) { - uint32x4_t vr00 = vld1q_u32(ptr0); - uint32x4_t vr01 = vld1q_u32(ptr0 + 4); - uint32x4_t vr02 = vld1q_u32(ptr0 + 8); - - uint32x4_t vr10 = vld1q_u32(ptr1); - uint32x4_t vr11 = vld1q_u32(ptr1 + 4); - uint32x4_t vr12 = vld1q_u32(ptr1 + 8); - - uint32x4_t vr00_1 = vbslq_u32(vmask1, vr00, vzero); - uint32x4_t vr01_1 = vbslq_u32(vmask2, vr01, vzero); - uint32x4_t vr02_1 = vbslq_u32(vmask3, vr02, vzero); - - uint32x4_t vr20 = vld1q_u32(ptr2); - uint32x4_t vr21 = vld1q_u32(ptr2 + 4); - uint32x4_t vr22 = vld1q_u32(ptr2 + 8); - - vst1q_u32(outptr_row_col, vr00_1); - vst1q_u32(outptr_row_col + 4, vr01_1); - vst1q_u32(outptr_row_col + 8, vr02_1); - - uint32x4_t vr10_1 = vbslq_u32(vmask1, vr10, vzero); - uint32x4_t vr11_1 = vbslq_u32(vmask2, vr11, vzero); - uint32x4_t vr12_1 = vbslq_u32(vmask3, vr12, vzero); - - uint32x4_t vr30 = vld1q_u32(ptr3); - uint32x4_t vr31 = vld1q_u32(ptr3 + 4); - uint32x4_t vr32 = vld1q_u32(ptr3 + 8); - - vst1q_u32(outptr_row_col + 12, vr10_1); - vst1q_u32(outptr_row_col + 16, vr11_1); - vst1q_u32(outptr_row_col + 20, vr12_1); - - uint32x4_t vr20_1 = vbslq_u32(vmask1, vr20, vzero); - uint32x4_t vr21_1 = vbslq_u32(vmask2, vr21, vzero); - uint32x4_t vr22_1 = vbslq_u32(vmask3, vr22, vzero); - - uint32x4_t vr30_1 = vbslq_u32(vmask1, vr30, vzero); - uint32x4_t vr31_1 = vbslq_u32(vmask2, vr31, vzero); - uint32x4_t vr32_1 = vbslq_u32(vmask3, vr32, vzero); - - vst1q_u32(outptr_row_col + 24, vr20_1); - vst1q_u32(outptr_row_col + 28, vr21_1); - vst1q_u32(outptr_row_col + 32, vr22_1); - - vst1q_u32(outptr_row_col + 36, vr30_1); - vst1q_u32(outptr_row_col + 40, vr31_1); - vst1q_u32(outptr_row_col + 44, vr32_1); - } - } - -#pragma omp parallel for - for (int y = 4 * (y_len / 4); y < y_len; ++y) { - const uint32_t *ptr0 = inptr + y * ldin; - uint32_t *outptr_row_col = outptr_row + y * 12; - - int i = 0; - for (; i < x_len - 11; i += 12) { - uint32x4_t vr0 = vld1q_u32(ptr0); - uint32x4_t vr1 = vld1q_u32(ptr0 + 4); - uint32x4_t vr2 = vld1q_u32(ptr0 + 8); - vst1q_u32(outptr_row_col, vr0); - vst1q_u32(outptr_row_col + 4, vr1); - vst1q_u32(outptr_row_col + 8, vr2); - - ptr0 += 12; - - outptr_row_col += stride_out; - } - if (right_remain > 0) { - uint32x4_t vr0 = vld1q_u32(ptr0); - uint32x4_t vr1 = vld1q_u32(ptr0 + 4); - uint32x4_t vr2 = vld1q_u32(ptr0 + 8); - - uint32x4_t vr0_1 = vbslq_u32(vmask1, vr0, vzero); - uint32x4_t vr1_1 = vbslq_u32(vmask2, vr1, vzero); - uint32x4_t vr2_1 = vbslq_u32(vmask3, vr2, vzero); - - vst1q_u32(outptr_row_col, vr0_1); - vst1q_u32(outptr_row_col + 4, vr1_1); - vst1q_u32(outptr_row_col + 8, vr2_1); - } - } -} - -void loadb_trans( - float *out, const float *in, int ldin, int k0, int kmax, int n0, int nmax) { - int x_len = kmax - k0; - uint32_t zerobuff[x_len]; // NOLINT - 
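// zerobuff is a zero-filled stand-in row: when the last 12-column block of B
// runs past nmax, the fall-through switch below redirects the out-of-range
// row pointers to this buffer, so the transpose kernel can always read from
// twelve row pointers without per-element bounds checks.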
memset(zerobuff, 0, sizeof(uint32_t) * x_len); - auto outptr = reinterpret_cast(out); - auto inptr = reinterpret_cast(in); - - //! data B is not transposed, transpose B to k * 12 - for (int y = n0; y < nmax; y += 12) { - const uint32_t *inptr0 = inptr + y * ldin + k0; - const uint32_t *inptr1 = inptr0 + ldin; - const uint32_t *inptr2 = inptr1 + ldin; - const uint32_t *inptr3 = inptr2 + ldin; - const uint32_t *inptr4 = inptr3 + ldin; - const uint32_t *inptr5 = inptr4 + ldin; - const uint32_t *inptr6 = inptr5 + ldin; - const uint32_t *inptr7 = inptr6 + ldin; - const uint32_t *inptr8 = inptr7 + ldin; - const uint32_t *inptr9 = inptr8 + ldin; - const uint32_t *inptr10 = inptr9 + ldin; - const uint32_t *inptr11 = inptr10 + ldin; - - asm volatile( - "prfm pldl1keep, [%[ptr0]] \n" - "prfm pldl1keep, [%[ptr0], #64] \n" - "prfm pldl1keep, [%[ptr1]] \n" - "prfm pldl1keep, [%[ptr1], #64] \n" - "prfm pldl1keep, [%[ptr2]] \n" - "prfm pldl1keep, [%[ptr2], #64] \n" - "prfm pldl1keep, [%[ptr3]] \n" - "prfm pldl1keep, [%[ptr3], #64] \n" - "prfm pldl1keep, [%[ptr4]] \n" - "prfm pldl1keep, [%[ptr4], #64] \n" - "prfm pldl1keep, [%[ptr5]] \n" - "prfm pldl1keep, [%[ptr5], #64] \n" - "prfm pldl1keep, [%[ptr6]] \n" - "prfm pldl1keep, [%[ptr6], #64] \n" - "prfm pldl1keep, [%[ptr7]] \n" - "prfm pldl1keep, [%[ptr7], #64] \n" - "prfm pldl1keep, [%[ptr8]] \n" - "prfm pldl1keep, [%[ptr8], #64] \n" - "prfm pldl1keep, [%[ptr9]] \n" - "prfm pldl1keep, [%[ptr9], #64] \n" - "prfm pldl1keep, [%[ptr10]] \n" - "prfm pldl1keep, [%[ptr10], #64] \n" - "prfm pldl1keep, [%[ptr11]] \n" - "prfm pldl1keep, [%[ptr11], #64] \n" - : - : [ptr0] "r"(inptr0), - [ptr1] "r"(inptr1), - [ptr2] "r"(inptr2), - [ptr3] "r"(inptr3), - [ptr4] "r"(inptr4), - [ptr5] "r"(inptr5), - [ptr6] "r"(inptr6), - [ptr7] "r"(inptr7), - [ptr8] "r"(inptr8), - [ptr9] "r"(inptr9), - [ptr10] "r"(inptr10), - [ptr11] "r"(inptr11) - : "memory"); - - int x = x_len; - - //! 
cope with row index exceed real size, set to zero buffer - if ((y + 11) >= nmax) { - switch ((y + 11) - nmax) { - case 10: - inptr1 = zerobuff; - case 9: - inptr2 = zerobuff; - case 8: - inptr3 = zerobuff; - case 7: - inptr4 = zerobuff; - case 6: - inptr5 = zerobuff; - case 5: - inptr6 = zerobuff; - case 4: - inptr7 = zerobuff; - case 3: - inptr8 = zerobuff; - case 2: - inptr9 = zerobuff; - case 1: - inptr10 = zerobuff; - case 0: - inptr11 = zerobuff; - default: - break; - } - } - for (; x > 7; x -= 8) { - asm volatile( - "ldp q0, q1, [%[inptr0]], #32\n" /* r0, a0~a7 */ - "ldp q2, q3, [%[inptr1]], #32\n" /* r1, b0~b7 */ - "ldp q4, q5, [%[inptr2]], #32\n" /* r2, c0~c7 */ - "ldp q6, q7, [%[inptr3]], #32\n" /* r3, d0~d7 */ - - "zip1 v16.4s, v0.4s, v4.4s\n" /* a0c0a1c1 */ - "zip1 v17.4s, v2.4s, v6.4s\n" /* b0d0b1d1 */ - "prfm pldl1keep, [%[inptr0], #128] \n" - - "ldp q8, q9, [%[inptr4]], #32\n" /* r4, e0~e7 */ - "ldp q10, q11, [%[inptr5]], #32\n" /* r5, f0~f7 */ - "ldp q12, q13, [%[inptr6]], #32\n" /* r6, g0~g7 */ - "ldp q14, q15, [%[inptr7]], #32\n" /* r7, h0~h7 */ - - "zip1 v18.4s, v8.4s, v12.4s\n" /* e0g0e1g1 */ - "zip1 v19.4s, v10.4s, v14.4s\n" /* f0h0f1h1 */ - "prfm pldl1keep, [%[inptr1], #128]\n" - "zip1 v20.4s, v16.4s, v17.4s\n" /* a0b0c0d0 */ - "zip1 v21.4s, v18.4s, v19.4s\n" /* e0f0g0h0 */ - "prfm pldl1keep, [%[inptr2], #128]\n" - "zip2 v22.4s, v16.4s, v17.4s\n" /* a1b1c1d1 */ - "zip2 v23.4s, v18.4s, v19.4s\n" /* e1f1g1h1 */ - - "ldp q24, q25, [%[inptr8]], #32\n" /* r8, i0~i7 */ - "ldp q26, q27, [%[inptr9]], #32\n" /* r9, j0~j7 */ - "ldp q28, q29, [%[inptr10]], #32\n" /* r10, k0~k7 */ - "ldp q30, q31, [%[inptr11]], #32\n" /* r11, l0~l7 */ - - "stp q20, q21, [%[outptr]], #32\n" /* save a0~h0 */ - "prfm pldl1keep, [%[inptr3], #128]\n" - - "zip1 v16.4s, v24.4s, v28.4s\n" /* i0k0i1k1 */ - "zip1 v17.4s, v26.4s, v30.4s\n" /* j0l0j1l1 */ - "prfm pldl1keep, [%[inptr4], #128]\n" - "zip1 v18.4s, v16.4s, v17.4s\n" /* i0j0k0l0 */ - "zip2 v19.4s, v16.4s, v17.4s\n" /* i1j1k1l1 */ - "prfm pldl1keep, [%[inptr5], #128]\n" - "zip2 v16.4s, v0.4s, v4.4s\n" /* a2c2a3c3 */ - "zip2 v17.4s, v2.4s, v6.4s\n" /* b2d2b3d3 */ - - "str q18, [%[outptr]], #16\n" /* save j0~l0 */ - "stp q22, q23, [%[outptr]], #32\n" /* save a1~h1 */ - "str q19, [%[outptr]], #16\n" /* save j1~l1 */ - - "zip2 v18.4s, v8.4s, v12.4s\n" /* e2g2e3g3 */ - "zip2 v19.4s, v10.4s, v14.4s\n" /* f2h2f3h3 */ - "prfm pldl1keep, [%[inptr6], #128]\n" - "zip1 v20.4s, v16.4s, v17.4s\n" /* a2b2c2d2 */ - "zip1 v21.4s, v18.4s, v19.4s\n" /* e2f2g2h2 */ - "prfm pldl1keep, [%[inptr7], #128]\n" - "zip2 v22.4s, v16.4s, v17.4s\n" /* a3b3c3d3 */ - "zip2 v23.4s, v18.4s, v19.4s\n" /* e3f3g3h3 */ - "prfm pldl1keep, [%[inptr8], #128]\n" - "zip2 v16.4s, v24.4s, v28.4s\n" /* i2k2i3k3 */ - "zip2 v17.4s, v26.4s, v30.4s\n" /* j2l2j3l3 */ - - "stp q20, q21, [%[outptr]], #32\n" /* save a2~h2 */ - - "zip1 v18.4s, v16.4s, v17.4s\n" /* i2j2k2l2 */ - "zip2 v19.4s, v16.4s, v17.4s\n" /* i3j3k3l3 */ - "prfm pldl1keep, [%[inptr9], #128]\n" - "zip1 v16.4s, v1.4s, v5.4s\n" /* a4c4a5c5 */ - "zip1 v17.4s, v3.4s, v7.4s\n" /* b4d4b5d5 */ - - "str q18, [%[outptr]], #16\n" /* save i2~l2 */ - "stp q22, q23, [%[outptr]], #32\n" /* save a3~h3 */ - "str q19, [%[outptr]], #16\n" /* save i3~l3 */ - - "zip1 v18.4s, v9.4s, v13.4s\n" /* e4g4e5g5 */ - "zip1 v19.4s, v11.4s, v15.4s\n" /* f4h4f5h5 */ - "prfm pldl1keep, [%[inptr10], #128]\n" - "zip1 v20.4s, v16.4s, v17.4s\n" /* a4b4c4d4 */ - "zip1 v21.4s, v18.4s, v19.4s\n" /* e4f4g4h4 */ - "prfm pldl1keep, [%[inptr11], #128]\n" - "zip2 v22.4s, v16.4s, 
v17.4s\n" /* a5b5c5d5 */ - "zip2 v23.4s, v18.4s, v19.4s\n" /* e5f5g5h5 */ - "zip1 v16.4s, v25.4s, v29.4s\n" /* i4k4i5k5 */ - "zip1 v17.4s, v27.4s, v31.4s\n" /* j4l4j5l5 */ - - "stp q20, q21, [%[outptr]], #32\n" /* save a4~h4 */ - - "zip1 v18.4s, v16.4s, v17.4s\n" /* i4j4k4l4 */ - "zip2 v19.4s, v16.4s, v17.4s\n" /* i5j5k5l5 */ - "zip2 v16.4s, v1.4s, v5.4s\n" /* a6c6a7c7 */ - "zip2 v17.4s, v3.4s, v7.4s\n" /* b6d6b7d7 */ - - "str q18, [%[outptr]], #16\n" /* save i4~l4 */ - "stp q22, q23, [%[outptr]], #32\n" /* save a5~h5 */ - "str q19, [%[outptr]], #16\n" /* save i5~l5 */ - - "zip2 v18.4s, v9.4s, v13.4s\n" /* e6g6e7g7 */ - "zip2 v19.4s, v11.4s, v15.4s\n" /* f6h6f7h7 */ - "zip1 v20.4s, v16.4s, v17.4s\n" /* a6b6c6d6 */ - "zip1 v21.4s, v18.4s, v19.4s\n" /* e6f6g6h6 */ - "zip2 v22.4s, v16.4s, v17.4s\n" /* a7b7c7d7 */ - "zip2 v23.4s, v18.4s, v19.4s\n" /* e7f7g7h7 */ - "zip2 v16.4s, v25.4s, v29.4s\n" /* i6k6i7k7 */ - "zip2 v17.4s, v27.4s, v31.4s\n" /* j6l6j7l7 */ - - "stp q20, q21, [%[outptr]], #32\n" /* save a6~h6 */ - - "zip1 v18.4s, v16.4s, v17.4s\n" /* i6j6k6l6 */ - "zip2 v19.4s, v16.4s, v17.4s\n" /* i7j7k7l7 */ - - "str q18, [%[outptr]], #16\n" /* save i6~l6 */ - "stp q22, q23, [%[outptr]], #32\n" /* save a7~h7 */ - "str q19, [%[outptr]], #16\n" /* save i7~l7 */ - : [inptr0] "+r"(inptr0), - [inptr1] "+r"(inptr1), - [inptr2] "+r"(inptr2), - [inptr3] "+r"(inptr3), - [inptr4] "+r"(inptr4), - [inptr5] "+r"(inptr5), - [inptr6] "+r"(inptr6), - [inptr7] "+r"(inptr7), - [inptr8] "+r"(inptr8), - [inptr9] "+r"(inptr9), - [inptr10] "+r"(inptr10), - [inptr11] "+r"(inptr11), - [outptr] "+r"(outptr) - : - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "v26", - "v27", - "v28", - "v29", - "v30", - "v31", - "cc", - "memory"); - } - - for (; x > 0; x--) { - *outptr++ = *inptr0++; - *outptr++ = *inptr1++; - *outptr++ = *inptr2++; - *outptr++ = *inptr3++; - *outptr++ = *inptr4++; - *outptr++ = *inptr5++; - *outptr++ = *inptr6++; - *outptr++ = *inptr7++; - *outptr++ = *inptr8++; - *outptr++ = *inptr9++; - *outptr++ = *inptr10++; - *outptr++ = *inptr11++; - } - } -} - -#else // __aarch64__ -void loadb( - float* out, const float* in, int ldin, int k0, int kmax, int n0, int nmax) { - auto outptr = reinterpret_cast(out); - auto inptr = reinterpret_cast(in) + k0 * ldin + n0; - uint32_t mask_buffer[8] = {0, 1, 2, 3, 4, 5, 6, 7}; - int x_len = nmax - n0; - int y_len = kmax - k0; - int right_remain = x_len - 8 * (x_len / 8); - int right_pad = 8 - right_remain; - - uint32_t* outptr_row = outptr; - int stride_out = 8 * y_len; - - uint32x4_t vzero = vdupq_n_u32(0); - uint32x4_t vmask1 = - vcltq_u32(vld1q_u32(mask_buffer), vdupq_n_u32(right_remain)); - uint32x4_t vmask2 = - vcltq_u32(vld1q_u32(mask_buffer + 4), vdupq_n_u32(right_remain)); - -#pragma omp parallel for - for (int y = 0; y < y_len - 3; y += 4) { - const uint32_t* ptr0 = inptr + y * ldin; - const uint32_t* ptr1 = ptr0 + ldin; - const uint32_t* ptr2 = ptr1 + ldin; - const uint32_t* ptr3 = ptr2 + ldin; - uint32_t* outptr_row_col = outptr_row + y * 8; - int i = 0; - for (; i < x_len - 7; i += 8) { - uint32_t* ptr_out = outptr_row_col; - asm volatile( - "vld1.32 {d0-d3}, [%[ptr0]]! @ load r0, 8 elements\n" - "vld1.32 {d4-d7}, [%[ptr1]]! @ load r1, 8 elements\n" - "vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" - "vst1.32 {d4-d7}, [%[outptr]]! 
@ write to output ptr\n" - - "vld1.32 {d0-d3}, [%[ptr2]]! @ load r2, 8 elements\n" - "vld1.32 {d4-d7}, [%[ptr3]]! @ load r3, 8 elements\n" - "vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" - "vst1.32 {d4-d7}, [%[outptr]]! @ write to output ptr\n" - : [outptr] "+r"(ptr_out), - [ptr0] "+r"(ptr0), - [ptr1] "+r"(ptr1), - [ptr2] "+r"(ptr2), - [ptr3] "+r"(ptr3) - : - : "q0", "q1", "q2", "q3", "cc", "memory"); - outptr_row_col += stride_out; - } - if (right_remain > 0) { - uint32_t* ptr_out = outptr_row_col; - asm volatile( - "vld1.32 {d0-d3}, [%[ptr0]]! @ load r0, 8 elements\n" - "vld1.32 {d4-d7}, [%[ptr1]]! @ load r1, 8 elements\n" - "vbif q0, %q[vzero], %q[vmask1] @ bit select, pad zero\n" - "vbif q1, %q[vzero], %q[vmask2] @ bit select, pad zero\n" - //"vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" - "vbif q2, %q[vzero], %q[vmask1] @ bit select, pad zero\n" - "vbif q3, %q[vzero], %q[vmask2] @ bit select, pad zero\n" - "vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" - "vst1.32 {d4-d7}, [%[outptr]]! @ write to output ptr\n" - - "vld1.32 {d0-d3}, [%[ptr2]]! @ load r2, 8 elements\n" - "vld1.32 {d4-d7}, [%[ptr3]]! @ load r3, 8 elements\n" - "vbif q0, %q[vzero], %q[vmask1] @ bit select, pad zero\n" - "vbif q1, %q[vzero], %q[vmask2] @ bit select, pad zero\n" - //"vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" - "vbif q2, %q[vzero], %q[vmask1] @ bit select, pad zero\n" - "vbif q3, %q[vzero], %q[vmask2] @ bit select, pad zero\n" - "vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" - "vst1.32 {d4-d7}, [%[outptr]]! @ write to output ptr\n" - : [outptr] "+r"(ptr_out), - [ptr0] "+r"(ptr0), - [ptr1] "+r"(ptr1), - [ptr2] "+r"(ptr2), - [ptr3] "+r"(ptr3) - : [vmask1] "w"(vmask1), [vmask2] "w"(vmask2), [vzero] "w"(vzero) - : "q0", "q1", "q2", "q3", "cc", "memory"); - } - } -#pragma omp parallel for - for (int y = 4 * (y_len / 4); y < y_len; ++y) { - const uint32_t* ptr0 = inptr + y * ldin; - uint32_t* outptr_row_col = outptr_row + y * 8; - int i = 0; - for (; i < x_len - 7; i += 8) { - uint32_t* ptr_out = outptr_row_col; - asm volatile( - "vld1.32 {d0-d3}, [%[ptr0]]! @ load r0, 8 elements\n" - "vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" - : [ptr0] "+r"(ptr0), [outptr] "+r"(ptr_out) - : - : "q0", "q1", "cc", "memory"); - outptr_row_col += stride_out; - } - if (right_remain > 0) { - uint32_t* ptr_out = outptr_row_col; - asm volatile( - "vld1.32 {d0-d3}, [%[ptr0]]! @ load r0, 8 elements\n" - "vbif q0, %q[vzero], %q[vmask1] @ bit select, pad zero\n" - "vbif q1, %q[vzero], %q[vmask2] @ bit select, pad zero\n" - "vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" - : [ptr0] "+r"(ptr0), [outptr] "+r"(ptr_out) - : [vmask1] "w"(vmask1), [vmask2] "w"(vmask2), [vzero] "w"(vzero) - : "q0", "q1", "cc", "memory"); - } - } -} - -void loadb_trans( - float* out, const float* in, int ldin, int k0, int kmax, int n0, int nmax) { - int x_len = kmax - k0; - uint32_t zerobuff[x_len]; // NOLINT - memset(zerobuff, 0, sizeof(uint32_t) * x_len); - - auto outptr = reinterpret_cast(out); - auto inptr = reinterpret_cast(in); - //! data B is not transposed, transpose B to k * 8 - for (int y = n0; y < nmax; y += 8) { - const uint32_t* inptr0 = inptr + y * ldin + k0; - const uint32_t* inptr1 = inptr0 + ldin; - const uint32_t* inptr2 = inptr1 + ldin; - const uint32_t* inptr3 = inptr2 + ldin; - const uint32_t* inptr4 = inptr3 + ldin; - const uint32_t* inptr5 = inptr4 + ldin; - const uint32_t* inptr6 = inptr5 + ldin; - const uint32_t* inptr7 = inptr6 + ldin; - - int x = x_len; - - //! 
cope with row index exceed real size, set to zero buffer - if ((y + 7) >= nmax) { - switch ((y + 7) - nmax) { - case 6: - inptr1 = zerobuff; - case 5: - inptr2 = zerobuff; - case 4: - inptr3 = zerobuff; - case 3: - inptr4 = zerobuff; - case 2: - inptr5 = zerobuff; - case 1: - inptr6 = zerobuff; - case 0: - inptr7 = zerobuff; - default: - break; - } - } - - for (; x > 7; x -= 8) { - //! zip load 8 elements (2 neon Q registers) from each of 8 rows - asm volatile( - "vld4.32 {d0-d3}, [%[inptr0]]! @ zip load r0, " - "q0,q1=r00,r04,r01,r05,r02,r06,r03,r07\n" - "vld4.32 {d4-d7}, [%[inptr1]]! @ zip load r1, " - "q2,q3=r10,r14,r11,r15,r12,r16,r13,r17\n" - "vtrn.32 q0, q2 @ trans data: q0=r00,r10,r01,r11; " - "q2=r04,r14,r05,r15\n" - "vst1.32 {d0}, [%[outptr]]! @ write d0(q0,low),r00,r10\n" - - "vld4.32 {d8-d11}, [%[inptr2]]! @ zip load r2, " - "q4,q5=r20,r24,r21,r25,r22,r26,r23,r27\n" - "vld4.32 {d12-d15}, [%[inptr3]]! @ zip load r3, " - "q6,q7=r30,r34,r31,r35,r32,r36,r33,r37\n" - "vtrn.32 q4, q6 @ trans data: q4=r20,r30,r21,r31; " - "q6=r24,r34,r25,r35\n" - "vst1.32 {d8}, [%[outptr]]! @ write d8(q4,low),r20,r30\n" - - "vld4.32 {d16-d19}, [%[inptr4]]! @ zip load r4, " - "q8,q9=r40,r44,r41,r45,r42,r46,r43,r47\n" - "vld4.32 {d20-d23}, [%[inptr5]]! @ zip load r5, " - "q10,q11=r50,r54,r51,r55,r52,r56,r53,r57\n" - "vtrn.32 q8, q10 @ trans data: q8=r40,r50,r41,r51; " - "q10=r44,r54,r45,r55\n" - "vst1.32 {d16}, [%[outptr]]! @ write d16(q8,low),r40,r50\n" - - "vld4.32 {d24-d27}, [%[inptr6]]! @ zip load r6, " - "q12,q13=r60,r64,r61,r65,r62,r66,r63,r67\n" - "vld4.32 {d28-d31}, [%[inptr7]]! @ zip load r7, " - "q14,q15=r70,r74,r71,r75,r72,r76,r73,r77\n" - "vtrn.32 q12, q14 @ trans data:q12=r60,r70,r61,r71; " - "q14=r64,r74,r65,r75\n" - "vst1.32 {d24}, [%[outptr]]! @ write d24(q8,low),r60,r70\n" - - //"pld [%[inptr0], #128] @ preload r0 data to cache, fill - // pipeline\n" - "vst1.32 {d1}, [%[outptr]]! @ write d1(q0,high),r01,r11\n" - "vst1.32 {d9}, [%[outptr]]! @ write d9(q4,high),r21,r31\n" - "vst1.32 {d17}, [%[outptr]]! @ write d17(q8,high),r41,r51\n" - "vst1.32 {d25}, [%[outptr]]! @ write d25(q12,high),r61,r71\n" - - "vtrn.32 q1, q3 @ trans data: q1=r02,r12,r03,r13; " - "q3=r06,r16,r07,r17\n" - "vst1.32 {d2}, [%[outptr]]! @ write d2(q1,low),r02,r12\n" - "vtrn.32 q5, q7 @ trans data: q5=r22,r32,r23,r33; " - "q7=r26,r36,r27,r37\n" - "vst1.32 {d10}, [%[outptr]]! @ write d10(q5,low),r22,r32\n" - "vtrn.32 q9, q11 @ trans data: q9=r42,r52,r43,r53; " - "q11=r46,r56,r47,r57\n" - "vst1.32 {d18}, [%[outptr]]! @ write d18(q9,low),r42,r52\n" - "vtrn.32 q13, q15 @ trans data:q13=r62,r72,r63,r73; " - "q15=r66,r76,r67,r77\n" - "vst1.32 {d26}, [%[outptr]]! @ write d18(q9,low),r62,r72\n" - - //"pld [%[inptr1], #128] @ preload r1 data to cache, fill - // pipeline\n" - "vst1.32 {d3}, [%[outptr]]! @ write d3(q1,high),r03,r13\n" - "vst1.32 {d11}, [%[outptr]]! @ write d11(q5,high),r23,r33\n" - "vst1.32 {d19}, [%[outptr]]! @ write d19(q9,high),r43,r53\n" - "vst1.32 {d27}, [%[outptr]]! @ write d27(q13,high),r63,r73\n" - - //"pld [%[inptr2], #128] @ preload r2 data to cache, fill - // pipeline\n" - "vst1.32 {d4}, [%[outptr]]! @ write d4(q2,low),r04,r14\n" - "vst1.32 {d12}, [%[outptr]]! @ write d12(q6,low),r24,r34\n" - "vst1.32 {d20}, [%[outptr]]! @ write d20(q10,low),r44,r54\n" - "vst1.32 {d28}, [%[outptr]]! @ write d28(q14,low),r64,r74\n" - - //"pld [%[inptr3], #128] @ preload r3 data to cache, fill - // pipeline\n" - "vst1.32 {d5}, [%[outptr]]! @ write d5(q2,high),r05,r15\n" - "vst1.32 {d13}, [%[outptr]]! 
@ write d13(q6,high),r25,r35\n" - "vst1.32 {d21}, [%[outptr]]! @ write d21(q10,high),r45,r55\n" - "vst1.32 {d29}, [%[outptr]]! @ write d29(q14,high),r65,r75\n" - - //"pld [%[inptr4], #128] @ preload r4 data to cache, fill - // pipeline\n" - "vst1.32 {d6}, [%[outptr]]! @ write d6(q3,low),r06,r16\n" - "vst1.32 {d14}, [%[outptr]]! @ write d14(q7,low),r26,r36\n" - "vst1.32 {d22}, [%[outptr]]! @ write d22(q11,low),r46,r56\n" - "vst1.32 {d30}, [%[outptr]]! @ write d30(q15,low),r66,r76\n" - - //"pld [%[inptr5], #128] @ preload r5 data to cache, fill - // pipeline\n" - "vst1.32 {d7}, [%[outptr]]! @ write d7(q3,high),r07,r17\n" - "vst1.32 {d15}, [%[outptr]]! @ write d15(q7,high),r27,r37\n" - "vst1.32 {d23}, [%[outptr]]! @ write d23(q11,high),r47,r57\n" - "vst1.32 {d31}, [%[outptr]]! @ write d31(q15,high),r67,r77\n" - : [inptr0] "+r"(inptr0), - [inptr1] "+r"(inptr1), - [inptr2] "+r"(inptr2), - [inptr3] "+r"(inptr3), - [inptr4] "+r"(inptr4), - [inptr5] "+r"(inptr5), - [inptr6] "+r"(inptr6), - [inptr7] "+r"(inptr7), - [outptr] "+r"(outptr) - : - : "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15", - "cc", - "memory"); - } - - for (; x > 0; x--) { - *outptr++ = *inptr0++; - *outptr++ = *inptr1++; - *outptr++ = *inptr2++; - *outptr++ = *inptr3++; - *outptr++ = *inptr4++; - *outptr++ = *inptr5++; - *outptr++ = *inptr6++; - *outptr++ = *inptr7++; - } - } -} - -#endif // __aarch64__ - -#ifdef __aarch64__ -void sgemm_prepacked_8x12(bool is_transB, - int M, - int N, - int K, - const float *A_packed, - const float *B, - int ldb, - float beta, - float *C, - int ldc, - const float *bias, - bool has_bias, - bool has_relu, - ARMContext *ctx) { - size_t l2_cache = ctx->llc_size() > 0 ? ctx->llc_size() : 512 * 1024; - auto workspace = ctx->workspace_data(); - int threads = ctx->threads(); - //! MBLOCK * x (result) + MBLOCK * k (A) + x * k (B) = l2 - int x_block = (l2_cache - (MBLOCK * K)) / (sizeof(float) * (K + MBLOCK)); - x_block /= NBLOCK; - x_block *= NBLOCK; - int x_num = (N + (x_block - 1)) / x_block; - x_block = (N + x_num - 1) / x_num; - x_block = (x_block + NBLOCK - 1) / NBLOCK; - x_block *= NBLOCK; - x_block = x_block < NBLOCK ? NBLOCK : x_block; - - // unroll 2 loop - int tail_pre = (K & (KBLOCK - 1)); - int k_pre = ((K + KBLOCK - 1) / KBLOCK) - 1; - - bool flag_p_remain = false; - int remain = 0; - - int has_beta = fabsf(beta) > 1e-8f ? 1 : 0; - - //! apanel is pre_compute outside gemm - for (unsigned int x0 = 0; x0 < N; x0 += x_block) { - unsigned int xmax = x0 + x_block; - if (xmax > N) { - xmax = N; - } - int bblocks = (xmax - x0 + NBLOCK - 1) / NBLOCK; - remain = xmax - x0 - (bblocks - 1) * NBLOCK; - if (remain > 0) { - flag_p_remain = true; - } - //! 
load bpanel - float *b_pannel = workspace; - if (is_transB) { - loadb_trans(b_pannel, B, ldb, 0, K, x0, xmax); - } else { - loadb(b_pannel, B, ldb, 0, K, x0, xmax); - } -#pragma omp parallel for num_threads(threads) - for (unsigned int y = 0; y < M; y += MBLOCK) { - unsigned int ymax = y + MBLOCK; - if (ymax > M) { - ymax = M; - } - - float bias_local[8] = {0}; - if (has_bias) { - bias_local[0] = bias[y]; - bias_local[1] = bias[y + 1]; - bias_local[2] = bias[y + 2]; - bias_local[3] = bias[y + 3]; - bias_local[4] = bias[y + 4]; - bias_local[5] = bias[y + 5]; - bias_local[6] = bias[y + 6]; - bias_local[7] = bias[y + 7]; - } - - float cout0[NBLOCK]; - float cout1[NBLOCK]; - float cout2[NBLOCK]; - float cout3[NBLOCK]; - float cout4[NBLOCK]; - float cout5[NBLOCK]; - float cout6[NBLOCK]; - float cout7[NBLOCK]; - - float *c_ptr0 = C + y * ldc + x0; - float *c_ptr1 = c_ptr0 + ldc; - float *c_ptr2 = c_ptr1 + ldc; - float *c_ptr3 = c_ptr2 + ldc; - float *c_ptr4 = c_ptr3 + ldc; - float *c_ptr5 = c_ptr4 + ldc; - float *c_ptr6 = c_ptr5 + ldc; - float *c_ptr7 = c_ptr6 + ldc; - - float *pout0 = c_ptr0; - float *pout1 = c_ptr1; - float *pout2 = c_ptr2; - float *pout3 = c_ptr3; - float *pout4 = c_ptr4; - float *pout5 = c_ptr5; - float *pout6 = c_ptr6; - float *pout7 = c_ptr7; - - const float *a_ptr_l = A_packed + y * K; - const float *b_ptr = b_pannel; - for (int xb = 0; xb < bblocks; xb++) { - if ((y + 7) >= ymax) { - switch ((y + 7) - ymax) { - case 6: - c_ptr1 = cout1; - case 5: - c_ptr2 = cout2; - case 4: - c_ptr3 = cout3; - case 3: - c_ptr4 = cout4; - case 2: - c_ptr5 = cout5; - case 1: - c_ptr6 = cout6; - case 0: - c_ptr7 = cout7; - default: - break; - } - } - if (flag_p_remain && (xb == bblocks - 1)) { - pout0 = c_ptr0; - pout1 = c_ptr1; - pout2 = c_ptr2; - pout3 = c_ptr3; - pout4 = c_ptr4; - pout5 = c_ptr5; - pout6 = c_ptr6; - pout7 = c_ptr7; - - c_ptr0 = cout0; - c_ptr1 = cout1; - c_ptr2 = cout2; - c_ptr3 = cout3; - c_ptr4 = cout4; - c_ptr5 = cout5; - c_ptr6 = cout6; - c_ptr7 = cout7; - if (has_beta) { - for (int i = 0; i < remain; ++i) { - cout0[i] = pout0[i]; - cout1[i] = pout1[i]; - cout2[i] = pout2[i]; - cout3[i] = pout3[i]; - cout4[i] = pout4[i]; - cout5[i] = pout5[i]; - cout6[i] = pout6[i]; - cout7[i] = pout7[i]; - } - } - } - const float *a_ptr = a_ptr_l; - int tail = tail_pre; - int k = k_pre; - - asm volatile( - "prfm pldl1keep, [%[a_ptr]]\n" /* preload a*/ - "ldp q2, q3, [%[bias_ptr]]\n" /* load bias to q2, q3*/ - "dup v8.4s, v2.s[0]\n" /* out0 = 0 */ - "dup v9.4s, v2.s[0]\n" /* out1 = 0*/ - "dup v10.4s, v2.s[0]\n" /* out2 = 0*/ - "prfm pldl1keep, [%[b_ptr]]\n" /* preload b*/ - "dup v11.4s, v2.s[1]\n" /* out3 = 0*/ - "dup v12.4s, v2.s[1]\n" /* out4 = 0*/ - "prfm pldl1keep, [%[b_ptr], #64]\n" /* preload b*/ - "dup v13.4s, v2.s[1]\n" /* out5 = 0*/ - "prfm pldl1keep, [%[a_ptr], #64]\n" /* preload a*/ - "dup v14.4s, v2.s[2]\n" /* out6 = 0*/ - "prfm pldl1keep, [%[b_ptr], #128]\n" /* preload b*/ - "dup v15.4s, v2.s[2]\n" /* out7 = 0*/ - "prfm pldl1keep, [%[a_ptr], #128]\n" /* preload a*/ - "dup v16.4s, v2.s[2]\n" /* out8 = 0*/ - "prfm pldl1keep, [%[b_ptr], #192]\n" /* preload b*/ - "dup v17.4s, v2.s[3]\n" /* out9 = 0*/ - "prfm pldl1keep, [%[b_ptr], #256]\n" /* preload b*/ - "dup v18.4s, v2.s[3]\n" /* out10 = 0*/ - "prfm pldl1keep, [%[a_ptr], #192]\n" /* preload a*/ - "dup v19.4s, v2.s[3]\n" /* out11 = 0*/ - "prfm pldl1keep, [%[b_ptr], #320]\n" /* preload b*/ - "dup v20.4s, v3.s[0]\n" /* out12 = 0*/ - "prfm pldl1keep, [%[a_ptr], #256]\n" /* preload a*/ - "dup v21.4s, v3.s[0]\n" /* out13 = 0*/ - 
"prfm pldl1keep, [%[b_ptr], #384]\n" /* preload b*/ - "dup v22.4s, v3.s[0]\n" /* out14 = 0*/ - "dup v23.4s, v3.s[1]\n" /* out15 = 0*/ - "dup v24.4s, v3.s[1]\n" /* out16 = 0*/ - "dup v25.4s, v3.s[1]\n" /* out17 = 0*/ - "dup v26.4s, v3.s[2]\n" /* out18 = 0*/ - "dup v27.4s, v3.s[2]\n" /* out19 = 0*/ - "dup v28.4s, v3.s[2]\n" /* out20 = 0*/ - "dup v29.4s, v3.s[3]\n" /* out21 = 0*/ - "dup v30.4s, v3.s[3]\n" /* out22 = 0*/ - "dup v31.4s, v3.s[3]\n" /* out23 = 0*/ - "cbz %w[has_beta], 0f\n" /* check beta == 0? */ - /* process beta */ - "dup v7.4s, %w[beta]\n" /* beta to vector */ - "ld1 {v0.4s, v1.4s, v2.4s}, [%[c_ptr0]]\n" /* load output r0 */ - "ld1 {v3.4s, v4.4s, v5.4s}, [%[c_ptr1]]\n" /* load output r1 */ - "fmla v8.4s, v0.4s, v7.4s\n" /* cr00 += beta * c_r00*/ - "fmla v9.4s, v1.4s, v7.4s\n" /* cr01 += beta * c_r01*/ - "fmla v10.4s, v2.4s, v7.4s\n" /* cr02 += beta * c_r02*/ - "ld1 {v0.4s, v1.4s, v2.4s}, [%[c_ptr2]]\n" /* load output r2*/ - "fmla v11.4s, v3.4s, v7.4s\n" /* cr10 += beta * c_r10*/ - "fmla v12.4s, v4.4s, v7.4s\n" /* cr11 += beta * c_r11*/ - "fmla v13.4s, v5.4s, v7.4s\n" /* cr12 += beta * c_r12*/ - "ld1 {v3.4s, v4.4s, v5.4s}, [%[c_ptr3]]\n" /* load output r3*/ - "fmla v14.4s, v0.4s, v7.4s\n" /* cr20 += beta * c_r20*/ - "fmla v15.4s, v1.4s, v7.4s\n" /* cr21 += beta * c_r21*/ - "fmla v16.4s, v2.4s, v7.4s\n" /* cr22 += beta * c_r22*/ - "ld1 {v0.4s, v1.4s, v2.4s}, [%[c_ptr4]]\n" /* load output r4*/ - "fmla v17.4s, v3.4s, v7.4s\n" /* cr30 += beta * c_r30*/ - "fmla v18.4s, v4.4s, v7.4s\n" /* cr31 += beta * c_r31*/ - "fmla v19.4s, v5.4s, v7.4s\n" /* cr32 += beta * c_r32*/ - "ld1 {v3.4s, v4.4s, v5.4s}, [%[c_ptr5]]\n" /* load output r5*/ - "fmla v20.4s, v0.4s, v7.4s\n" /* cr40 += beta * c_r40*/ - "fmla v21.4s, v1.4s, v7.4s\n" /* cr41 += beta * c_r41*/ - "fmla v22.4s, v2.4s, v7.4s\n" /* cr42 += beta * c_r42*/ - "ld1 {v0.4s, v1.4s, v2.4s}, [%[c_ptr6]]\n" /* load output r6*/ - "fmla v23.4s, v3.4s, v7.4s\n" /* cr50 += beta * c_r50*/ - "fmla v24.4s, v4.4s, v7.4s\n" /* cr51 += beta * c_r51*/ - "fmla v25.4s, v5.4s, v7.4s\n" /* cr52 += beta * c_r52*/ - "ld1 {v3.4s, v4.4s, v5.4s}, [%[c_ptr7]]\n" /* load output r7*/ - "fmla v26.4s, v0.4s, v7.4s\n" /* cr60 += beta * c_r60*/ - "fmla v27.4s, v1.4s, v7.4s\n" /* cr61 += beta * c_r61*/ - "fmla v28.4s, v2.4s, v7.4s\n" /* cr62 += beta * c_r62*/ - "fmla v29.4s, v3.4s, v7.4s\n" /* cr70 += beta * c_r70*/ - "fmla v30.4s, v4.4s, v7.4s\n" /* cr71 += beta * c_r71*/ - "fmla v31.4s, v5.4s, v7.4s\n" /* cr72 += beta * c_r72*/ - "0: \n" /* check loop count */ - "ldp q0, q1, [%[a_ptr]], #32\n" /* load a00,a01 to q0, q1*/ - "ldp q4, q5, [%[b_ptr]], #32\n" /* load b0, b1 to q4, q5*/ - "cbz %w[k], 2f\n" /* check loop count > 0 */ - /* main loop */ - /* unrool 0*/ - "1:\n" /* main loop */ - "fmla v8.4s , v4.4s, v0.s[0]\n" /* out0 = b0 * a00[0], b0 =q4 */ - "fmla v11.4s , v4.4s, v0.s[1]\n" /* out1 = b0 * a00[1], b0 =q4 - */ - "ldp q6, q7, [%[b_ptr]], #32\n" /* load b2, b0 to q6, q7 */ - "fmla v14.4s, v4.4s, v0.s[2]\n" /* out2 = b0 * a00[2], b0 =q4 */ - "fmla v17.4s, v4.4s, v0.s[3]\n" /* out3 = b0 * a00[3], b0 =q4 */ - "ldp q2, q3, [%[a_ptr]], #32\n" /* load a10, a11 to q3, q4 */ - "fmla v20.4s, v4.4s, v1.s[0]\n" /* out4 = b0 * a01[0], b0 =q4 */ - "fmla v23.4s, v4.4s, v1.s[1]\n" /* out5 = b0 * a01[1], b0 =q4 */ - "fmla v26.4s, v4.4s, v1.s[2]\n" /* out6 = b0 * a01[2], b0 =q4 */ - "fmla v29.4s, v4.4s, v1.s[3]\n" /* out7 = b0 * a01[3], b0 =q4 */ - - "fmla v9.4s, v5.4s, v0.s[0]\n" /* out8 = b1 * a00[0], b1 =q5 */ - "fmla v12.4s, v5.4s, v0.s[1]\n" /* out9 = b1 * a00[1], b1 =q5 */ 
- "fmla v15.4s, v5.4s, v0.s[2]\n" /* out10 = b1 * a00[2], b1 =q5*/ - "fmla v18.4s, v5.4s, v0.s[3]\n" /* out11 = b1 * a00[3], b1 =q5*/ - "fmla v21.4s, v5.4s, v1.s[0]\n" /* out12 = b1 * a01[0], b1 =q5*/ - "fmla v24.4s, v5.4s, v1.s[1]\n" /* out13 = b1 * a01[1], b1 =q5*/ - "fmla v27.4s, v5.4s, v1.s[2]\n" /* out14 = b1 * a01[2], b1 =q5*/ - "fmla v30.4s, v5.4s, v1.s[3]\n" /* out15 = b1 * a01[3], b1 =q5*/ - - "ldp q4, q5, [%[b_ptr]], #32\n" /* load b1, b2 to q4, q5 */ - - "fmla v10.4s, v6.4s, v0.s[0]\n" /* out16 = b2 * a00[0], b2 =q6*/ - "fmla v13.4s, v6.4s, v0.s[1]\n" /* out17 = b2 * a00[1], b2 =q6*/ - "prfm pldl1keep, [%[b_ptr], #384]\n" - "fmla v16.4s, v6.4s, v0.s[2]\n" /* out18 = b2 * a00[2], b2 =q6*/ - "fmla v19.4s, v6.4s, v0.s[3]\n" /* out19 = b2 * a00[3], b2 =q6*/ - "fmla v22.4s, v6.4s, v1.s[0]\n" /* out20 = b2 * a00[0], b2 =q6*/ - "fmla v25.4s, v6.4s, v1.s[1]\n" /* out21 = b2 * a00[1], b2 =q6*/ - "fmla v28.4s, v6.4s, v1.s[2]\n" /* out22 = b2 * a00[2], b2 =q6*/ - "fmla v31.4s, v6.4s, v1.s[3]\n" /* out23 = b2 * a00[3], b2 =q6*/ - - "ldp q0, q1, [%[a_ptr]], #32\n" /* load a00, a01 to q0, q1 */ - - /* unrool 1 */ - "fmla v8.4s , v7.4s, v2.s[0]\n" /* out0 = b0 * a10[0], b0 =q7 */ - "fmla v11.4s , v7.4s, v2.s[1]\n" /* out1 = b0 * a10[1], b0 =q7 */ - "fmla v14.4s, v7.4s, v2.s[2]\n" /* out2 = b0 * a10[2], b0 =q7 */ - "prfm pldl1keep, [%[a_ptr], #256]\n" - "fmla v17.4s, v7.4s, v2.s[3]\n" /* out3 = b0 * a10[3], b0 =q7 */ - "fmla v20.4s, v7.4s, v3.s[0]\n" /* out4 = b0 * a11[0], b0 =q7 */ - "fmla v23.4s, v7.4s, v3.s[1]\n" /* out5 = b0 * a11[1], b0 = q7*/ - "fmla v26.4s, v7.4s, v3.s[2]\n" /* out6 = b0 * a11[2], b0 =q7 */ - "fmla v29.4s, v7.4s, v3.s[3]\n" /* out7 = b0 * a11[3], b0 =q7 */ - - "ldp q6, q7, [%[b_ptr]], #32\n" /* load b0, b1 to q6, q7 */ - - "fmla v9.4s, v4.4s, v2.s[0]\n" /* out8 = b0 * a10[0], b1 =q4 */ - "fmla v12.4s, v4.4s, v2.s[1]\n" /* out9 = b0 * a10[1], b1 =q4 */ - "fmla v15.4s, v4.4s, v2.s[2]\n" /* out10 = b1 * a10[2], b1 =q4*/ - "fmla v18.4s, v4.4s, v2.s[3]\n" /* out11 = b1 * a10[3], b1 =q4*/ - "fmla v21.4s, v4.4s, v3.s[0]\n" /* out12 = b1 * a10[0], b1 =q4*/ - "fmla v24.4s, v4.4s, v3.s[1]\n" /* out13 = b1 * a10[1], b1 =q4*/ - "fmla v27.4s, v4.4s, v3.s[2]\n" /* out14 = b1 * a10[2], b1 =q4*/ - "fmla v30.4s, v4.4s, v3.s[3]\n" /* out15 = b1 * a10[3], b1 =q4*/ - - "fmla v10.4s, v5.4s, v2.s[0]\n" /* out16 = b2 * a10[0], b2 =q5*/ - "fmla v13.4s, v5.4s, v2.s[1]\n" /* out17 = b2 * a10[0], b2 =q5*/ - "fmla v16.4s, v5.4s, v2.s[2]\n" /* out18 = b2 * a10[0], b2 =q5*/ - "fmla v19.4s, v5.4s, v2.s[3]\n" /* out19 = b2 * a10[0], b2 =q5*/ - "fmla v22.4s, v5.4s, v3.s[0]\n" /* out20 = b2 * a10[0], b2 =q5*/ - "fmla v25.4s, v5.4s, v3.s[1]\n" /* out21 = b2 * a10[0], b2 =q5*/ - "fmla v28.4s, v5.4s, v3.s[2]\n" /* out22 = b2 * a10[0], b2 =q5*/ - "fmla v31.4s, v5.4s, v3.s[3]\n" /* out23 = b2 * a10[0], b2 =q5*/ - "ldp q4, q5, [%[b_ptr]], #32\n" /* load b2, b0 to q4, q5 */ - /* unrool 2*/ - "fmla v8.4s , v6.4s, v0.s[0]\n" /* out0 = b0 * a00[0], b0 =q6 */ - "fmla v11.4s , v6.4s, v0.s[1]\n" /* out1 = b0 * a00[1], b0 =q6 - */ - "ldp q2, q3, [%[a_ptr]], #32\n" /* load a10, a11 to q3, q4*/ - "fmla v14.4s, v6.4s, v0.s[2]\n" /* out2 = b0 * a00[2], b0 =q6*/ - "fmla v17.4s, v6.4s, v0.s[3]\n" /* out3 = b0 * a00[3], b0 =q6*/ - "fmla v20.4s, v6.4s, v1.s[0]\n" /* out4 = b0 * a01[0], b0 =q6*/ - "fmla v23.4s, v6.4s, v1.s[1]\n" /* out5 = b0 * a01[1], b0 =q6*/ - "fmla v26.4s, v6.4s, v1.s[2]\n" /* out6 = b0 * a01[2], b0 =q6*/ - "fmla v29.4s, v6.4s, v1.s[3]\n" /* out7 = b0 * a01[3], b0 =q6*/ - "fmla v9.4s, v7.4s, v0.s[0]\n" /* 
out8 = b1 * a00[0], b1 =q7*/ - "fmla v12.4s, v7.4s, v0.s[1]\n" /* out9 = b1 * a00[1], b1 =q7*/ - "prfm pldl1keep, [%[b_ptr], #384]\n" - "fmla v15.4s, v7.4s, v0.s[2]\n" /* out10 = b1 * a00[2], b1 =q7*/ - "fmla v18.4s, v7.4s, v0.s[3]\n" /* out11 = b1 * a00[3], b1 =q7*/ - "fmla v21.4s, v7.4s, v1.s[0]\n" /* out12 = b1 * a01[0], b1 =q7*/ - "fmla v24.4s, v7.4s, v1.s[1]\n" /* out13 = b1 * a01[1], b1 =q7*/ - "fmla v27.4s, v7.4s, v1.s[2]\n" /* out14 = b1 * a01[2], b1 =q7*/ - "fmla v30.4s, v7.4s, v1.s[3]\n" /* out15 = b1 * a01[3], b1 =q7*/ - - "ldp q6, q7, [%[b_ptr]], #32\n" /* load b1, b2 to q6, q7*/ - - "fmla v10.4s, v4.4s, v0.s[0]\n" /* out16 = b2 * a00[0], b2 =q4*/ - "fmla v13.4s, v4.4s, v0.s[1]\n" /* out17 = b2 * a00[1], b2 =q4*/ - "fmla v16.4s, v4.4s, v0.s[2]\n" /* out18 = b2 * a00[2], b2 =q4*/ - "fmla v19.4s, v4.4s, v0.s[3]\n" /* out19 = b2 * a00[3], b2 =q4*/ - "fmla v22.4s, v4.4s, v1.s[0]\n" /* out20 = b2 * a00[0], b2 =q4*/ - "fmla v25.4s, v4.4s, v1.s[1]\n" /* out21 = b2 * a00[1], b2 =q4*/ - "fmla v28.4s, v4.4s, v1.s[2]\n" /* out22 = b2 * a00[2], b2 =q4*/ - "fmla v31.4s, v4.4s, v1.s[3]\n" /* out23 = b2 * a00[3], b2 =q4*/ - "ldp q0, q1, [%[a_ptr]], #32\n" /* load a00, a01 to q0, q1*/ - /* unrool 3*/ - "fmla v8.4s , v5.4s, v2.s[0]\n" /* out0 = b0 * a10[0], b0 =q5*/ - "fmla v11.4s , v5.4s, v2.s[1]\n" /* out1 = b0 * a10[1], b0 =q5*/ - "fmla v14.4s, v5.4s, v2.s[2]\n" /* out2 = b0 * a10[2], b0 =q5*/ - "fmla v17.4s, v5.4s, v2.s[3]\n" /* out3 = b0 * a10[3], b0 =q5*/ - "fmla v20.4s, v5.4s, v3.s[0]\n" /* out4 = b0 * a11[0], b0 =q5*/ - "fmla v23.4s, v5.4s, v3.s[1]\n" /* out5 = b0 * a11[1], b0 =q5*/ - "fmla v26.4s, v5.4s, v3.s[2]\n" /* out6 = b0 * a11[2], b0 =q5*/ - "fmla v29.4s, v5.4s, v3.s[3]\n" /* out7 = b0 * a11[3], b0 =q5*/ - "ldp q4, q5, [%[b_ptr]], #32\n" /* load b0, b1 to q4, q5*/ - "fmla v9.4s, v6.4s, v2.s[0]\n" /* out8 = b0 * a10[0], b1 =q6*/ - "fmla v12.4s, v6.4s, v2.s[1]\n" /* out9 = b0 * a10[1], b1 =q6*/ - "prfm pldl1keep, [%[a_ptr], #256]\n" - "fmla v15.4s, v6.4s, v2.s[2]\n" /* out10 = b1 * a10[2], b1 =q6*/ - "fmla v18.4s, v6.4s, v2.s[3]\n" /* out11 = b1 * a10[3], b1 =q6*/ - "fmla v21.4s, v6.4s, v3.s[0]\n" /* out12 = b1 * a10[0], b1 =q6*/ - "fmla v24.4s, v6.4s, v3.s[1]\n" /* out13 = b1 * a10[1], b1 =q6*/ - "fmla v27.4s, v6.4s, v3.s[2]\n" /* out14 = b1 * a10[2], b1 =q6*/ - "prfm pldl1keep, [%[b_ptr], #384]\n" - "fmla v30.4s, v6.4s, v3.s[3]\n" /* out15 = b1 * a10[3], b1 =q6*/ - "fmla v10.4s, v7.4s, v2.s[0]\n" /* out16 = b2 * a10[0], b2 =q7*/ - "fmla v13.4s, v7.4s, v2.s[1]\n" /* out17 = b2 * a10[0], b2 =q7*/ - "fmla v16.4s, v7.4s, v2.s[2]\n" /* out18 = b2 * a10[0], b2 =q7*/ - "fmla v19.4s, v7.4s, v2.s[3]\n" /* out19 = b2 * a10[0], b2 =q7*/ - "fmla v22.4s, v7.4s, v3.s[0]\n" /* out20 = b2 * a10[0], b2 =q7*/ - "fmla v25.4s, v7.4s, v3.s[1]\n" /* out21 = b2 * a10[0], b2 =q7*/ - "subs %w[k], %w[k], #1\n" /* loop count - 1*/ - "fmla v28.4s, v7.4s, v3.s[2]\n" /* out22 = b2 * a10[0], b2 =q7*/ - "fmla v31.4s, v7.4s, v3.s[3]\n" /* out23 = b2 * a10[0], b2 =q7*/ - "bne 1b\n" - "2:\n" /* process tail*/ - "subs %w[tail], %w[tail], #1\n" /* tail--*/ - "beq 3f\n" /*jump to tail = 1*/ - /* final unrool 0*/ - /* unrool 0, tail > 1*/ - "fmla v8.4s , v4.4s, v0.s[0]\n" /* out0 = b0 * a00[0], b0 =q4*/ - "fmla v11.4s , v4.4s, v0.s[1]\n" /* out1 = b0 * a00[1], b0 =q4*/ - "ldp q6, q7, [%[b_ptr]], #32\n" /* load b2, b0 to q6, q7*/ - "fmla v14.4s, v4.4s, v0.s[2]\n" /* out2 = b0 * a00[2], b0 =q4*/ - "fmla v17.4s, v4.4s, v0.s[3]\n" /* out3 = b0 * a00[3], b0 =q4*/ - "ldp q2, q3, [%[a_ptr]], #32\n" /* load a10, a11 to q2, 
q3*/ - "fmla v20.4s, v4.4s, v1.s[0]\n" /* out4 = b0 * a01[0], b0 =q4*/ - "fmla v23.4s, v4.4s, v1.s[1]\n" /* out5 = b0 * a01[1], b0 =q4*/ - "fmla v26.4s, v4.4s, v1.s[2]\n" /* out6 = b0 * a01[2], b0 =q4*/ - "fmla v29.4s, v4.4s, v1.s[3]\n" /* out7 = b0 * a01[3], b0 =q4*/ - "subs %w[tail], %w[tail], #1\n" /* tail--*/ - "fmla v9.4s, v5.4s, v0.s[0]\n" /* out8 = b1 * a00[0], b1 =q5*/ - "fmla v12.4s, v5.4s, v0.s[1]\n" /* out9 = b1 * a00[1], b1 =q5*/ - "fmla v15.4s, v5.4s, v0.s[2]\n" /* out10 = b1 * a00[2], b1 =q5*/ - "fmla v18.4s, v5.4s, v0.s[3]\n" /* out11 = b1 * a00[3], b1 =q5*/ - "fmla v21.4s, v5.4s, v1.s[0]\n" /* out12 = b1 * a01[0], b1 =q5*/ - "fmla v24.4s, v5.4s, v1.s[1]\n" /* out13 = b1 * a01[1], b1 =q5*/ - "fmla v27.4s, v5.4s, v1.s[2]\n" /* out14 = b1 * a01[2], b1 =q5*/ - "fmla v30.4s, v5.4s, v1.s[3]\n" /* out15 = b1 * a01[3], b1 =q5*/ - "ldp q4, q5, [%[b_ptr]], #32\n" /* load b1, b2 to q4, q5*/ - "fmla v10.4s, v6.4s, v0.s[0]\n" /* out16 = b2 * a00[0], b2 =q6*/ - "fmla v13.4s, v6.4s, v0.s[1]\n" /* out17 = b2 * a00[1], b2 =q6*/ - "fmla v16.4s, v6.4s, v0.s[2]\n" /* out18 = b2 * a00[2], b2 =q6*/ - "fmla v19.4s, v6.4s, v0.s[3]\n" /* out19 = b2 * a00[3], b2 =q6*/ - "fmla v22.4s, v6.4s, v1.s[0]\n" /* out20 = b2 * a00[0], b2 =q6*/ - "fmla v25.4s, v6.4s, v1.s[1]\n" /* out21 = b2 * a00[1], b2 =q6*/ - "fmla v28.4s, v6.4s, v1.s[2]\n" /* out22 = b2 * a00[2], b2 =q6*/ - "fmla v31.4s, v6.4s, v1.s[3]\n" /* out23 = b2 * a00[3], b2 =q6*/ - "beq 4f\n" /*jump to tail = 2*/ - /* unrool 1, tail > 2*/ - "ldp q0, q1, [%[a_ptr]], #32\n" /* load a00, a01 to q0, q1*/ - "fmla v8.4s , v7.4s, v2.s[0]\n" /* out0 = b0 * a10[0], b0 =q7*/ - "fmla v11.4s , v7.4s, v2.s[1]\n" /* out1 = b0 * a10[1], b0 =q7*/ - "fmla v14.4s, v7.4s, v2.s[2]\n" /* out2 = b0 * a10[2], b0 =q7*/ - "fmla v17.4s, v7.4s, v2.s[3]\n" /* out3 = b0 * a10[3], b0 =q7*/ - "fmla v20.4s, v7.4s, v3.s[0]\n" /* out4 = b0 * a11[0], b0 =q7*/ - "fmla v23.4s, v7.4s, v3.s[1]\n" /* out5 = b0 * a11[1], b0 =q7*/ - "fmla v26.4s, v7.4s, v3.s[2]\n" /* out6 = b0 * a11[2], b0 =q7*/ - "fmla v29.4s, v7.4s, v3.s[3]\n" /* out7 = b0 * a11[3], b0 =q7*/ - "ldp q6, q7, [%[b_ptr]], #32\n" /* load b0, b1 to q6, q7*/ - "fmla v9.4s, v4.4s, v2.s[0]\n" /* out8 = b0 * a10[0], b1 =q4*/ - "fmla v12.4s, v4.4s, v2.s[1]\n" /* out9 = b0 * a10[1], b1 =q4*/ - "fmla v15.4s, v4.4s, v2.s[2]\n" /* out10 = b1 * a10[2], b1 =q4*/ - "fmla v18.4s, v4.4s, v2.s[3]\n" /* out11 = b1 * a10[3], b1 =q4*/ - "fmla v21.4s, v4.4s, v3.s[0]\n" /* out12 = b1 * a10[0], b1 =q4*/ - "fmla v24.4s, v4.4s, v3.s[1]\n" /* out13 = b1 * a10[1], b1 =q4*/ - "fmla v27.4s, v4.4s, v3.s[2]\n" /* out14 = b1 * a10[2], b1 =q4*/ - "fmla v30.4s, v4.4s, v3.s[3]\n" /* out15 = b1 * a10[3], b1 =q4*/ - "subs %w[tail], %w[tail], #1\n" /* tail--*/ - "fmla v10.4s, v5.4s, v2.s[0]\n" /* out16 = b2 * a10[0], b2 =q5*/ - "fmla v13.4s, v5.4s, v2.s[1]\n" /* out17 = b2 * a10[0], b2 =q5*/ - "fmla v16.4s, v5.4s, v2.s[2]\n" /* out18 = b2 * a10[0], b2 =q5*/ - "fmla v19.4s, v5.4s, v2.s[3]\n" /* out19 = b2 * a10[0], b2 =q5*/ - "fmla v22.4s, v5.4s, v3.s[0]\n" /* out20 = b2 * a10[0], b2 =q5*/ - "fmla v25.4s, v5.4s, v3.s[1]\n" /* out21 = b2 * a10[0], b2 =q5*/ - "fmla v28.4s, v5.4s, v3.s[2]\n" /* out22 = b2 * a10[0], b2 =q5*/ - "fmla v31.4s, v5.4s, v3.s[3]\n" /* out23 = b2 * a10[0], b2 =q5*/ - "beq 5f\n" /*jump to tail = 3*/ - /* unrool 2, tail = 4*/ - "ldp q4, q5, [%[b_ptr]], #32\n" /* load b2, b0 to q4, q5*/ - "fmla v8.4s , v6.4s, v0.s[0]\n" /* out0 = b0 * a00[0], b0 =q6*/ - "fmla v11.4s , v6.4s, v0.s[1]\n" /* out1 = b0 * a00[1], b0 =q6*/ - "ldp q2, q3, [%[a_ptr]], 
#32\n" /* load a10, a11 to q3, q4*/ - "fmla v14.4s, v6.4s, v0.s[2]\n" /* out2 = b0 * a00[2], b0 =q6*/ - "fmla v17.4s, v6.4s, v0.s[3]\n" /* out3 = b0 * a00[3], b0 =q6*/ - "fmla v20.4s, v6.4s, v1.s[0]\n" /* out4 = b0 * a01[0], b0 =q6*/ - "fmla v23.4s, v6.4s, v1.s[1]\n" /* out5 = b0 * a01[1], b0 =q6*/ - "fmla v26.4s, v6.4s, v1.s[2]\n" /* out6 = b0 * a01[2], b0 =q6*/ - "fmla v29.4s, v6.4s, v1.s[3]\n" /* out7 = b0 * a01[3], b0 =q6*/ - "fmla v9.4s, v7.4s, v0.s[0]\n" /* out8 = b1 * a00[0], b1 =q7*/ - "fmla v12.4s, v7.4s, v0.s[1]\n" /* out9 = b1 * a00[1], b1 =q7*/ - "fmla v15.4s, v7.4s, v0.s[2]\n" /* out10 = b1 * a00[2], b1 =q7*/ - "fmla v18.4s, v7.4s, v0.s[3]\n" /* out11 = b1 * a00[3], b1 =q7*/ - "fmla v21.4s, v7.4s, v1.s[0]\n" /* out12 = b1 * a01[0], b1 =q7*/ - "fmla v24.4s, v7.4s, v1.s[1]\n" /* out13 = b1 * a01[1], b1 =q7*/ - "fmla v27.4s, v7.4s, v1.s[2]\n" /* out14 = b1 * a01[2], b1 =q7*/ - "fmla v30.4s, v7.4s, v1.s[3]\n" /* out15 = b1 * a01[3], b1 =q7*/ - "ldp q6, q7, [%[b_ptr]], #32\n" /* load b1, b2 to q6, q7*/ - "fmla v10.4s, v4.4s, v0.s[0]\n" /* out16 = b2 * a00[0], b2 =q4*/ - "fmla v13.4s, v4.4s, v0.s[1]\n" /* out17 = b2 * a00[1], b2 =q4*/ - "fmla v16.4s, v4.4s, v0.s[2]\n" /* out18 = b2 * a00[2], b2 =q4*/ - "fmla v19.4s, v4.4s, v0.s[3]\n" /* out19 = b2 * a00[3], b2 =q4*/ - "fmla v22.4s, v4.4s, v1.s[0]\n" /* out20 = b2 * a00[0], b2 =q4*/ - "fmla v25.4s, v4.4s, v1.s[1]\n" /* out21 = b2 * a00[1], b2 =q4*/ - "fmla v28.4s, v4.4s, v1.s[2]\n" /* out22 = b2 * a00[2], b2 =q4*/ - "fmla v31.4s, v4.4s, v1.s[3]\n" /* out23 = b2 * a00[3], b2 =q4*/ - /* unrool 3, tail = 4*/ - "fmla v8.4s , v5.4s, v2.s[0]\n" /* out0 = b0 * a10[0], b0 =q5*/ - "fmla v11.4s , v5.4s, v2.s[1]\n" /* out1 = b0 * a10[1], b0 =q5*/ - "fmla v14.4s, v5.4s, v2.s[2]\n" /* out2 = b0 * a10[2], b0 =q5*/ - "fmla v17.4s, v5.4s, v2.s[3]\n" /* out3 = b0 * a10[3], b0 =q5*/ - "fmla v20.4s, v5.4s, v3.s[0]\n" /* out4 = b0 * a11[0], b0 =q5*/ - "fmla v23.4s, v5.4s, v3.s[1]\n" /* out5 = b0 * a11[1], b0 =q5*/ - "fmla v26.4s, v5.4s, v3.s[2]\n" /* out6 = b0 * a11[2], b0 =q5*/ - "fmla v29.4s, v5.4s, v3.s[3]\n" /* out7 = b0 * a11[3], b0 =q5*/ - "fmla v9.4s, v6.4s, v2.s[0]\n" /* out8 = b0 * a10[0], b1 =q6*/ - "fmla v12.4s, v6.4s, v2.s[1]\n" /* out9 = b1 * a10[1], b1 =q6*/ - "fmla v15.4s, v6.4s, v2.s[2]\n" /* out10 = b1 * a10[2], b1 =q6*/ - "fmla v18.4s, v6.4s, v2.s[3]\n" /* out11 = b1 * a10[3], b1 =q6*/ - "fmla v21.4s, v6.4s, v3.s[0]\n" /* out12 = b1 * a10[0], b1 =q6*/ - "fmla v24.4s, v6.4s, v3.s[1]\n" /* out13 = b1 * a10[1], b1 =q6*/ - "fmla v27.4s, v6.4s, v3.s[2]\n" /* out14 = b1 * a10[2], b1 =q6*/ - "fmla v30.4s, v6.4s, v3.s[3]\n" /* out15 = b1 * a10[3], b1 =q6*/ - "fmla v10.4s, v7.4s, v2.s[0]\n" /* out16 = b2 * a10[0], b2 =q7*/ - "fmla v13.4s, v7.4s, v2.s[1]\n" /* out17 = b2 * a10[0], b2 =q7*/ - "fmla v16.4s, v7.4s, v2.s[2]\n" /* out18 = b2 * a10[0], b2 =q7*/ - "fmla v19.4s, v7.4s, v2.s[3]\n" /* out19 = b2 * a10[0], b2 =q7*/ - "fmla v22.4s, v7.4s, v3.s[0]\n" /* out20 = b2 * a10[0], b2 =q7*/ - "fmla v25.4s, v7.4s, v3.s[1]\n" /* out21 = b2 * a10[0], b2 =q7*/ - "fmla v28.4s, v7.4s, v3.s[2]\n" /* out22 = b2 * a10[0], b2 =q7*/ - "fmla v31.4s, v7.4s, v3.s[3]\n" /* out23 = b2 * a10[0], b2 =q7*/ - "b 11f\n" - /* tails==1 final tail*/ - "3: \n" /* tail=1*/ - "ldr q6, [%[b_ptr]], #16\n" /* load b2 to q6*/ - "fmla v8.4s , v4.4s, v0.s[0]\n" /* out0 = b0 * a10[0], b0 =q5*/ - "fmla v11.4s , v4.4s, v0.s[1]\n" /* out1 = b0 * a10[1], b0 =q5*/ - "fmla v14.4s, v4.4s, v0.s[2]\n" /* out2 = b0 * a10[2], b0 =q5*/ - "fmla v17.4s, v4.4s, v0.s[3]\n" /* out3 = b0 * a10[3], b0 
=q5*/ - "fmla v20.4s, v4.4s, v1.s[0]\n" /* out4 = b0 * a11[0], b0 =q5*/ - "fmla v23.4s, v4.4s, v1.s[1]\n" /* out5 = b0 * a11[1], b0 = q5*/ - "fmla v26.4s, v4.4s, v1.s[2]\n" /* out6 = b0 * a11[2], b0 =q5*/ - "fmla v29.4s, v4.4s, v1.s[3]\n" /* out7 = b0 * a11[3], b0 =q5*/ - "fmla v9.4s, v5.4s, v0.s[0]\n" /* out8 = b0 * a10[0], b1 =q6*/ - "fmla v12.4s, v5.4s, v0.s[1]\n" /* out9 = b1 * a10[1], b1 =q6*/ - "fmla v15.4s, v5.4s, v0.s[2]\n" /* out10 = b1 * a10[2], b1 =q6*/ - "fmla v18.4s, v5.4s, v0.s[3]\n" /* out11 = b1 * a10[3], b1 =q6*/ - "fmla v21.4s, v5.4s, v1.s[0]\n" /* out12 = b1 * a10[0], b1 =q6*/ - "fmla v24.4s, v5.4s, v1.s[1]\n" /* out13 = b1 * a10[1], b1 =q6*/ - "fmla v27.4s, v5.4s, v1.s[2]\n" /* out14 = b1 * a10[2], b1 =q6*/ - "fmla v30.4s, v5.4s, v1.s[3]\n" /* out15 = b1 * a10[3], b1 =q6*/ - "fmla v10.4s, v6.4s, v0.s[0]\n" /* out16 = b2 * a10[0], b2 =q7*/ - "fmla v13.4s, v6.4s, v0.s[1]\n" /* out17 = b2 * a10[0], b2 =q7*/ - "fmla v16.4s, v6.4s, v0.s[2]\n" /* out18 = b2 * a10[0], b2 =q7*/ - "fmla v19.4s, v6.4s, v0.s[3]\n" /* out19 = b2 * a10[0], b2 =q7*/ - "fmla v22.4s, v6.4s, v1.s[0]\n" /* out20 = b2 * a10[0], b2 =q7*/ - "fmla v25.4s, v6.4s, v1.s[1]\n" /* out21 = b2 * a10[0], b2 =q7*/ - "fmla v28.4s, v6.4s, v1.s[2]\n" /* out22 = b2 * a10[0], b2 =q7*/ - "fmla v31.4s, v6.4s, v1.s[3]\n" /* out23 = b2 * a10[0], b2 =q7*/ - "b 11f\n" - /* tails==2 final tail*/ - "4:\n" /* tail = 2*/ - "fmla v8.4s , v7.4s, v2.s[0]\n" /* out0 = b0 * a10[0], b0 =q5*/ - "fmla v11.4s , v7.4s, v2.s[1]\n" /* out1 = b0 * a10[1], b0 =q5*/ - "fmla v14.4s, v7.4s, v2.s[2]\n" /* out2 = b0 * a10[2], b0 =q5*/ - "fmla v17.4s, v7.4s, v2.s[3]\n" /* out3 = b0 * a10[3], b0 =q5*/ - "fmla v20.4s, v7.4s, v3.s[0]\n" /* out4 = b0 * a11[0], b0 =q5*/ - "fmla v23.4s, v7.4s, v3.s[1]\n" /* out5 = b0 * a11[1], b0 = q5*/ - "fmla v26.4s, v7.4s, v3.s[2]\n" /* out6 = b0 * a11[2], b0 =q5*/ - "fmla v29.4s, v7.4s, v3.s[3]\n" /* out7 = b0 * a11[3], b0 =q5*/ - "fmla v9.4s, v4.4s, v2.s[0]\n" /* out8 = b0 * a10[0], b1 =q6*/ - "fmla v12.4s, v4.4s, v2.s[1]\n" /* out9 = b1 * a10[1], b1 =q6*/ - "fmla v15.4s, v4.4s, v2.s[2]\n" /* out10 = b1 * a10[2], b1 =q6*/ - "fmla v18.4s, v4.4s, v2.s[3]\n" /* out11 = b1 * a10[3], b1 =q6*/ - "fmla v21.4s, v4.4s, v3.s[0]\n" /* out12 = b1 * a10[0], b1 =q6*/ - "fmla v24.4s, v4.4s, v3.s[1]\n" /* out13 = b1 * a10[1], b1 =q6*/ - "fmla v27.4s, v4.4s, v3.s[2]\n" /* out14 = b1 * a10[2], b1 =q6*/ - "fmla v30.4s, v4.4s, v3.s[3]\n" /* out15 = b1 * a10[3], b1 =q6*/ - "fmla v10.4s, v5.4s, v2.s[0]\n" /* out16 = b2 * a10[0], b2 =q7*/ - "fmla v13.4s, v5.4s, v2.s[1]\n" /* out17 = b2 * a10[0], b2 =q7*/ - "fmla v16.4s, v5.4s, v2.s[2]\n" /* out18 = b2 * a10[0], b2 =q7*/ - "fmla v19.4s, v5.4s, v2.s[3]\n" /* out19 = b2 * a10[0], b2 =q7*/ - "fmla v22.4s, v5.4s, v3.s[0]\n" /* out20 = b2 * a10[0], b2 =q7*/ - "fmla v25.4s, v5.4s, v3.s[1]\n" /* out21 = b2 * a10[0], b2 =q7*/ - "fmla v28.4s, v5.4s, v3.s[2]\n" /* out22 = b2 * a10[0], b2 =q7*/ - "fmla v31.4s, v5.4s, v3.s[3]\n" /* out23 = b2 * a10[0], b2 =q7*/ - "b 11f\n" - /* tails==3 final tail*/ - "5:\n" /* tail = 3*/ - "ldr q4, [%[b_ptr]], #16\n" /* load b2, b0 to q4*/ - "fmla v8.4s , v6.4s, v0.s[0]\n" /* out0 = b0 * a10[0], b0 =q5*/ - "fmla v11.4s , v6.4s, v0.s[1]\n" /* out1 = b0 * a10[1], b0 =q5*/ - "fmla v14.4s, v6.4s, v0.s[2]\n" /* out2 = b0 * a10[2], b0 =q5*/ - "fmla v17.4s, v6.4s, v0.s[3]\n" /* out3 = b0 * a10[3], b0 =q5*/ - "fmla v20.4s, v6.4s, v1.s[0]\n" /* out4 = b0 * a11[0], b0 =q5*/ - "fmla v23.4s, v6.4s, v1.s[1]\n" /* out5 = b0 * a11[1], b0 = q5*/ - "fmla v26.4s, v6.4s, v1.s[2]\n" /* 
out6 = b0 * a11[2], b0 =q5*/ - "fmla v29.4s, v6.4s, v1.s[3]\n" /* out7 = b0 * a11[3], b0 =q5*/ - "fmla v9.4s, v7.4s, v0.s[0]\n" /* out8 = b0 * a10[0], b1 =q6*/ - "fmla v12.4s, v7.4s, v0.s[1]\n" /* out9 = b1 * a10[1], b1 =q6*/ - "fmla v15.4s, v7.4s, v0.s[2]\n" /* out10 = b1 * a10[2], b1 =q6*/ - "fmla v18.4s, v7.4s, v0.s[3]\n" /* out11 = b1 * a10[3], b1 =q6*/ - "fmla v21.4s, v7.4s, v1.s[0]\n" /* out12 = b1 * a10[0], b1 =q6*/ - "fmla v24.4s, v7.4s, v1.s[1]\n" /* out13 = b1 * a10[1], b1 =q6*/ - "fmla v27.4s, v7.4s, v1.s[2]\n" /* out14 = b1 * a10[2], b1 =q6*/ - "fmla v30.4s, v7.4s, v1.s[3]\n" /* out15 = b1 * a10[3], b1 =q6*/ - "fmla v10.4s, v4.4s, v0.s[0]\n" /* out16 = b2 * a10[0], b2 =q7*/ - "fmla v13.4s, v4.4s, v0.s[1]\n" /* out17 = b2 * a10[0], b2 =q7*/ - "fmla v16.4s, v4.4s, v0.s[2]\n" /* out18 = b2 * a10[0], b2 =q7*/ - "fmla v19.4s, v4.4s, v0.s[3]\n" /* out19 = b2 * a10[0], b2 =q7*/ - "fmla v22.4s, v4.4s, v1.s[0]\n" /* out20 = b2 * a10[0], b2 =q7*/ - "fmla v25.4s, v4.4s, v1.s[1]\n" /* out21 = b2 * a10[0], b2 =q7*/ - "fmla v28.4s, v4.4s, v1.s[2]\n" /* out22 = b2 * a10[0], b2 =q7*/ - "fmla v31.4s, v4.4s, v1.s[3]\n" /* out23 = b2 * a10[0], b2 =q7*/ - "11: \n" /* check if relu */ - "cbz %w[relu], 12f\n" /* skip relu */ - "movi v2.4s, #0\n" /* for relu*/ - "fmax v8.4s, v8.4s, v2.4s\n" /* relu*/ - "fmax v9.4s, v9.4s, v2.4s\n" /* relu*/ - "fmax v10.4s, v10.4s, v2.4s\n" /* relu*/ - "fmax v11.4s, v11.4s, v2.4s\n" /* relu*/ - "fmax v12.4s, v12.4s, v2.4s\n" /* relu*/ - "fmax v13.4s, v13.4s, v2.4s\n" /* relu*/ - "fmax v14.4s, v14.4s, v2.4s\n" /* relu*/ - "fmax v15.4s, v15.4s, v2.4s\n" /* relu*/ - "fmax v16.4s,v16.4s,v2.4s\n" /* relu*/ - "fmax v17.4s,v17.4s,v2.4s\n" /* relu*/ - "fmax v18.4s, v18.4s, v2.4s\n" /* relu*/ - "fmax v19.4s, v19.4s, v2.4s\n" /* relu*/ - "fmax v20.4s, v20.4s, v2.4s\n" /* relu*/ - "fmax v21.4s, v21.4s, v2.4s\n" /* relu*/ - "fmax v22.4s, v22.4s, v2.4s\n" /* relu*/ - "fmax v23.4s, v23.4s, v2.4s\n" /* relu*/ - "fmax v24.4s,v24.4s,v2.4s\n" /* relu*/ - "fmax v25.4s,v25.4s,v2.4s\n" /* relu*/ - "fmax v26.4s, v26.4s, v2.4s\n" /* relu*/ - "fmax v27.4s, v27.4s, v2.4s\n" /* relu*/ - "fmax v28.4s, v28.4s, v2.4s\n" /* relu*/ - "fmax v29.4s, v29.4s, v2.4s\n" /* relu*/ - "fmax v30.4s, v30.4s, v2.4s\n" /* relu*/ - "fmax v31.4s, v31.4s, v2.4s\n" /* relu*/ - "12: \n" - "st1 {v8.4s, v9.4s, v10.4s},[%[c_ptr0]], #48\n" /* store r0 */ - "st1 {v11.4s, v12.4s, v13.4s},[%[c_ptr1]], #48\n" /* store r1 */ - "st1 {v14.4s, v15.4s, v16.4s},[%[c_ptr2]], #48\n" /* store r2 */ - "st1 {v17.4s, v18.4s, v19.4s},[%[c_ptr3]], #48\n" /* store r3 */ - "st1 {v20.4s, v21.4s, v22.4s},[%[c_ptr4]], #48\n" /* store r4 */ - "st1 {v23.4s, v24.4s, v25.4s},[%[c_ptr5]], #48\n" /* store r5 */ - "st1 {v26.4s, v27.4s, v28.4s},[%[c_ptr6]], #48\n" /* store r6 */ - "st1 {v29.4s, v30.4s, v31.4s},[%[c_ptr7]], #48\n" /* store r7 */ - - : [a_ptr] "+r"(a_ptr), - [b_ptr] "+r"(b_ptr), - [k] "+r"(k), - [tail] "+r"(tail), - [c_ptr0] "+r"(c_ptr0), - [c_ptr1] "+r"(c_ptr1), - [c_ptr2] "+r"(c_ptr2), - [c_ptr3] "+r"(c_ptr3), - [c_ptr4] "+r"(c_ptr4), - [c_ptr5] "+r"(c_ptr5), - [c_ptr6] "+r"(c_ptr6), - [c_ptr7] "+r"(c_ptr7) - : [bias_ptr] "r"(bias_local), - [relu] "r"(has_relu), - [has_beta] "r"(has_beta), - [beta] "r"(beta) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "v26", - "v27", - "v28", - "v29", - "v30", - "v31"); - if (flag_p_remain && (xb == 
bblocks - 1)) { - for (int i = 0; i < remain; ++i) { - *pout0++ = cout0[i]; - *pout1++ = cout1[i]; - *pout2++ = cout2[i]; - *pout3++ = cout3[i]; - *pout4++ = cout4[i]; - *pout5++ = cout5[i]; - *pout6++ = cout6[i]; - *pout7++ = cout7[i]; - } - } - } - } - } -} -#else // __aarch64__ -/** - * \brief gemm with ablock = 6, bblock = 8, output 6x8 - * @param A - * @param B - * @param C - * @param M - * @param N - * @param K - * @param threads - * @param workspace - */ -void sgemm_prepacked_6x8(bool is_transB, - int M, - int N, - int K, - const float* A_packed, - const float* B, - int ldb, - float beta, - float* C, - int ldc, - const float* bias, - bool has_bias, - bool has_relu, - ARMContext* ctx) { - size_t l2_cache = ctx->llc_size() > 0 ? ctx->llc_size() : 512 * 1024; - auto* workspace = ctx->workspace_data(); - int threads = ctx->threads(); - //! MBLOCK * x (result) + MBLOCK * k (A) + x * k (B) = l2 - int x_block = - (l2_cache - (MBLOCK_OTH * K)) / (sizeof(float) * (K + MBLOCK_OTH)); - x_block /= NBLOCK; - x_block *= NBLOCK; - int x_num = (N + (x_block - 1)) / x_block; - x_block = (N + x_num - 1) / x_num; - x_block = (x_block + NBLOCK - 1) / NBLOCK; - x_block *= NBLOCK; - x_block = x_block < NBLOCK ? NBLOCK : x_block; - - int k_pre = ((K + KBLOCK - 1) / KBLOCK) - 1; - int tail_pre = (K & (KBLOCK - 1)); - if (tail_pre == 0) { - tail_pre = KBLOCK; - } - - bool flag_p_remain = false; - int remain = 0; - - int has_beta = fabsf(beta) > 1e-8f ? 1 : 0; - - //! apanel is pre_compute outside gemm - for (unsigned int x0 = 0; x0 < N; x0 += x_block) { - unsigned int xmax = x0 + x_block; - if (xmax > N) { - xmax = N; - } - int bblocks = (xmax - x0 + NBLOCK - 1) / NBLOCK; - remain = xmax - x0 - (bblocks - 1) * NBLOCK; - if (remain > 0) { - flag_p_remain = true; - } - //! 
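
The x_block arithmetic just above (in the deleted sgemm_prepacked_6x8) sizes the B panel so that one packed A block of MBLOCK_OTH x K, an x_block x K slice of B, and the corresponding output tile roughly fit in L2 together, then snaps x_block to a multiple of NBLOCK and rebalances it across the resulting number of passes over N. A standalone sketch of that computation, with hypothetical names (mblock/nblock/l2_size stand in for MBLOCK_OTH/NBLOCK/ctx->llc_size()):

    #include <algorithm>
    #include <cstddef>

    // Mirrors the deleted blocking expression: reserve room for the A block,
    // split the rest of L2 between a K-deep B panel and its output columns.
    int compute_x_block(int N, int K, size_t l2_size, int mblock, int nblock) {
      int x_block = static_cast<int>(
          (l2_size - mblock * K) / (sizeof(float) * (K + mblock)));
      x_block = (x_block / nblock) * nblock;    // round down to full NBLOCK strips
      x_block = std::max(x_block, nblock);      // guard the degenerate case the
                                                // original leaves implicit
      int x_num = (N + x_block - 1) / x_block;  // passes needed over N
      x_block = (N + x_num - 1) / x_num;        // even out the pass widths
      x_block = ((x_block + nblock - 1) / nblock) * nblock;  // round back up
      return std::max(x_block, nblock);
    }
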
load bpanel
-      auto b_pannel = static_cast<float*>(workspace);
-      if (is_transB) {
-        loadb_trans(b_pannel, B, ldb, 0, K, x0, xmax);
-      } else {
-        loadb(b_pannel, B, ldb, 0, K, x0, xmax);
-      }
-#pragma omp parallel for num_threads(threads)
-      for (unsigned int y = 0; y < M; y += MBLOCK_OTH) {
-        unsigned int ymax = y + MBLOCK_OTH;
-        if (ymax > M) {
-          ymax = M;
-        }
-        float* c_ptr0 = C + y * ldc + x0;
-        float* c_ptr1 = c_ptr0 + ldc;
-        float* c_ptr2 = c_ptr1 + ldc;
-        float* c_ptr3 = c_ptr2 + ldc;
-        float* c_ptr4 = c_ptr3 + ldc;
-        float* c_ptr5 = c_ptr4 + ldc;
-
-        float* pout0 = c_ptr0;
-        float* pout1 = c_ptr1;
-        float* pout2 = c_ptr2;
-        float* pout3 = c_ptr3;
-        float* pout4 = c_ptr4;
-        float* pout5 = c_ptr5;
-
-        float bias_local[6] = {0};
-        if (has_bias) {
-          bias_local[0] = bias[y];
-          bias_local[1] = bias[y + 1];
-          bias_local[2] = bias[y + 2];
-          bias_local[3] = bias[y + 3];
-          bias_local[4] = bias[y + 4];
-          bias_local[5] = bias[y + 5];
-        }
-
-        float cout0[NBLOCK];
-        float cout1[NBLOCK];
-        float cout2[NBLOCK];
-        float cout3[NBLOCK];
-        float cout4[NBLOCK];
-        float cout5[NBLOCK];
-
-        const float* a_ptr_l = A_packed + y * K;
-        const float* b_ptr = b_pannel;
-        for (int xb = 0; xb < bblocks; xb++) {
-          if ((y + 5) >= ymax) {
-            switch ((y + 5) - ymax) {
-              case 4:
-                c_ptr1 = cout1;
-              case 3:
-                c_ptr2 = cout2;
-              case 2:
-                c_ptr3 = cout3;
-              case 1:
-                c_ptr4 = cout4;
-              case 0:
-                c_ptr5 = cout5;
-              default:
-                break;
-            }
-          }
-          if (flag_p_remain && (xb == bblocks - 1)) {
-            pout0 = c_ptr0;
-            pout1 = c_ptr1;
-            pout2 = c_ptr2;
-            pout3 = c_ptr3;
-            pout4 = c_ptr4;
-            pout5 = c_ptr5;
-
-            c_ptr0 = cout0;
-            c_ptr1 = cout1;
-            c_ptr2 = cout2;
-            c_ptr3 = cout3;
-            c_ptr4 = cout4;
-            c_ptr5 = cout5;
-            if (has_beta) {
-              for (int i = 0; i < remain; ++i) {
-                cout0[i] = pout0[i];
-                cout1[i] = pout1[i];
-                cout2[i] = pout2[i];
-                cout3[i] = pout3[i];
-                cout4[i] = pout4[i];
-                cout5[i] = pout5[i];
-              }
-            }
-          }
-          const float* a_ptr = a_ptr_l;
-          int tails = tail_pre;
-          int k = k_pre;
-          asm volatile(
-              // sgemm 6x8
-              "vld1.32 {d2-d4}, [%[bias_ptr]]   @ load bias 6 elements\n"
-              "pld [%[a_ptr]]                   @ preload a\n"
-              "vdup.i32 q12, d4[0]              @ out40 = bias[4]\n"
-              "pld [%[b_ptr]]                   @ preload b\n"
-              "vdup.i32 q13, d4[0]              @ out41 = bias[4]\n"
-              "pld [%[a_ptr], #64]              @ preload a\n"
-              "vdup.i32 q14, d4[1]              @ out50 = bias[5]\n"
-              "pld [%[b_ptr], #64]              @ preload b\n"
-              "vdup.i32 q15, d4[1]              @ out51 = bias[5]\n"
-              "pld [%[a_ptr], #128]             @ preload a\n"
-              "vdup.i32 q4, d2[0]               @ out00 = bias[0]\n"
-              "pld [%[b_ptr], #128]             @ preload b\n"
-              "vdup.i32 q5, d2[0]               @ out01 = bias[0]\n"
-              "vdup.i32 q6, d2[1]               @ out10 = bias[1]\n"
-              "pld [%[a_ptr], #192]             @ preload a\n"
-              "vdup.i32 q7, d2[1]               @ out11 = bias[1]\n"
-              "pld [%[b_ptr], #192]             @ preload b\n"
-              "vdup.i32 q8, d3[0]               @ out20 = bias[2]\n"
-              "pld [%[a_ptr], #256]             @ preload a\n"
-              "vdup.i32 q9, d3[0]               @ out21 = bias[2]\n"
-              "pld [%[b_ptr], #256]             @ preload b\n"
-              "vdup.i32 q10, d3[1]              @ out30 = bias[3]\n"
-              "pld [%[b_ptr], #320]             @ preload b\n"
-              "vdup.i32 q11, d3[1]              @ out31 = bias[3]\n"
-              "pld [%[b_ptr], #384]             @ preload b\n"
-              "cmp %[has_beta], #0\n"
-              "beq 11f\n" /* check beta == 0?
*/ - /* process beta */ - "vdup.32 q3, %[beta]\n" /* beta to vector */ - "vld1.32 {d0-d3}, [%[c_ptr0]]\n" /* load output r0 */ - "vmla.f32 q4, q0, q3\n" /* cr00 += beta * c_r00 */ - "vmla.f32 q5, q1, q3\n" /* cr01 += beta * c_r01 */ - "vld1.32 {d0-d3}, [%[c_ptr1]]\n" /* load output r1 */ - "vmla.f32 q6, q0, q3\n" /* cr10 += beta * c_r10 */ - "vmla.f32 q7, q1, q3\n" /* cr11 += beta * c_r11 */ - "vld1.32 {d0-d3}, [%[c_ptr2]]\n" /* load output r2 */ - "vmla.f32 q8, q0, q3\n" /* cr20 += beta * c_r20 */ - "vmla.f32 q9, q1, q3\n" /* cr21 += beta * c_r21 */ - "vld1.32 {d0-d3}, [%[c_ptr3]]\n" /* load output r3 */ - "vmla.f32 q10, q0, q3\n" /* cr30 += beta * c_r30 */ - "vmla.f32 q11, q1, q3\n" /* cr31 += beta * c_r31 */ - "vld1.32 {d0-d3}, [%[c_ptr4]]\n" /* load output r4 */ - "vmla.f32 q12, q0, q3\n" /* cr40 += beta * c_r40 */ - "vmla.f32 q13, q1, q3\n" /* cr41 += beta * c_r41 */ - "vld1.32 {d0-d3}, [%[c_ptr5]]\n" /* load output r5 */ - "vmla.f32 q14, q0, q3\n" /* cr50 += beta * c_r50 */ - "vmla.f32 q15, q1, q3\n" /* cr51 += beta * c_r51 */ - "11: \n" /* check loop count */ - "vld1.32 {d0-d1}, [%[a_ptr] :64]! @ load a0~a3\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]! @ load b1\n" - "cmp %[k], #0 @ check weather k is bigger than " - "0\n" - "beq 0f @ jump to tail\n" - "1: @ main loop for k\n" - /* Unroll 0*/ - "vld1.32 {d2-d3}, [%[a_ptr] :64]! @ load a4, a5, and next a0, " - "a1\n" - "vmla.f32 q4, q2, d0[0] @ out0 += b1 * a0\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]! @ load b2\n" - "vmla.f32 q6, q2, d0[1] @ out1 += b1 * a1\n" - "vmla.f32 q8, q2, d1[0] @ out2 += b1 * a2\n" - "vmla.f32 q10, q2, d1[1] @ out3 += b1 * a3\n" - "vmla.f32 q12, q2, d2[0] @ out4 += b1 * a4\n" - "vmla.f32 q14, q2, d2[1] @ out5 += b1 * a5\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]! @ load b1\n" - "vmla.f32 q5, q3, d0[0] @ out6 += b2 * a0\n" - "vmla.f32 q7, q3, d0[1] @ out7 += b2 * a1\n" - "vmla.f32 q9, q3, d1[0] @ out8 += b2 * a2\n" - "vmla.f32 q11, q3, d1[1] @ out9 += b2 * a3\n" - "vld1.32 {d0-d1}, [%[a_ptr] :64]! @ load a2~a5\n" - "vmla.f32 q13, q3, d2[0] @ out10 += b2 * a4\n" - "vmla.f32 q15, q3, d2[1] @ out11 += b2 * a5\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]! @ load b2\n" - /* Unroll 1 */ - "vmla.f32 q4, q2, d3[0] @ out0 += b1 * a0\n" - "vmla.f32 q6, q2, d3[1] @ out1 += b1 * a1\n" - /*"pld [%[a_ptr], #64] @ preload a\n"*/ - "vmla.f32 q8, q2, d0[0] @ out2 += b1 * a2\n" - "vmla.f32 q10, q2, d0[1] @ out3 += b1 * a3\n" - /*"pld [%[b_ptr], #192]\n"*/ - "vmla.f32 q12, q2, d1[0] @ out4 += b1 * a4\n" - "vmla.f32 q14, q2, d1[1] @ out5 += b1 * a5\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]! @ load b1\n" - "vmla.f32 q5, q3, d3[0] @ out6 += b2 * a0\n" - "vmla.f32 q7, q3, d3[1] @ out7 += b2 * a1\n" - "vld1.32 {d2-d3}, [%[a_ptr] :64]! @ load a0~a3\n" - "vmla.f32 q9, q3, d0[0] @ out8 += b2 * a2\n" - "vmla.f32 q11, q3, d0[1] @ out9 += b2 * a3\n" - "vmla.f32 q13, q3, d1[0] @ out10 += b2 * a4\n" - "vmla.f32 q15, q3, d1[1] @ out11 += b2 * a5\n" - "vld1.32 {d0-d1}, [%[a_ptr] :64]! @ load a4, a5, a0, a1\n" - /* Unroll 2 */ - "vmla.f32 q4, q2, d2[0] @ out0 += b1 * a0\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]! @ load b2\n" - "vmla.f32 q6, q2, d2[1] @ out1 += b1 * a1\n" - "vmla.f32 q8, q2, d3[0] @ out2 += b1 * a2\n" - "vmla.f32 q10, q2, d3[1] @ out3 += b1 * a3\n" - /*"pld [%[a_ptr], #240] @ preload\n"*/ - "vmla.f32 q12, q2, d0[0] @ out4 += b1 * a4\n" - "vmla.f32 q14, q2, d0[1] @ out5 += b1 * a5\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]! 
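
The `has_beta` block at the start of this asm section, together with the bias broadcast before it and the `fmax`/`vmax` pass before the stores, means each micro-tile computes a fused out = relu_opt(bias + beta * C_old + A_panel * B_panel). A scalar reference of that contract (a sketch for checking the vector kernels, not the original code):

    #include <algorithm>

    // One output tile, scalar reference. lda/ldb/ldc are row strides; the real
    // kernels read A from a prepacked panel rather than a row-major matrix.
    void micro_tile_ref(const float* A, const float* B, float* C,
                        int M, int N, int K, int lda, int ldb, int ldc,
                        const float* bias, bool has_bias, float beta,
                        bool has_relu) {
      for (int m = 0; m < M; ++m) {
        for (int n = 0; n < N; ++n) {
          float acc = has_bias ? bias[m] : 0.f;
          acc += beta * C[m * ldc + n];  // skipped entirely when beta == 0
          for (int k = 0; k < K; ++k) {
            acc += A[m * lda + k] * B[k * ldb + n];
          }
          C[m * ldc + n] = has_relu ? std::max(acc, 0.f) : acc;
        }
      }
    }
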
@ load b1\n" - "vmla.f32 q5, q3, d2[0] @ out6 += b2 * a0\n" - "vmla.f32 q7, q3, d2[1] @ out7 += b2 * a1\n" - /*"pld [%[b_ptr], #208]\n"*/ - "vmla.f32 q9, q3, d3[0] @ out8 += b2 * a2\n" - "vmla.f32 q11, q3, d3[1] @ out9 += b2 * a3\n" - "vld1.32 {d2-d3}, [%[a_ptr] :64]! @ load a2~a5\n" - "vmla.f32 q13, q3, d0[0] @ out10 += b2 * a4\n" - "vmla.f32 q15, q3, d0[1] @ out11 += b2 * a5\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]! @ load b2\n" - /* Unroll 3 */ - "vmla.f32 q4, q2, d1[0] @ out0 += b1 * a0\n" - "vmla.f32 q6, q2, d1[1] @ out1 += b1 * a1\n" - "vmla.f32 q8, q2, d2[0] @ out2 += b1 * a2\n" - "vmla.f32 q10, q2, d2[1] @ out3 += b1 * a3\n" - "vmla.f32 q12, q2, d3[0] @ out4 += b1 * a4\n" - "vmla.f32 q14, q2, d3[1] @ out5 += b1 * a5\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]! @ load b1\n" - "vmla.f32 q5, q3, d1[0] @ out6 += b2 * a0\n" - "vmla.f32 q7, q3, d1[1] @ out7 += b2 * a1\n" - "vld1.32 {d0-d1}, [%[a_ptr] :64]! @ load a0~a3\n" - "vmla.f32 q9, q3, d2[0] @ out8 += b2 * a2\n" - "vmla.f32 q11, q3, d2[1] @ out9 += b2 * a3\n" - "subs %[k], %[k], #1 @ k--\n" - "vmla.f32 q13, q3, d3[0] @ out10 += b2 * a4\n" - "vmla.f32 q15, q3, d3[1] @ out11 += b2 * a5\n" - "bne 1b @ jump to main loop\n" - "0: @ process tail\n" - "subs %[tails], %[tails], #1 @ tail--\n" - "beq 3f @ jump to tail = 1\n" - /* Unroll 0*/ - "vld1.32 {d6-d7}, [%[b_ptr] :128]! @ load b2\n" - "vmla.f32 q4, q2, d0[0] @ out0 += b1 * a0\n" - "vld1.32 {d2-d3}, [%[a_ptr] :64]! @ load a4,5, a0, a1\n" - "vmla.f32 q6, q2, d0[1] @ out1 += b1 * a1\n" - "vmla.f32 q8, q2, d1[0] @ out2 += b1 * a2\n" - "vmla.f32 q10, q2, d1[1] @ out3 += b1 * a3\n" - "vmla.f32 q12, q2, d2[0] @ out4 += b1 * a4\n" - "subs %[tails], %[tails], #1 @ tail--\n" - "vmla.f32 q14, q2, d2[1] @ out5 += b1 * a5\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]! @ load b1\n" - "vmla.f32 q5, q3, d0[0] @ out6 += b2 * a0\n" - "vmla.f32 q7, q3, d0[1] @ out7 += b2 * a1\n" - "vmla.f32 q9, q3, d1[0] @ out8 += b2 * a2\n" - "vmla.f32 q11, q3, d1[1] @ out9 += b2 * a3\n" - "vld1.32 {d0-d1}, [%[a_ptr] :64]! @ load a2~a5\n" - "vmla.f32 q13, q3, d2[0] @ out10 += b2 * a4\n" - "vmla.f32 q15, q3, d2[1] @ out11 += b2 * a5\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]! @ load b2\n" - "beq 4f @ jump to tail==2\n" - /* Unroll 1*/ - "vmla.f32 q4, q2, d3[0] @ out0 += b1 * a0\n" - "vmla.f32 q6, q2, d3[1] @ out1 += b1 * a1\n" - "subs %[tails], %[tails], #1 @ tail--\n" - "vmla.f32 q8, q2, d0[0] @ out2 += b1 * a2\n" - "vmla.f32 q10, q2, d0[1] @ out3 += b1 * a3\n" - "vmla.f32 q12, q2, d1[0] @ out4 += b1 * a4\n" - "vmla.f32 q14, q2, d1[1] @ out5 += b1 * a5\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]! @ load b1\n" - "vmla.f32 q5, q3, d3[0] @ out6 += b2 * a0\n" - "vmla.f32 q7, q3, d3[1] @ out7 += b2 * a1\n" - "vld1.32 {d2-d3}, [%[a_ptr] :64]! @ load a0~a3\n" - "vmla.f32 q9, q3, d0[0] @ out8 += b2 * a2\n" - "vmla.f32 q11, q3, d0[1] @ out9 += b2 * a3\n" - "vmla.f32 q13, q3, d1[0] @ out10 += b2 * a4\n" - "vmla.f32 q15, q3, d1[1] @ out11 += b2 * a5\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]! @ load b2\n" - "beq 5f @ jump to tail==3\n" - /* Unroll 2 */ - "vld1.32 {d0-d1}, [%[a_ptr] :64]! @ load a4,a5, a0,a1\n" - "vmla.f32 q4, q2, d2[0] @ out0 += b1 * a0\n" - "vmla.f32 q6, q2, d2[1] @ out1 += b1 * a1\n" - "vmla.f32 q8, q2, d3[0] @ out2 += b1 * a2\n" - "vmla.f32 q10, q2, d3[1] @ out3 += b1 * a3\n" - "vmla.f32 q12, q2, d0[0] @ out4 += b1 * a4\n" - "vmla.f32 q14, q2, d0[1] @ out5 += b1 * a5\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]! 
@ load b1\n" - "vmla.f32 q5, q3, d2[0] @ out6 += b2 * a0\n" - "vmla.f32 q7, q3, d2[1] @ out7 += b2 * a1\n" - "vmla.f32 q9, q3, d3[0] @ out8 += b2 * a2\n" - "vmla.f32 q11, q3, d3[1] @ out9 += b2 * a3\n" - "vld1.32 {d2-d3}, [%[a_ptr] :64]! @ load a2~a5\n" - "vmla.f32 q13, q3, d0[0] @ out10 += b2 * a4\n" - "vmla.f32 q15, q3, d0[1] @ out11 += b2 * a5\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]! @ load b2\n" - /* Unroll 3*/ - "vmla.f32 q4, q2, d1[0] @ out0 += b1 * a0\n" - "vmla.f32 q6, q2, d1[1] @ out1 += b1 * a1\n" - "vmla.f32 q8, q2, d2[0] @ out2 += b1 * a2\n" - "vmla.f32 q10, q2, d2[1] @ out3 += b1 * a3\n" - "vmla.f32 q12, q2, d3[0] @ out4 += b1 * a4\n" - "vmla.f32 q14, q2, d3[1] @ out5 += b1 * a5\n" - "vmla.f32 q5, q3, d1[0] @ out6 += b2 * a0\n" - "vmla.f32 q7, q3, d1[1] @ out7 += b2 * a1\n" - "vmla.f32 q9, q3, d2[0] @ out8 += b2 * a2\n" - "vmla.f32 q11, q3, d2[1] @ out9 += b2 * a3\n" - "vmla.f32 q13, q3, d3[0] @ out10 += b2 * a4\n" - "vmla.f32 q15, q3, d3[1] @ out11 += b2 * a5\n" - "b 2f\n" - /* tails==1 final tail*/ - "3: @ tail=1\n" - "vmla.f32 q4, q2, d0[0] @ out0 += b1 * a0\n" - "vld1.32 {d2}, [%[a_ptr] :64]! @ load a4,a5\n" - "vmla.f32 q6, q2, d0[1] @ out1 += b1 * a1\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]! @ load b2\n" - "vmla.f32 q8, q2, d1[0] @ out2 += b1 * a2\n" - "vmla.f32 q10, q2, d1[1] @ out3 += b1 * a3\n" - "vmla.f32 q12, q2, d2[0] @ out4 += b1 * a4\n" - "vmla.f32 q14, q2, d2[1] @ out5 += b1 * a5\n" - "vmla.f32 q5, q3, d0[0] @ out6 += b2 * a0\n" - "vmla.f32 q7, q3, d0[1] @ out7 += b2 * a1\n" - "vmla.f32 q9, q3, d1[0] @ out8 += b2 * a2\n" - "vmla.f32 q11, q3, d1[1] @ out9 += b2 * a3\n" - "vmla.f32 q13, q3, d2[0] @ out10 += b2 * a4\n" - "vmla.f32 q15, q3, d2[1] @ out11 += b2 * a5\n" - "b 2f @ jump to end\n" - /* tails==2 final tail*/ - "4: @ tail == 2\n" - "vmla.f32 q4, q2, d3[0] @ out0 += b1 * a0\n" - "vmla.f32 q6, q2, d3[1] @ out1 += b1 * a1\n" - "vmla.f32 q8, q2, d0[0] @ out2 += b1 * a2\n" - "vmla.f32 q10, q2, d0[1] @ out3 += b1 * a3\n" - "vmla.f32 q12, q2, d1[0] @ out4 += b1 * a4\n" - "vmla.f32 q14, q2, d1[1] @ out5 += b1 * a5\n" - "vmla.f32 q5, q3, d3[0] @ out6 += b2 * a0\n" - "vmla.f32 q7, q3, d3[1] @ out7 += b2 * a1\n" - "vmla.f32 q9, q3, d0[0] @ out8 += b2 * a2\n" - "vmla.f32 q11, q3, d0[1] @ out9 += b2 * a3\n" - "vmla.f32 q13, q3, d1[0] @ out10 += b2 * a4\n" - "vmla.f32 q15, q3, d1[1] @ out11 += b2 * a5\n" - "b 2f @ jump to end\n" - /* tails==3 final tail*/ - "5: @ tail=3\n" - "vmla.f32 q4, q2, d2[0] @ out0 += b1 * a0\n" - "vld1.32 {d0}, [%[a_ptr] :64]! 
@ load a4,a5\n" - "vmla.f32 q6, q2, d2[1] @ out1 += b1 * a1\n" - "vmla.f32 q8, q2, d3[0] @ out2 += b1 * a2\n" - "vmla.f32 q10, q2, d3[1] @ out3 += b1 * a3\n" - "vmla.f32 q12, q2, d0[0] @ out4 += b1 * a4\n" - "vmla.f32 q14, q2, d0[1] @ out5 += b1 * a5\n" - "vmla.f32 q5, q3, d2[0] @ out6 += b2 * a0\n" - "vmla.f32 q7, q3, d2[1] @ out7 += b2 * a1\n" - "vmla.f32 q9, q3, d3[0] @ out8 += b2 * a2\n" - "vmla.f32 q11, q3, d3[1] @ out9 += b2 * a3\n" - "vmla.f32 q13, q3, d0[0] @ out10 += b2 * a4\n" - "vmla.f32 q15, q3, d0[1] @ out11 += b2 * a5\n" - "2: @ check relu\n" - "cmp %[relu], #0 @ check if has relu\n" - "ble 6f @ skip relu if relu <= 0\n" - "vmov.u32 q0, #0 @ for relu\n" - "vmax.f32 q4, q4, q0 @ for relu\n" - "vmax.f32 q5, q5, q0 @ for relu\n" - "vmax.f32 q6, q6, q0 @ for relu\n" - "vmax.f32 q7, q7, q0 @ for relu\n" - "vmax.f32 q8, q8, q0 @ for relu\n" - "vmax.f32 q9, q9, q0 @ for relu\n" - "vmax.f32 q10, q10, q0 @ for relu\n" - "vmax.f32 q11, q11, q0 @ for relu\n" - "vmax.f32 q12, q12, q0 @ for relu\n" - "vmax.f32 q13, q13, q0 @ for relu\n" - "vmax.f32 q14, q14, q0 @ for relu\n" - "vmax.f32 q15, q15, q0 @ for relu\n" - "6: @ store result\n" - "vst1.32 {d8-d11}, [%[c_ptr0]]! @ store r0\n" - "vst1.32 {d12-d15}, [%[c_ptr1]]! @ store r1\n" - "vst1.32 {d16-d19}, [%[c_ptr2]]! @ store r2\n" - "vst1.32 {d20-d23}, [%[c_ptr3]]! @ store r3\n" - "vst1.32 {d24-d27}, [%[c_ptr4]]! @ store r4\n" - "vst1.32 {d28-d31}, [%[c_ptr5]]! @ store r5\n" - : [a_ptr] "+r"(a_ptr), - [b_ptr] "+r"(b_ptr), - [c_ptr0] "+r"(c_ptr0), - [c_ptr1] "+r"(c_ptr1), - [c_ptr2] "+r"(c_ptr2), - [c_ptr3] "+r"(c_ptr3), - [c_ptr4] "+r"(c_ptr4), - [c_ptr5] "+r"(c_ptr5), - [k] "+r"(k), - [tails] "+r"(tails) - : [bias_ptr] "r"(bias_local), - [relu] "r"(has_relu), - [has_beta] "r"(has_beta), - [beta] "r"(beta) - : "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15", - "cc", - "memory"); - - if (flag_p_remain && (xb == bblocks - 1)) { - for (int i = 0; i < remain; ++i) { - *pout0++ = cout0[i]; - *pout1++ = cout1[i]; - *pout2++ = cout2[i]; - *pout3++ = cout3[i]; - *pout4++ = cout4[i]; - *pout5++ = cout5[i]; - } - } - } - } - } -} - -void sgemm_prepacked_4x8(bool is_transB, - int M, - int N, - int K, - const float* A_packed, - const float* B, - int ldb, - float beta, - float* C, - int ldc, - const float* bias, - bool has_bias, - bool has_relu, - ARMContext* ctx) { - size_t l2_cache = ctx->llc_size() > 0 ? ctx->llc_size() : 512 * 1024; - auto* workspace = ctx->workspace_data(); - int threads = ctx->threads(); - //! MBLOCK * x (result) + MBLOCK * k (A) + x * k (B) = l2 - int x_block = - (l2_cache - (MBLOCK_A73 * K)) / (sizeof(float) * (K + MBLOCK_A73)); - x_block /= NBLOCK; - x_block *= NBLOCK; - int x_num = (N + (x_block - 1)) / x_block; - x_block = (N + x_num - 1) / x_num; - x_block = (x_block + NBLOCK - 1) / NBLOCK; - x_block *= NBLOCK; - x_block = x_block < NBLOCK ? NBLOCK : x_block; - - int k_pre = ((K + KBLOCK - 1) / KBLOCK) - 1; - int tail_pre = (K & (KBLOCK - 1)); - if (tail_pre == 0) { - tail_pre = KBLOCK; - } - - bool flag_p_remain = false; - int remain = 0; - - int has_beta = fabsf(beta) > 1e-8f ? 1 : 0; - - //! apanel is pre_compute outside gemm - for (unsigned int x0 = 0; x0 < N; x0 += x_block) { - unsigned int xmax = x0 + x_block; - if (xmax > N) { - xmax = N; - } - int bblocks = (xmax - x0 + NBLOCK - 1) / NBLOCK; - remain = xmax - x0 - (bblocks - 1) * NBLOCK; - if (remain > 0) { - flag_p_remain = true; - } - //! 
load bpanel
-      auto b_pannel = static_cast<float*>(workspace);
-      if (is_transB) {
-        loadb_trans(b_pannel, B, ldb, 0, K, x0, xmax);
-      } else {
-        loadb(b_pannel, B, ldb, 0, K, x0, xmax);
-      }
-#pragma omp parallel for num_threads(threads)
-      for (unsigned int y = 0; y < M; y += MBLOCK_A73) {
-        unsigned int ymax = y + MBLOCK_A73;
-        if (ymax > M) {
-          ymax = M;
-        }
-
-        float cout0[NBLOCK];
-        float cout1[NBLOCK];
-        float cout2[NBLOCK];
-        float cout3[NBLOCK];
-
-        float bias_local[4] = {0};
-        if (has_bias) {
-          bias_local[0] = bias[y];
-          bias_local[1] = bias[y + 1];
-          bias_local[2] = bias[y + 2];
-          bias_local[3] = bias[y + 3];
-        }
-
-        float* c_ptr0 = C + y * ldc + x0;
-        float* c_ptr1 = c_ptr0 + ldc;
-        float* c_ptr2 = c_ptr1 + ldc;
-        float* c_ptr3 = c_ptr2 + ldc;
-
-        float* pout0 = c_ptr0;
-        float* pout1 = c_ptr1;
-        float* pout2 = c_ptr2;
-        float* pout3 = c_ptr3;
-
-        const float* a_ptr_l = A_packed + y * K;
-        const float* b_ptr = b_pannel;
-        for (int xb = 0; xb < bblocks; xb++) {
-          if ((y + 3) >= ymax) {
-            switch ((y + 3) - ymax) {
-              case 2:
-                c_ptr1 = cout1;
-              case 1:
-                c_ptr2 = cout1;
-              case 0:
-                c_ptr3 = cout1;
-              default:
-                break;
-            }
-          }
-          if (flag_p_remain && (xb == bblocks - 1)) {
-            pout0 = c_ptr0;
-            pout1 = c_ptr1;
-            pout2 = c_ptr2;
-            pout3 = c_ptr3;
-
-            c_ptr0 = cout0;
-            c_ptr1 = cout1;
-            c_ptr2 = cout2;
-            c_ptr3 = cout3;
-
-            if (has_beta) {
-              for (int i = 0; i < remain; ++i) {
-                cout0[i] = pout0[i];
-                cout1[i] = pout1[i];
-                cout2[i] = pout2[i];
-                cout3[i] = pout3[i];
-              }
-            }
-          }
-          const float* a_ptr = a_ptr_l;
-          int tails = tail_pre;
-          int k = k_pre;
-          asm volatile(
-              "vld1.32 {d4-d5}, [%[bias_ptr]]   @ load bias\n"
-              "vdup.32 q8, d4[0]                @ add bias to out00\n"
-              "pld [%[a_ptr]]                   @ preload a, 64byte\n"
-              "vdup.32 q9, d4[0]                @ add bias to out01\n"
-              "pld [%[b_ptr]]                   @ preload b\n"
-              "vdup.32 q10, d4[1]               @ add bias to out10\n"
-              "pld [%[a_ptr], #64]              @ preload a\n"
-              "vdup.32 q11, d4[1]               @ add bias to out11\n"
-              "vdup.32 q12, d5[0]               @ add bias to out20\n"
-              "pld [%[b_ptr], #64]              @ preload b\n"
-              "vdup.32 q13, d5[0]               @ add bias to out21\n"
-              "pld [%[a_ptr], #128]             @ preload a\n"
-              "vdup.32 q14, d5[1]               @ add bias to out30\n"
-              "pld [%[b_ptr], #128]             @ preload b\n"
-              "vdup.32 q15, d5[1]               @ add bias to out31\n"
-              "pld [%[b_ptr], #192]             @ preload b\n"
-              "cmp %[has_beta], #0\n"
-              "beq 11f\n" /* check beta == 0? */
-              /* process beta */
-              "vdup.32 q4, %[beta]\n"           /* beta to vector */
-              "vld1.32 {d0-d3}, [%[c_ptr0]]\n"  /* load output r0 */
-              "vld1.32 {d4-d7}, [%[c_ptr1]]\n"  /* load output r1 */
-              "vmla.f32 q8, q0, q4\n"           /* cr00 += beta * c_r00 */
-              "vmla.f32 q9, q1, q4\n"           /* cr01 += beta * c_r01 */
-              "vld1.32 {d0-d3}, [%[c_ptr2]]\n"  /* load output r2 */
-              "vmla.f32 q10, q2, q4\n"          /* cr10 += beta * c_r10 */
-              "vmla.f32 q11, q3, q4\n"          /* cr11 += beta * c_r11 */
-              "vld1.32 {d4-d7}, [%[c_ptr3]]\n"  /* load output r3 */
-              "vmla.f32 q12, q0, q4\n"          /* cr20 += beta * c_r20 */
-              "vmla.f32 q13, q1, q4\n"          /* cr21 += beta * c_r21 */
-              "vmla.f32 q14, q2, q4\n"          /* cr30 += beta * c_r30 */
-              "vmla.f32 q15, q3, q4\n"          /* cr31 += beta * c_r31 */
-              "11: \n"                          /* check loop count */
-              "vld1.32 {d0-d3}, [%[a_ptr] :128]!   @ load a0~a3\n"
-              "vld1.32 {d8-d11}, [%[b_ptr] :128]!  @ load b1\n"
-              "cmp %[k], #0                     @ check whether k is bigger than 0\n"
-              "beq 0f                           @ jump to tail\n"
-              "1:                               @ main loop for k\n"
-              /* Unroll 0 */
-              "vld1.32 {d12-d15}, [%[b_ptr] :128]! @ load next b1, b2\n"
-              "vmla.f32 q8, q4, d0[0]           @ out0 += b1 * a0\n"
-              "vld1.32 {d4-d7}, [%[a_ptr] :128]!
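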
@ load next 2xa0~a3\n" - "vmla.f32 q10, q4, d0[1] @ out1 += b1 * a1\n" - "vmla.f32 q12, q4, d1[0] @ out2 += b1 * a2\n" - "vmla.f32 q14, q4, d1[1] @ out3 += b1 * a3\n" - "vmla.f32 q9, q5, d0[0] @ out4 += b2 * a0\n" - "vmla.f32 q11, q5, d0[1] @ out5 += b2 * a1\n" - "vmla.f32 q13, q5, d1[0] @ out6 += b2 * a2\n" - "vmla.f32 q15, q5, d1[1] @ out7 += b2 * a3\n" - "vld1.32 {d8-d11}, [%[b_ptr] :128]! @ load next b1, b2\n" - /* Unroll 1 */ - "vmla.f32 q8, q6, d2[0] @ out0 += b1 * a0\n" - "pld [%[b_ptr], #64] @ preload b\n" - "vmla.f32 q10, q6, d2[1] @ out1 += b1 * a1\n" - "vmla.f32 q12, q6, d3[0] @ out2 += b1 * a2\n" - "vmla.f32 q14, q6, d3[1] @ out3 += b1 * a3\n" - "vmla.f32 q9, q7, d2[0] @ out6 += b2 * a0\n" - "vmla.f32 q11, q7, d2[1] @ out7 += b2 * a1\n" - "vmla.f32 q13, q7, d3[0] @ out8 += b2 * a2\n" - "vmla.f32 q15, q7, d3[1] @ out9 += b2 * a3\n" - "vld1.32 {d12-d15}, [%[b_ptr] :128]! @ load next b1,b2\n" - /* Unroll 2 */ - "vmla.f32 q8, q4, d4[0] @ out0 += b1 * a0\n" - "vld1.32 {d0-d3}, [%[a_ptr] :128]! @ load next a0~a3\n" - "vmla.f32 q10, q4, d4[1] @ out1 += b1 * a1\n" - "vmla.f32 q12, q4, d5[0] @ out2 += b1 * a2\n" - "vmla.f32 q14, q4, d5[1] @ out3 += b1 * a3\n" - "vmla.f32 q9, q5, d4[0] @ out4 += b2 * a0\n" - "vmla.f32 q11, q5, d4[1] @ out5 += b2 * a1\n" - "vmla.f32 q13, q5, d5[0] @ out6 += b2 * a2\n" - "vmla.f32 q15, q5, d5[1] @ out7 += b2 * a3\n" - "vld1.32 {d8-d11}, [%[b_ptr] :128]! @ load next b1, b2\n" - /* Unroll 3 */ - "vmla.f32 q8, q6, d6[0] @ out0 += b1 * a0\n" - "pld [%[a_ptr], #64] @ preload a\n" - "vmla.f32 q10, q6, d6[1] @ out1 += b1 * a1\n" - "vmla.f32 q12, q6, d7[0] @ out2 += b1 * a2\n" - "vmla.f32 q14, q6, d7[1] @ out3 += b1 * a3\n" - "vmla.f32 q9, q7, d6[0] @ out4 += b2 * a0\n" - "vmla.f32 q11, q7, d6[1] @ out5 += b2 * a1\n" - "vmla.f32 q13, q7, d7[0] @ out6 += b2 * a2\n" - "vmla.f32 q15, q7, d7[1] @ out7 += b2 * a3\n" - "subs %[k], %[k], #1 @ k--\n" - "bne 1b @ jump to main loop\n" - "0: @ process tail\n" - "subs %[tails], %[tails], #1 @ tail--\n" - "beq 3f @ jump to tail = 1\n" - /* Unroll 0*/ - "vld1.32 {d12-d15}, [%[b_ptr] :128]! @ load next b1, b2\n" - "vmla.f32 q8, q4, d0[0] @ out0 += b1 * a0\n" - "vmla.f32 q10, q4, d0[1] @ out1 += b1 * a1\n" - "subs %[tails], %[tails], #1 @ tail--\n" - "vmla.f32 q12, q4, d1[0] @ out2 += b1 * a2\n" - "vmla.f32 q14, q4, d1[1] @ out3 += b1 * a3\n" - "vmla.f32 q9, q5, d0[0] @ out4 += b2 * a0\n" - "vmla.f32 q11, q5, d0[1] @ out5 += b2 * a1\n" - "vmla.f32 q13, q5, d1[0] @ out6 += b2 * a2\n" - "vmla.f32 q15, q5, d1[1] @ out7 += b2 * a3\n" - "beq 4f @ jump to tail==2\n" - /* Unroll 1 */ - "vld1.32 {d8-d11}, [%[b_ptr] :128]! @ load next b1, b2\n" - "vmla.f32 q8, q6, d2[0] @ out0 += b1 * a0\n" - "vld1.32 {d4-d7}, [%[a_ptr] :128]! @ load next 2xa0~a3\n" - "vmla.f32 q10, q6, d2[1] @ out1 += b1 * a1\n" - "subs %[tails], %[tails], #1 @ tail--\n" - "vmla.f32 q12, q6, d3[0] @ out2 += b1 * a2\n" - "vmla.f32 q14, q6, d3[1] @ out3 += b1 * a3\n" - "vmla.f32 q9, q7, d2[0] @ out6 += b2 * a0\n" - "vmla.f32 q11, q7, d2[1] @ out7 += b2 * a1\n" - "vmla.f32 q13, q7, d3[0] @ out8 += b2 * a2\n" - "vmla.f32 q15, q7, d3[1] @ out9 += b2 * a3\n" - "beq 5f @ jump to tail==3\n" - /* Unroll 2 */ - "vld1.32 {d12-d15}, [%[b_ptr] :128]! 
@ load next b1,b2\n" - "vmla.f32 q8, q4, d4[0] @ out0 += b1 * a0\n" - "vmla.f32 q10, q4, d4[1] @ out1 += b1 * a1\n" - "vmla.f32 q12, q4, d5[0] @ out2 += b1 * a2\n" - "vmla.f32 q14, q4, d5[1] @ out3 += b1 * a3\n" - "vmla.f32 q9, q5, d4[0] @ out4 += b2 * a0\n" - "vmla.f32 q11, q5, d4[1] @ out5 += b2 * a1\n" - "vmla.f32 q13, q5, d5[0] @ out6 += b2 * a2\n" - "vmla.f32 q15, q5, d5[1] @ out7 += b2 * a3\n" - /* Unroll 3 */ - "vmla.f32 q8, q6, d6[0] @ out0 += b1 * a0\n" - "vmla.f32 q10, q6, d6[1] @ out1 += b1 * a1\n" - "vmla.f32 q12, q6, d7[0] @ out2 += b1 * a2\n" - "vmla.f32 q14, q6, d7[1] @ out3 += b1 * a3\n" - "vmla.f32 q9, q7, d6[0] @ out4 += b2 * a0\n" - "vmla.f32 q11, q7, d6[1] @ out5 += b2 * a1\n" - "vmla.f32 q13, q7, d7[0] @ out6 += b2 * a2\n" - "vmla.f32 q15, q7, d7[1] @ out7 += b2 * a3\n" - "b 2f\n" - /* tails==1 final tail */ - "3: @ tail=1\n" - "vmla.f32 q8, q4, d0[0] @ out0 += b1 * a0\n" - "vmla.f32 q10, q4, d0[1] @ out1 += b1 * a1\n" - "vmla.f32 q12, q4, d1[0] @ out2 += b1 * a2\n" - "vmla.f32 q14, q4, d1[1] @ out3 += b1 * a3\n" - "vmla.f32 q9, q5, d0[0] @ out4 += b2 * a0\n" - "vmla.f32 q11, q5, d0[1] @ out5 += b2 * a1\n" - "vmla.f32 q13, q5, d1[0] @ out6 += b2 * a2\n" - "vmla.f32 q15, q5, d1[1] @ out7 += b2 * a3\n" - /*aptr - 16 */ - "sub %[a_ptr], %[a_ptr], #16 @ tail--\n" - "b 2f @ jump to end\n" - /* tails==2 final tail*/ - "4: @ tail == 2\n" - "vmla.f32 q8, q6, d2[0] @ out0 += b1 * a0\n" - "vmla.f32 q10, q6, d2[1] @ out1 += b1 * a1\n" - "vmla.f32 q12, q6, d3[0] @ out2 += b1 * a2\n" - "vmla.f32 q14, q6, d3[1] @ out3 += b1 * a3\n" - "vmla.f32 q9, q7, d2[0] @ out4 += b2 * a0\n" - "vmla.f32 q11, q7, d2[1] @ out5 += b2 * a1\n" - "vmla.f32 q13, q7, d3[0] @ out6 += b2 * a2\n" - "vmla.f32 q15, q7, d3[1] @ out7 += b2 * a3\n" - "b 2f @ jump to end\n" - /* tails==3 final tail*/ - "5: @ tail=3\n" - "vmla.f32 q8, q4, d4[0] @ out0 += b1 * a0\n" - "vmla.f32 q10, q4, d4[1] @ out1 += b1 * a1\n" - "vmla.f32 q12, q4, d5[0] @ out2 += b1 * a2\n" - "vmla.f32 q14, q4, d5[1] @ out3 += b1 * a3\n" - "vmla.f32 q9, q5, d4[0] @ out4 += b2 * a0\n" - "vmla.f32 q11, q5, d4[1] @ out5 += b2 * a1\n" - "vmla.f32 q13, q5, d5[0] @ out6 += b2 * a2\n" - "vmla.f32 q15, q5, d5[1] @ out7 += b2 * a3\n" - /*aptr - 16*/ - "sub %[a_ptr], %[a_ptr], #16 @ tail--\n" - "2: @ check relu\n" - "cmp %[relu], #0 @ check if has relu\n" - "ble 6f @ skip relu if relu <= 0\n" - "vmov.u32 q0, #0 @ for relu\n" - "vmax.f32 q8, q8, q0 @ for relu\n" - "vmax.f32 q9, q9, q0 @ for relu\n" - "vmax.f32 q10, q10, q0 @ for relu\n" - "vmax.f32 q11, q11, q0 @ for relu\n" - "vmax.f32 q12, q12, q0 @ for relu\n" - "vmax.f32 q13, q13, q0 @ for relu\n" - "vmax.f32 q14, q14, q0 @ for relu\n" - "vmax.f32 q15, q15, q0 @ for relu\n" - "6: @ store result\n" - "vst1.32 {d16-d19}, [%[c_ptr0]]! @ store r0\n" - "vst1.32 {d20-d23}, [%[c_ptr1]]! @ store r1\n" - "vst1.32 {d24-d27}, [%[c_ptr2]]! @ store r2\n" - "vst1.32 {d28-d31}, [%[c_ptr3]]! 
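
Throughout these kernels, the last B strip of a panel may cover fewer than NBLOCK columns. The asm then writes its full-width tile into the cout scratch rows, and only `remain` values per row are copied back through the pout pointers in the loop that follows each asm block. A sketch of that writeback, with the array-of-rows packaging being my own:

    // Copy the valid prefix of each scratch row back to the real C rows.
    // cout[r] is an NBLOCK-wide scratch row; pout[r] points into C.
    void copy_remainder(float* const cout[], float* pout[],
                        int rows, int remain) {
      for (int r = 0; r < rows; ++r) {
        for (int i = 0; i < remain; ++i) {
          *pout[r]++ = cout[r][i];
        }
      }
    }
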
@ store r3\n" - : [a_ptr] "+r"(a_ptr), - [b_ptr] "+r"(b_ptr), - [c_ptr0] "+r"(c_ptr0), - [c_ptr1] "+r"(c_ptr1), - [c_ptr2] "+r"(c_ptr2), - [c_ptr3] "+r"(c_ptr3), - [k] "+r"(k), - [tails] "+r"(tails) - : [bias_ptr] "r"(bias_local), - [relu] "r"(has_relu), - [has_beta] "r"(has_beta), - [beta] "r"(beta) - : "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15", - "cc", - "memory"); - - if (flag_p_remain && (xb == bblocks - 1)) { - for (int i = 0; i < remain; ++i) { - *pout0++ = cout0[i]; - *pout1++ = cout1[i]; - *pout2++ = cout2[i]; - *pout3++ = cout3[i]; - } - } - } - } - } -} -#endif // __aarch64__ - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/packed_sgemm.h b/lite/backends/arm/math/packed_sgemm.h deleted file mode 100644 index 396ca7beb9..0000000000 --- a/lite/backends/arm/math/packed_sgemm.h +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include "lite/core/context.h" -#include "lite/core/device_info.h" -#include "lite/core/tensor.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -#ifdef __aarch64__ -constexpr int MBLOCK = 8; -constexpr int NBLOCK = 12; -constexpr int KBLOCK = 4; -inline int get_hblock(ARMArch arch) { return MBLOCK; } -#else -constexpr int MBLOCK_A73 = 4; -constexpr int MBLOCK_OTH = 6; -constexpr int NBLOCK = 8; -constexpr int KBLOCK = 4; -inline int get_hblock(ARMArch arch) { - if (arch == kA73) { - return MBLOCK_A73; - } else { - return MBLOCK_OTH; - } -} -#endif // __aarch64__ - -void prepackA(float* out, - const float* in, - float alpha, - int ldin, - int m0, - int mmax, - int k0, - int kmax, - bool is_trans, - ARMContext* ctx); - -void prepackA(TensorLite* tout, - const TensorLite& tin, - float alpha, - int m, - int k, - int group, - bool is_trans, - ARMContext* ctx); - -void sgemm_prepack(bool is_transB, - int M, - int N, - int K, - const float* A_packed, - const float* B, - int ldb, - float beta, - float* C, - int ldc, - const float* bias, - bool has_bias, - bool has_relu, - ARMContext* ctx); - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/pad2d.cc b/lite/backends/arm/math/pad2d.cc deleted file mode 100644 index 35c4fafb77..0000000000 --- a/lite/backends/arm/math/pad2d.cc +++ /dev/null @@ -1,413 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/arm/math/pad2d.h" -#include -#include -#include -#include "lite/backends/arm/math/funcs.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void pad_constant(const float* din, - float* dout, - int n, - int c, - int h, - int w, - const int pad_top, - const int pad_bottom, - const int pad_left, - const int pad_right, - const float pad_value) { - int h_in = h - pad_top - pad_bottom; - int w_in = w - pad_left - pad_right; - int spatial_size_out = w * h; - int spatial_size_in = h_in * w_in; -#pragma omp parallel for - for (int s = 0; s < n * c; ++s) { - const float* din_s = din + s * spatial_size_in; - float* dout_s = dout + s * spatial_size_out; - int top_loop = (w * pad_top) >> 3; - int top_loop_remain = (w * pad_top) & 7; - float32x4_t vpad_value = vdupq_n_f32(pad_value); - // process top - for (int i = 0; i < top_loop; ++i) { - vst1q_f32(dout_s, vpad_value); - vst1q_f32(dout_s + 4, vpad_value); - dout_s += 8; - } - for (int i = 0; i < top_loop_remain; ++i) { - *dout_s++ = pad_value; - } - // process med - int left_loop = pad_left >> 2; - int left_loop_remain = pad_left & 3; - int med_loop = w_in >> 3; - int med_loop_remain = w_in & 7; - for (int i = 0; i < left_loop; ++i) { - vst1q_f32(dout_s, vpad_value); - dout_s += 4; - } - - for (int i = 0; i < left_loop_remain; ++i) { - *dout_s++ = pad_value; - } - - for (int i = 0; i < med_loop; ++i) { - float32x4_t val = vld1q_f32(din_s); - float32x4_t val1 = vld1q_f32(din_s + 4); - vst1q_f32(dout_s, val); - vst1q_f32(dout_s + 4, val1); - dout_s += 8; - din_s += 8; - } - for (int i = 0; i < med_loop_remain; ++i) { - float val = *din_s++; - *dout_s++ = val; - } - - int loop = (pad_right + pad_left) >> 2; - int loop_remain = (pad_right + pad_left) & 3; - for (int j = 0; j < h_in - 1; ++j) { - for (int i = 0; i < loop; ++i) { - vst1q_f32(dout_s, vpad_value); - dout_s += 4; - } - - for (int i = 0; i < loop_remain; ++i) { - *dout_s++ = pad_value; - } - - for (int i = 0; i < med_loop; ++i) { - float32x4_t val = vld1q_f32(din_s); - float32x4_t val1 = vld1q_f32(din_s + 4); - vst1q_f32(dout_s, val); - vst1q_f32(dout_s + 4, val1); - dout_s += 8; - din_s += 8; - } - - for (int i = 0; i < med_loop_remain; ++i) { - *dout_s++ = *din_s++; - } - } - int right_loop = pad_right >> 2; - int right_loop_remain = pad_right & 3; - - for (int i = 0; i < right_loop; ++i) { - vst1q_f32(dout_s, vpad_value); - dout_s += 4; - } - - for (int i = 0; i < right_loop_remain; ++i) { - *dout_s++ = pad_value; - } - // process bottom - int bottom_loop = (pad_bottom * w) >> 3; - int bottom_loop_remain = (pad_bottom * w) & 7; - for (int i = 0; i < bottom_loop; ++i) { - vst1q_f32(dout_s, vpad_value); - vst1q_f32(dout_s + 4, vpad_value); - dout_s += 8; - } - for (int i = 0; i < bottom_loop_remain; ++i) { - *dout_s++ = pad_value; - } - } -} - -void pad_edge(const float* din, - float* dout, - int n, - int c, - int h, - int w, - const int pad_top, - const int pad_bottom, - const int pad_left, - const int pad_right, - const float pad_value) { - int h_in = h - pad_top - pad_bottom; - int w_in = w - pad_left - pad_right; - 
int spatial_size_out = w * h; - int spatial_size_in = h_in * w_in; -#pragma omp parallel for - for (int s = 0; s < n * c; ++s) { - const float* din_s = din + s * spatial_size_in; - float* dout_s = dout + s * spatial_size_out; - - // process med - int left_loop = pad_left >> 2; - int right_loop = pad_right >> 2; - int med_loop = w_in >> 3; - int med_loop_remain = w_in & 7; - int left_loop_remain = pad_left & 3; - int right_loop_remain = pad_right & 3; - float* dout_med = dout_s + w * pad_top; - for (int j = 0; j < h_in; ++j) { - float edge_val = din_s[0]; - float32x4_t vedge = vdupq_n_f32(edge_val); - for (int i = 0; i < left_loop; ++i) { - vst1q_f32(dout_med, vedge); - dout_med += 4; - } - for (int i = 0; i < left_loop_remain; ++i) { - *dout_med++ = edge_val; - } - for (int i = 0; i < med_loop; ++i) { - float32x4_t val = vld1q_f32(din_s); - float32x4_t val1 = vld1q_f32(din_s + 4); - vst1q_f32(dout_med, val); - vst1q_f32(dout_med + 4, val1); - din_s += 8; - dout_med += 8; - } - for (int i = 0; i < med_loop_remain; ++i) { - *dout_med++ = *din_s++; - } - edge_val = din_s[-1]; - vedge = vdupq_n_f32(edge_val); - for (int i = 0; i < right_loop; ++i) { - vst1q_f32(dout_med, vedge); - dout_med += 4; - } - for (int i = 0; i < right_loop_remain; ++i) { - *dout_med++ = edge_val; - } - } - - // process bottom - float* dout_bottom = dout_med; - for (int i = 0; i < pad_bottom; ++i) { - memcpy(dout_bottom, dout_s + w * (pad_top + h_in - 1), w * sizeof(float)); - dout_bottom += w; - } - - // process top - float* dout_top = dout_s; - for (int i = 0; i < pad_top; ++i) { - memcpy(dout_top, dout_s + w * pad_top, w * sizeof(float)); - dout_top += w; - } - } -} - -void pad_reflect(const float* din, - float* dout, - int n, - int c, - int h, - int w, - const int pad_top, - const int pad_bottom, - const int pad_left, - const int pad_right, - const float pad_value) { - int h_in = h - pad_top - pad_bottom; - int w_in = w - pad_left - pad_right; - int spatial_size_out = w * h; - int spatial_size_in = h_in * w_in; -#pragma omp parallel for - for (int s = 0; s < n * c; ++s) { - const float* din_s = din + s * spatial_size_in; - float* dout_s = dout + s * spatial_size_out; - - // process med - int left_loop = pad_left >> 2; - int right_loop = pad_right >> 2; - int med_loop = w_in >> 3; - int med_loop_remain = w_in & 7; - int left_loop_remain = pad_left & 3; - int right_loop_remain = pad_right & 3; - float* dout_med = dout_s + w * pad_top; - for (int j = 0; j < h_in; ++j) { -#ifdef __aarch64__ - for (int i = 0; i < left_loop; ++i) { - float32x4_t val = vld1q_f32(din_s + left_loop_remain + - ((left_loop - i - 1) << 2) + 1); - val = vrev64q_f32(val); - float32x2_t low = vget_low_f32(val); - float32x2_t high = vget_high_f32(val); - float32x2_t tmp = low; - low = high; - high = tmp; - float32x4_t val1 = vcombine_f32(low, high); - vst1q_f32(dout_med, val1); - dout_med += 4; - } -#else - const float* din_s_ptr = - din_s + left_loop_remain + ((left_loop - 1) << 2) + 1; - int cnt = left_loop; - if (cnt > 0) { - asm volatile( - "1: \n" - "vld1.32 {d0-d1}, [%[din_s]] \n" - "subs %[cnt], #1 \n" - "sub %[din_s], #16 \n" - "vrev64.32 q1, q0 \n" - "vswp d2, d3 \n" - "vst1.32 {d2-d3}, [%[dout_med]]!\n" - "bne 1b \n" - : - [din_s] "+r"(din_s_ptr), [dout_med] "+r"(dout_med), [cnt] "+r"(cnt) - : - : "cc", "memory", "q0", "q1"); - } -#endif // __aarch64__ - for (int i = 0; i < left_loop_remain; ++i) { - *dout_med++ = *(din_s + left_loop_remain - i); - } - for (int i = 0; i < med_loop; ++i) { - float32x4_t val = vld1q_f32(din_s); - 
float32x4_t val1 = vld1q_f32(din_s + 4);
-        vst1q_f32(dout_med, val);
-        vst1q_f32(dout_med + 4, val1);
-        din_s += 8;
-        dout_med += 8;
-      }
-      for (int i = 0; i < med_loop_remain; ++i) {
-        *dout_med++ = *din_s++;
-      }
-#ifdef __aarch64__
-      for (int i = 0; i < right_loop; ++i) {
-        float32x4_t val = vld1q_f32(din_s - ((i + 1) << 2) - 1);
-        val = vrev64q_f32(val);
-        float32x2_t low = vget_low_f32(val);
-        float32x2_t high = vget_high_f32(val);
-        float32x2_t tmp = low;
-        low = high;
-        high = tmp;
-        float32x4_t val1 = vcombine_f32(low, high);
-        vst1q_f32(dout_med, val1);
-        dout_med += 4;
-      }
-#else
-      din_s_ptr = din_s - 5;
-      cnt = right_loop;
-      if (cnt > 0) {
-        asm volatile(
-            "1:                             \n"
-            "vld1.32 {d0-d1}, [%[din_s]]    \n"
-            "subs %[cnt], #1                \n"
-            "sub %[din_s], #16              \n"
-            "vrev64.32 q1, q0               \n"
-            "vswp d2, d3                    \n"
-            "vst1.32 {d2-d3}, [%[dout_med]]!\n"
-            "bne 1b                         \n"
-            : [din_s] "+r"(din_s_ptr), [dout_med] "+r"(dout_med), [cnt] "+r"(cnt)
-            :
-            : "cc", "memory", "q0", "q1");
-      }
-#endif  // __aarch64__
-      const float* remain = din_s - (right_loop << 2) - 2;
-      for (int i = 0; i < right_loop_remain; ++i) {
-        *dout_med++ = *remain--;
-      }
-    }
-
-    // process bottom
-    float* dout_bottom = dout_med;
-    float* dout_bottom_reflect = dout_med - (w << 1);
-    for (int i = 0; i < pad_bottom; ++i) {
-      memcpy(dout_bottom, dout_bottom_reflect, w * sizeof(float));
-      dout_bottom += w;
-      dout_bottom_reflect -= w;
-    }
-
-    // process top
-    float* dout_top = dout_s;
-    float* dout_top_reflect = dout_s + w * (pad_top << 1);
-    for (int i = 0; i < pad_top; ++i) {
-      memcpy(dout_top, dout_top_reflect, w * sizeof(float));
-      dout_top += w;
-      dout_top_reflect -= w;
-    }
-  }
-}
-
-// void pad2d_func(const lite::Tensor* input, lite::Tensor* output)
-void pad2d_func(const lite::Tensor* input,
-                lite::Tensor* output,
-                int _mode,
-                std::vector<int> _pad_h,
-                std::vector<int> _pad_w,
-                float _pad_value) {
-  float* dout = output->mutable_data<float>();  // modified by zhiqiang
-  const float* din = input->data<float>();      // modified by zhiqiang
-
-  auto output_dims = output->dims();
-  // nchw
-  int on = output_dims[0];
-  int oc = output_dims[1];
-  int oh = output_dims[2];
-  int ow = output_dims[3];
-  /* _mode is the PadMode:
-     typedef enum {
-       PAD_CONSTANT = 0,
-       PAD_EDGE = 1,
-       PAD_REFLECT = 2,
-     } PadMode; */
-  if (_mode == 0) {
-    pad_constant(din, dout, on, oc, oh, ow,
-                 _pad_h[0], _pad_h[1], _pad_w[0], _pad_w[1], _pad_value);
-  } else if (_mode == 1) {
-    pad_edge(din, dout, on, oc, oh, ow,
-             _pad_h[0], _pad_h[1], _pad_w[0], _pad_w[1], _pad_value);
-  } else if (_mode == 2) {
-    pad_reflect(din, dout, on, oc, oh, ow,
-                _pad_h[0], _pad_h[1], _pad_w[0], _pad_w[1], _pad_value);
-  } else {
-    LOG(ERROR) << "ERROR: unknown pad mode " << _mode;
-  }
-}
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/pad2d.h b/lite/backends/arm/math/pad2d.h
deleted file mode 100644
index 08c5c8c1a2..0000000000
--- a/lite/backends/arm/math/pad2d.h
+++ /dev/null
@@ -1,71 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include "lite/operators/op_params.h" -#include "lite/utils/cp_logging.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void pad_constant(const float* din, - float* dout, - int n, - int c, - int h, - int w, - const int pad_top, - const int pad_bottom, - const int pad_left, - const int pad_right, - const float pad_value); -void pad_edge(const float* din, - float* dout, - int n, - int c, - int h, - int w, - const int pad_top, - const int pad_bottom, - const int pad_left, - const int pad_right, - const float pad_value); -void pad_reflect(const float* din, - float* dout, - int n, - int c, - int h, - int w, - const int pad_top, - const int pad_bottom, - const int pad_left, - const int pad_right, - const float pad_value); -void pad2d_func(const lite::Tensor* input, - lite::Tensor* output, - int _mode, - std::vector _pad_h, - std::vector _pad_w, - float _pad_value); - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/pooling.cc b/lite/backends/arm/math/pooling.cc deleted file mode 100644 index 38078580c2..0000000000 --- a/lite/backends/arm/math/pooling.cc +++ /dev/null @@ -1,3173 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
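
pad2d.h above declares one entry point per border mode. The modes differ only in how an out-of-range source index is remapped, so a scalar sketch for a single row pins down the semantics (the mode numbering follows pad2d_func; the function itself is mine, and reflect assumes pad sizes smaller than the row width, as mirror padding requires):

    #include <algorithm>

    // mode: 0 = constant fill, 1 = edge (replicate), 2 = reflect (mirror,
    // edge element not repeated).
    void pad_row_ref(const float* in, float* out, int w_in,
                     int pad_left, int pad_right, int mode, float value) {
      int w_out = w_in + pad_left + pad_right;
      for (int x = 0; x < w_out; ++x) {
        int src = x - pad_left;
        if (src >= 0 && src < w_in) {
          out[x] = in[src];
          continue;
        }
        switch (mode) {
          case 0: out[x] = value; break;
          case 1: out[x] = in[std::min(std::max(src, 0), w_in - 1)]; break;
          case 2: out[x] = in[src < 0 ? -src : 2 * (w_in - 1) - src]; break;
        }
      }
    }
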
- -#include "lite/backends/arm/math/pooling.h" -#include -#include -#include "lite/backends/arm/math/funcs.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void pooling_basic(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - bool global_pooling, - bool exclusive, - bool adaptive, - bool ceil_mode, - bool use_quantizer, - const std::string& pooling_type) { - // no need to pad input tensor, border is zero pad inside this function - int kernel_h = ksize[0]; - int kernel_w = ksize[1]; - int stride_h = strides[0]; - int stride_w = strides[1]; - int pad_h = paddings[0]; - int pad_w = paddings[1]; - int size_channel_in = win * hin; - int size_channel_out = wout * hout; - if (global_pooling) { - if (pooling_type == "max") { // Pooling_max - for (int n = 0; n < num; ++n) { - float* dout_batch = dout + n * chout * size_channel_out; - const float* din_batch = din + n * chin * size_channel_in; -#pragma omp parallel for - for (int c = 0; c < chout; ++c) { - const float* din_ch = din_batch + c * size_channel_in; // in address - float tmp1 = din_ch[0]; - for (int i = 0; i < size_channel_in; ++i) { - float tmp2 = din_ch[i]; - tmp1 = tmp1 > tmp2 ? tmp1 : tmp2; - } - dout_batch[c] = tmp1; - } - } - } else if (pooling_type == "avg") { - // Pooling_average_include_padding - // Pooling_average_exclude_padding - for (int n = 0; n < num; ++n) { - float* dout_batch = dout + n * chout * size_channel_out; - const float* din_batch = din + n * chin * size_channel_in; -#pragma omp parallel for - for (int c = 0; c < chout; ++c) { - const float* din_ch = din_batch + c * size_channel_in; // in address - float sum = 0.f; - for (int i = 0; i < size_channel_in; ++i) { - sum += din_ch[i]; - } - dout_batch[c] = sum / size_channel_in; - } - } - } else { - LOG(FATAL) << "unsupported pooling type: " << pooling_type; - } - } else { - if (pooling_type == "max") { - // Pooling_max - for (int n = 0; n < num; ++n) { - float* dout_ch = dout + n * chout * size_channel_out; - const float* din_batch = din + n * chin * size_channel_in; -#pragma omp parallel for - for (int c = 0; c < chout; c++) { - float* dout_row = dout_ch + c * size_channel_out; - const float* din_ch = din_batch + c * size_channel_in; - for (int i = 0; i < hout; i++) { - for (int j = 0; j < wout; j++) { - int hstart = i * stride_h - pad_h; - int wstart = j * stride_w - pad_w; - int hend = std::min(hstart + kernel_h, hin + pad_h); - int wend = std::min(wstart + kernel_w, win + pad_w); - hstart = std::max(hstart, 0); - wstart = std::max(wstart, 0); - hend = std::min(hend, hin); - wend = std::min(wend, win); - int pool_size = (hend - hstart) * (wend - wstart); - if (pool_size == 0) continue; - float tmp1 = din_ch[hstart * win + wstart]; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - float tmp2 = din_ch[h * win + w]; - tmp1 = tmp1 > tmp2 ? 
tmp1 : tmp2; - } - } - dout_row[j] = tmp1; - } - dout_row += wout; - } - } - } - } else if (pooling_type == "avg") { - if (exclusive) { - // Pooling_average_exclude_padding - for (int n = 0; n < num; ++n) { - float* dout_ch = dout + n * chout * size_channel_out; - const float* din_batch = din + n * chin * size_channel_in; -#pragma omp parallel for - for (int c = 0; c < chout; c++) { - float* dout_row = dout_ch + c * size_channel_out; - const float* din_ch = din_batch + c * size_channel_in; - for (int i = 0; i < hout; i++) { - for (int j = 0; j < wout; j++) { - int hstart = i * stride_h - pad_h; - int wstart = j * stride_w - pad_w; - int hend = std::min(hstart + kernel_h, hin + pad_h); - int wend = std::min(wstart + kernel_w, win + pad_w); - hstart = std::max(hstart, 0); - wstart = std::max(wstart, 0); - hend = std::min(hend, hin); - wend = std::min(wend, win); - int pool_size = (hend - hstart) * (wend - wstart); - if (pool_size == 0) continue; - float sum = 0.f; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - sum += din_ch[h * win + w]; - } - } - dout_row[j] = sum / pool_size; - } - dout_row += wout; - } - } - } - } else { // Pooling_average_include_padding - for (int n = 0; n < num; ++n) { - float* dout_ch = dout + n * chout * size_channel_out; - const float* din_batch = din + n * chin * size_channel_in; -#pragma omp parallel for - for (int c = 0; c < chout; c++) { - float* dout_row = dout_ch + c * size_channel_out; - const float* din_ch = din_batch + c * size_channel_in; - for (int i = 0; i < hout; i++) { - for (int j = 0; j < wout; j++) { - int hstart = i * stride_h - pad_h; - int wstart = j * stride_w - pad_w; - int hend = std::min(hstart + kernel_h, hin + pad_h); - int wend = std::min(wstart + kernel_w, win + pad_w); - hstart = std::max(hstart, 0); - wstart = std::max(wstart, 0); - hend = std::min(hend, hin); - wend = std::min(wend, win); - int pool_size = (hend - hstart) * (wend - wstart); - if (pool_size == 0) continue; - float sum = 0.f; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - sum += din_ch[h * win + w]; - } - } - dout_row[j] = sum / (kernel_w * kernel_h); - } - dout_row += wout; - } - } - } - } - } else { - LOG(FATAL) << "unsupported pooling type: " << pooling_type; - } - } -} - -void pooling_global_max(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win) { - int size_channel_in = win * hin; - int cnt = size_channel_in / 8; - for (int n = 0; n < num; ++n) { - float* dout_batch = dout + n * chout; - const float* din_batch = din + n * chin * size_channel_in; -#pragma omp parallel for - for (int c = 0; c < chout; ++c) { - const float* din_ch = din_batch + c * size_channel_in; - int i = 0; - float minval = std::numeric_limits::lowest(); - float32x4_t vmax = vdupq_n_f32(minval); -#ifdef __aarch64__ - for (; i < cnt; i++) { - float32x4_t vdin1 = vld1q_f32(din_ch); - vmax = vmaxq_f32(vdin1, vmax); - float32x4_t vdin2 = vld1q_f32(din_ch + 4); - vmax = vmaxq_f32(vmax, vdin2); - din_ch += 8; - } -#else - int cnt_num = cnt; - if (cnt_num > 0) { - asm volatile( - "max_loop: @main loop\n" - "vld1.f32 {d0-d1}, [%[din_ch]]! @load q1,din_ch\n" - "vmax.f32 %q[vmax], %q[vmax], q0 @max vmax,vmax,din_ch\n" - "vld1.f32 {d2-d3}, [%[din_ch]]! 
@load 2nd 4 data\n" - "vmax.f32 %q[vmax], %q[vmax], q1 @compare 2nd 4 datas\n" - "subs %[cnt_num], #1 @cnt_num--\n" - "bne max_loop @bne cnt_num\n" - : [din_ch] "+r"(din_ch), [cnt_num] "+r"(cnt_num), [vmax] "+w"(vmax) - : - : "cc", "memory", "q0", "q1"); - } -#endif // __aarch64__ - float32x2_t vmax_tmp = vmax_f32(vget_low_f32(vmax), vget_high_f32(vmax)); - float tmp1 = vget_lane_f32(vmax_tmp, 0); - float tmp2 = vget_lane_f32(vmax_tmp, 1); - float max_tmp = tmp1 > tmp2 ? tmp1 : tmp2; - for (i = cnt * 8; i < size_channel_in; ++i) { - /* code */ - max_tmp = max_tmp > din_ch[0] ? max_tmp : din_ch[0]; - din_ch++; - } - dout_batch[c] = max_tmp; - } - } -} - -void pooling_global_avg(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win) { - int size_channel_in = win * hin; - int cnt = size_channel_in / 4; - for (int n = 0; n < num; ++n) { - float* dout_batch = dout + n * chout; - const float* din_batch = din + n * chin * size_channel_in; -#pragma omp parallel for - for (int c = 0; c < chout; c++) { - const float* din_ch = din_batch + c * size_channel_in; // in address - int i = 0; - float32x4_t vsum = vdupq_n_f32(0.0f); -#ifdef __aarch64__ - for (; i < cnt; i++) { - vsum = vaddq_f32(vld1q_f32(din_ch), vsum); - din_ch += 4; - } -#else - int cnt_num = cnt; - if (cnt_num > 0) { - asm volatile( - "add_loop: @main loop\n" - "vld1.f32 {d0-d1}, [%[din_ch]]! @load q1,din_ch\n" - "vadd.f32 %q[vsum], %q[vsum], q0 @add vmax,vmax, din_ch\n" - "subs %[cnt_num], #1 @cnt_num--\n" - "bne add_loop @bne num\n" - : [din_ch] "+r"(din_ch), [cnt_num] "+r"(cnt_num), [vsum] "+w"(vsum) - : - : "cc", "memory", "q0"); - } -#endif // __aarch64__ - float32x2_t vsum_tmp = vadd_f32(vget_low_f32(vsum), vget_high_f32(vsum)); - float sum = vget_lane_f32(vsum_tmp, 0) + vget_lane_f32(vsum_tmp, 1); - for (i = cnt * 4; i < size_channel_in; i++) { - sum += din_ch[0]; - din_ch++; - } - dout_batch[c] = sum / size_channel_in; - } - } -} - -void pooling2x2s2_max(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win) { - int kernel = 2; - int stride = 2; - int padding = 0; - int size_channel_out = wout * hout; - int size_channel_in = win * hin; - - int w_needed = (wout << 1); - int h_needed = (hout << 1); - int w_limit = w_needed > win ? win : w_needed; - int h_limit = h_needed > hin ? 
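
pooling_global_max above reduces a whole H x W plane per channel: a vector accumulator takes the running maximum of 8 floats per step, then a pairwise horizontal max plus a scalar loop finish the leftover elements. A compact intrinsics-only sketch of the same reduction (4-wide body for brevity; ARM-only, as it needs arm_neon.h):

    #include <arm_neon.h>
    #include <algorithm>
    #include <limits>

    // Maximum over one channel plane of `size` floats.
    float channel_max(const float* din, int size) {
      float32x4_t vmax = vdupq_n_f32(std::numeric_limits<float>::lowest());
      int i = 0;
      for (; i + 4 <= size; i += 4) {
        vmax = vmaxq_f32(vmax, vld1q_f32(din + i));   // vector body
      }
      float32x2_t v2 = vmax_f32(vget_low_f32(vmax), vget_high_f32(vmax));
      float m = std::max(vget_lane_f32(v2, 0), vget_lane_f32(v2, 1));
      for (; i < size; ++i) m = std::max(m, din[i]);  // scalar tail
      return m;
    }
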
hin : h_needed; - int w_even = (w_limit >> 1) << 1; - int h_even = (h_limit >> 1) << 1; - int w_unroll_size = (w_even >> 3) << 3; - // int w_unroll_remain = w_even - w_unroll_size; - int w_in_2 = win << 1; - for (int n = 0; n < num; ++n) { - float* dout_batch = dout + n * chout * size_channel_out; - const float* din_batch = din + n * chin * size_channel_in; -#pragma omp parallel for - for (int c = 0; c < chout; c++) { - float* dout_ch = dout_batch + c * size_channel_out; - const float* din_ch = din_batch + c * size_channel_in; - const float* r0 = din_ch; - const float* r1 = r0 + win; - int h = 0; - for (; h < h_even; h += 2) { - int w = 0; -#ifdef __aarch64__ - for (; w < w_unroll_size; w += 8) { - float32x4_t dr00 = vld1q_f32(&r0[w]); - float32x4_t dr01 = vld1q_f32(&r0[w + 4]); - float32x4_t dr10 = vld1q_f32(&r1[w]); - float32x4_t dr11 = vld1q_f32(&r1[w + 4]); - float32x4_t dmax1 = vmaxq_f32(dr00, dr10); - float32x4_t dmax2 = vmaxq_f32(dr01, dr11); -#ifdef __aarch64__ - float32x4_t dmax = vpmaxq_f32(dmax1, dmax2); -#else - float32x2_t dmaxl = - vpmax_f32(vget_low_f32(dmax1), vget_high_f32(dmax1)); - float32x2_t dmaxh = - vpmax_f32(vget_low_f32(dmax2), vget_high_f32(dmax2)); - float32x4_t dmax = vcombine_f32(dmaxl, dmaxh); -#endif - vst1q_f32(&dout_ch[w >> 1], dmax); - } -#else - float* dr_out = dout_ch; - const float* dr0 = r0; - const float* dr1 = r1; - int cnt_num = w_unroll_size >> 3; - if (cnt_num > 0) { - asm volatile( - "s2_max_loop: @main loop\n" - "vld1.f32 {d0-d3}, [%[dr0]]! @load q0,dr0\n" - "vld1.f32 {d4-d7}, [%[dr1]]! @load q1,dr1\n" - "vmax.f32 q0, q0, q2 @max q0,q0,q2\n" - "vmax.f32 q1, q1, q3 @max q1,q1,q2\n" - "vpmax.f32 d4, d0, d1 @max d4,d0,d1\n" - "vpmax.f32 d5, d2, d3 @max d5,d2,d3\n" - "vst1.f32 {d4-d5}, [%[dr_out]]! @vst1 q2,dr_out\n" - "subs %[cnt_num], #1 @cnt_num--\n" - "bne s2_max_loop @bne cnt_num\n" - : [dr0] "+r"(dr0), - [dr1] "+r"(dr1), - [dr_out] "+r"(dr_out), - [cnt_num] "+r"(cnt_num) - : - : "cc", "memory", "q0", "q1", "q2", "q3"); - } - w = w_unroll_size; -#endif // __aarch64__ - for (; w < w_even; w += 2) { - dout_ch[w >> 1] = - std::max(std::max(r0[w], r0[w + 1]), std::max(r1[w], r1[w + 1])); - } - for (; w < w_limit; ++w) { // run 0 or 1 time - dout_ch[w >> 1] = std::max(r0[w], r1[w]); - } - r0 += w_in_2; // << 1; - r1 += w_in_2; // << 1; - dout_ch += wout; - } - // process remain row (odd, last row) - for (; h < h_limit; h++) { // run 0 or 1 time - int w = 0; -#ifdef __aarch64__ - for (; w < w_unroll_size; w += 8) { - float32x4_t dr00 = vld1q_f32(&r0[w]); - float32x4_t dr01 = vld1q_f32(&r0[w + 4]); -#ifdef __aarch64__ - float32x4_t dmax = vpmaxq_f32(dr00, dr01); -#else - float32x2_t dmaxl = - vpmax_f32(vget_low_f32(dr00), vget_high_f32(dr00)); - float32x2_t dmaxh = - vpmax_f32(vget_low_f32(dr01), vget_high_f32(dr01)); - float32x4_t dmax = vcombine_f32(dmaxl, dmaxh); -#endif - vst1q_f32(&dout_ch[w >> 1], dmax); - } -#else - float* dr_out = dout_ch; - const float* dr0 = r0; - int cnt_num = w_unroll_size >> 3; - if (cnt_num > 0) { - asm volatile( - "s2_max_loop1: @main loop\n" - "vld1.f32 {d0-d3}, [%[dr0]]! @load q0,dr0\n" - "vpmax.f32 d4, d0, d1 @max d4,d0,d1\n" - "vpmax.f32 d5, d2, d3 @max d5,d2,d3\n" - "vst1.f32 {d4-d5}, [%[dr_out]]! 
@vst1 q2,dr_out\n" - "subs %[cnt_num], #1 @cnt_num--\n" - "bne s2_max_loop1 @bne cnt_num\n" - : [dr0] "+r"(dr0), [dr_out] "+r"(dr_out), [cnt_num] "+r"(cnt_num) - : - : "cc", "memory", "q0", "q1", "q2"); - } - w = w_unroll_size; -#endif // __aarch64__ - for (; w < w_even; w += 2) { - dout_ch[w >> 1] = std::max(r0[w], r0[w + 1]); - } - for (; w < w_limit; ++w) { // run 0 or 1 time - dout_ch[w >> 1] = r0[w]; - } - } - } - } -} - -void pooling2x2s2_avg(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - bool exclusive) { - int kernel = 2; - int stride = 2; - int padding = 0; - int size_channel_out = wout * hout; - int size_channel_in = win * hin; - - int w_needed = (wout << 1); - int h_needed = (hout << 1); - int w_limit = w_needed > win ? win : w_needed; - int h_limit = h_needed > hin ? hin : h_needed; - int w_even = (w_limit >> 1) << 1; - int h_even = (h_limit >> 1) << 1; - int w_unroll_size = (w_even >> 3) << 3; - // int w_unroll_remain = w_even - w_unroll_size; - int w_in_2 = win << 1; - const float coef = 1.f / 4.f; - const float coef_1 = exclusive ? 1.f : coef; - const float coef_2 = exclusive ? 1.f / 2.f : coef; - float32x4_t vcoef = vdupq_n_f32(coef); - float32x4_t vcoef_1 = vdupq_n_f32(coef_1); - float32x4_t vcoef_2 = vdupq_n_f32(coef_2); - for (int n = 0; n < num; ++n) { - float* dout_batch = dout + n * chout * size_channel_out; - const float* din_batch = din + n * chin * size_channel_in; -#pragma omp parallel for - for (int c = 0; c < chout; c++) { - float* dout_ch = dout_batch + c * size_channel_out; - const float* din_ch = din_batch + c * size_channel_in; - const float* r0 = din_ch; - const float* r1 = r0 + win; - int h = 0; - for (; h < h_even; h += 2) { - int w = 0; -#ifdef __aarch64__ - for (; w < w_unroll_size; w += 8) { - float32x4_t dr00 = vld1q_f32(&r0[w]); - float32x4_t dr01 = vld1q_f32(&r0[w + 4]); - float32x4_t dr10 = vld1q_f32(&r1[w]); - float32x4_t dr11 = vld1q_f32(&r1[w + 4]); - float32x4_t dsum1 = vaddq_f32(dr00, dr10); - float32x4_t dsum2 = vaddq_f32(dr01, dr11); -#ifdef __aarch64__ - float32x4_t dsum = vpaddq_f32(dsum1, dsum2); -#else - float32x2_t dsuml = - vpadd_f32(vget_low_f32(dsum1), vget_high_f32(dsum1)); - float32x2_t dsumh = - vpadd_f32(vget_low_f32(dsum2), vget_high_f32(dsum2)); - float32x4_t dsum = vcombine_f32(dsuml, dsumh); -#endif - float32x4_t res = vmulq_f32(dsum, vcoef); - vst1q_f32(&dout_ch[w >> 1], res); - } -#else - float* dr_out = dout_ch; - const float* dr0 = r0; - const float* dr1 = r1; - int cnt_num = w_unroll_size >> 3; - if (cnt_num > 0) { - asm volatile( - "1: @main loop\n" - "vld1.f32 {d0-d3}, [%[dr0]]! @load q0,dr0\n" - "vld1.f32 {d4-d7}, [%[dr1]]! @load q1,dr1\n" - "vadd.f32 q0, q0, q2 @add q0,q0,q2\n" - "vadd.f32 q1, q1, q3 @add q1,q1,q2\n" - "vpadd.f32 d4, d0, d1 @add d4,d0,d1\n" - "vpadd.f32 d5, d2, d3 @add d5,d2,d3\n" - "vmul.f32 q2, q2, %q[vcoef] @mul q2,q2,vcoef\n" - "vst1.f32 {d4-d5}, [%[dr_out]]! 
@vst1 q2,dr_out\n" - "subs %[cnt_num], #1 @cnt_num--\n" - "bne 1b @bne cnt_num\n" - : [dr0] "+r"(dr0), - [dr1] "+r"(dr1), - [dr_out] "+r"(dr_out), - [vcoef] "+w"(vcoef), - [cnt_num] "+r"(cnt_num) - : "r"(dr0), "r"(dr1), "r"(dr_out), "r"(cnt_num), "w"(vcoef) - : "cc", "memory", "q0", "q1", "q2", "q3"); - } - w = w_unroll_size; -#endif // __aarch64__ - for (; w < w_even; w += 2) { - dout_ch[w >> 1] = (r0[w] + r0[w + 1] + r1[w] + r1[w + 1]) * coef; - } - for (; w < w_limit; ++w) { // run 0 or 1 time - dout_ch[w >> 1] = (r0[w] + r1[w]) * coef_2; - } - r0 += w_in_2; // << 1; - r1 += w_in_2; // << 1; - dout_ch += wout; - } - // process remain row (odd, last row) - for (; h < h_limit; h++) { // run 0 or 1 time - int w = 0; -#ifdef __aarch64__ - for (; w < w_unroll_size; w += 8) { - float32x4_t dr00 = vld1q_f32(&r0[w]); - float32x4_t dr01 = vld1q_f32(&r0[w + 4]); -#ifdef __aarch64__ - float32x4_t dsum = vpaddq_f32(dr00, dr01); -#else - float32x2_t dsuml = - vpadd_f32(vget_low_f32(dr00), vget_high_f32(dr00)); - float32x2_t dsumh = - vpadd_f32(vget_low_f32(dr01), vget_high_f32(dr01)); - float32x4_t dsum = vcombine_f32(dsuml, dsumh); -#endif - float32x4_t res = vmulq_f32(dsum, vcoef_2); - vst1q_f32(&dout_ch[w >> 1], res); - } -#else - float* dr_out = dout_ch; - const float* dr0 = r0; - int cnt_num = w_unroll_size >> 3; - if (cnt_num > 0) { - asm volatile( - "1: @main loop\n" - "vld1.f32 {d0-d3}, [%[dr0]]! @load q0,dr0\n" - "vpadd.f32 d4, d0, d1 @add d4,d0,d1\n" - "vpadd.f32 d5, d2, d3 @add d5,d2,d3\n" - "vmul.f32 q2, q2, %q[vcoef_2] @mul q2,q2,vcoef_2\n" - "vst1.f32 {d4-d5}, [%[dr_out]]! @vst1 q2,dr_out\n" - "subs %[cnt_num], #1 @cnt_num--\n" - "bne 1b @bne cnt_num\n" - : [dr0] "+r"(dr0), - [dr_out] "+r"(dr_out), - [vcoef_2] "+w"(vcoef_2), - [cnt_num] "+r"(cnt_num) - : "r"(dr0), "r"(dr_out), "r"(cnt_num), "w"(vcoef_2) - : "cc", "memory", "q0", "q1", "q2"); - } - w = w_unroll_size; -#endif // __aarch64__ - for (; w < w_even; w += 2) { - dout_ch[w >> 1] = (r0[w] + r0[w + 1]) * coef_2; - } - for (; w < w_limit; ++w) { // run 0 or 1 time - dout_ch[w >> 1] = r0[w] * coef_1; - } - } - } - } -} - -void pooling3x3s1p1_max(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win) { - int kernel = 3; - int stride = 1; - int padding = 1; - int size_channel_out = wout * hout; - int size_channel_in = win * hin; - - int w_unroll_size = ((win - 2) >> 2) << 2; - int w_unroll_remain = win - 2 - w_unroll_size; - const float minval = std::numeric_limits::lowest(); - for (int n = 0; n < num; ++n) { - float* dout_batch = dout + n * chout * size_channel_out; - const float* din_batch = din + n * chin * size_channel_in; -#pragma omp parallel for - for (int c = 0; c < chout; c++) { - float* dout_ch = dout_batch + c * size_channel_out; - const float* din_ch = din_batch + c * size_channel_in; - const float* r0 = din_ch; - const float* r1 = r0 + win; - const float* r2 = r1 + win; - int cnt_num = w_unroll_size >> 2; // w_unroll_size / 4 - float* dr_out = dout_ch; - const float* dr0 = r0; - const float* dr1 = r1; - const float* dr2 = r2; - int w = 0; - int cnt = 1; - // left - dout_ch[0] = std::max(std::max(r0[0], r0[1]), std::max(r1[0], r1[1])); -// first row with zero pad -#ifdef __aarch64__ - for (; w < w_unroll_size; w += 4) { - float32x4_t vr0_1234 = vld1q_f32(&r0[w]); - float32x4_t vr1_1234 = vld1q_f32(&r1[w]); - float32x4_t vr0_5678 = vld1q_f32(&r0[w + 4]); - float32x4_t vr1_5678 = vld1q_f32(&r1[w + 4]); - float32x4_t vmax_1234 = vmaxq_f32(vr0_1234, vr1_1234); - 
float32x4_t vmax_5678 = vmaxq_f32(vr0_5678, vr1_5678); - - float32x4_t vmax_2345 = vextq_f32(vmax_1234, vmax_5678, 1); - float32x4_t vmax_3456 = vextq_f32(vmax_1234, vmax_5678, 2); - float32x2_t vmax_12_34 = - vpmax_f32(vget_low_f32(vmax_1234), vget_high_f32(vmax_1234)); - float32x2_t vmax_23_45 = - vpmax_f32(vget_low_f32(vmax_2345), vget_high_f32(vmax_2345)); - float32x2_t vmax_34_56 = - vpmax_f32(vget_low_f32(vmax_3456), vget_high_f32(vmax_3456)); - float32x2_t vmax_123_345 = vmax_f32(vmax_12_34, vmax_23_45); - float32x2_t vmax_234_456 = vmax_f32(vmax_23_45, vmax_34_56); - float32x4_t vmax = vdupq_n_f32(vget_lane_f32(vmax_123_345, 0)); - vmax = vsetq_lane_f32(vget_lane_f32(vmax_234_456, 0), vmax, 1); - vmax = vsetq_lane_f32(vget_lane_f32(vmax_123_345, 1), vmax, 2); - vmax = vsetq_lane_f32(vget_lane_f32(vmax_234_456, 1), vmax, 3); - vst1q_f32(&dout_ch[cnt], vmax); - cnt += 4; - } - -#else - dr_out = dr_out + 1; - if (cnt_num > 0) { - asm volatile( - "1: @main loop\n" - "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d5,dr0\n" - "vld1.f32 {d4-d5}, [%[dr1]]! @load d4-d7,dr1\n" - "vld1.f32 {d2}, [%[dr0]]! @load d0-d5,dr0\n" - "vld1.f32 {d6}, [%[dr1]]! @load d4-d7,dr1\n" - "vmax.f32 q5, q0, q2 @max r0_1234,r1_1234\n" - "vmax.f32 d12, d2, d6 @max r0_5678,r1_5678\n" - //"vmov.f32 s7,s6 @mov s7,s6\n" - "vext.f32 q0, q5, q6, #1 @vext max_2345\n" - "vext.f32 q2, q5, q6, #2 @vext max_3456\n" - "vpmax.f32 d2, d10, d11 @pmax d4,max_1234,max_1234\n" - "vpmax.f32 d3, d0, d1 @pmax d4,max_2345,max_2345\n" - "vpmax.f32 d6, d4, d5 @pmax d6,max_3456,max_3456\n" - "vmax.f32 d8, d2, d3 @max d2,vmax_12_34,vmax_23_45\n" - "vmax.f32 d9, d3, d6 @max d2,vmax_23_45,vmax_34_56\n" - "sub %[dr0], #8 @sub w,8\n" - "sub %[dr1], #8 @sub w,8\n" - // swap - "vmov.f32 s0, s17 @mov\n" - "vmov.f32 s17, s18 @mov\n" - "vmov.f32 s18, s0 @mov\n" - "subs %[cnt_num], #1 @subs cnt_num,#1\n" - "vst1.f32 d8, [%[dr_out]]! @vst1 d0,dr_out\n" - "vst1.f32 d9, [%[dr_out]]! 
@vst1 d0,dr_out\n" - "bne 1b @bne s1_max_loop\n" - : [dr0] "+r"(dr0), - [dr1] "+r"(dr1), - [dr_out] "+r"(dr_out), - [cnt_num] "+r"(cnt_num) - : "r"(dr0), "r"(dr1), "r"(dr_out), "r"(cnt_num) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6"); - } - -#endif - // remain - w = w_unroll_size; - for (int j = 0; j < w_unroll_remain; j++) { - float tmp_max = std::max(r0[j + w], r1[j + w]); - tmp_max = std::max(tmp_max, std::max(r0[j + w + 1], r1[j + w + 1])); - tmp_max = std::max(tmp_max, std::max(r0[j + w + 2], r1[j + w + 2])); - dout_ch[j + w + 1] = tmp_max; - } - // right - float tmp = std::max(r0[win - 2], r1[win - 2]); - tmp = std::max(tmp, std::max(r0[win - 1], r1[win - 1])); - dout_ch[wout - 1] = tmp; - - // r0 = r1; - // r1 = r0 + w_in; - // r2 = r1 + w_in; - dout_ch += wout; - int h = 0; - for (; h < hin - 2; h += 1) { - // deal with left pad - float maxr0 = std::max(r0[0], r0[1]); - float maxr1 = std::max(r1[0], r1[1]); - float maxr2 = std::max(r2[0], r2[1]); - dout_ch[0] = std::max(std::max(maxr0, maxr1), maxr2); -#ifdef __aarch64__ - w = 0; - cnt = 1; - for (; w < w_unroll_size; w += 4) { - float32x4_t vr0_1234 = vld1q_f32(&r0[w]); - float32x4_t vr1_1234 = vld1q_f32(&r1[w]); - float32x4_t vr2_1234 = vld1q_f32(&r2[w]); - float32x4_t vr0_5678 = vld1q_f32(&r0[w + 4]); - float32x4_t vr1_5678 = vld1q_f32(&r1[w + 4]); - float32x4_t vr2_5678 = vld1q_f32(&r2[w + 4]); - float32x4_t vmax_1234 = vmaxq_f32(vr0_1234, vr1_1234); - vmax_1234 = vmaxq_f32(vmax_1234, vr2_1234); - float32x4_t vmax_5678 = vmaxq_f32(vr0_5678, vr1_5678); - vmax_5678 = vmaxq_f32(vmax_5678, vr2_5678); - - float32x4_t vmax_2345 = vextq_f32(vmax_1234, vmax_5678, 1); - float32x4_t vmax_3456 = vextq_f32(vmax_1234, vmax_5678, 2); - float32x2_t vmax_12_34 = - vpmax_f32(vget_low_f32(vmax_1234), vget_high_f32(vmax_1234)); - float32x2_t vmax_23_45 = - vpmax_f32(vget_low_f32(vmax_2345), vget_high_f32(vmax_2345)); - float32x2_t vmax_34_56 = - vpmax_f32(vget_low_f32(vmax_3456), vget_high_f32(vmax_3456)); - float32x2_t vmax_123_345 = vmax_f32(vmax_12_34, vmax_23_45); - float32x2_t vmax_234_456 = vmax_f32(vmax_23_45, vmax_34_56); - float32x4_t vmax = vdupq_n_f32(vget_lane_f32(vmax_123_345, 0)); - vmax = vsetq_lane_f32(vget_lane_f32(vmax_234_456, 0), vmax, 1); - vmax = vsetq_lane_f32(vget_lane_f32(vmax_123_345, 1), vmax, 2); - vmax = vsetq_lane_f32(vget_lane_f32(vmax_234_456, 1), vmax, 3); - vst1q_f32(&dout_ch[cnt], vmax); - cnt += 4; - } -#else - dr_out = dout_ch + 1; - dr0 = r0; - dr1 = r1; - dr2 = r2; - cnt_num = w_unroll_size >> 2; - if (cnt_num > 0) { - asm volatile( - "1: @main loop\n" - "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d5,dr0\n" - "vld1.f32 {d4-d5}, [%[dr1]]! @load d4-d7,dr1\n" - "vld1.f32 {d8-d9}, [%[dr2]]! @load d4-d7,dr1\n" - "vld1.f32 {d2}, [%[dr0]]! @load d0-d5,dr0\n" - "vld1.f32 {d6}, [%[dr1]]! @load d4-d7,dr1\n" - "vld1.f32 {d10}, [%[dr2]]! 
@load d4-d7, dr1\n" - "vmax.f32 q7, q0, q2 @max r0_1234,r1_1234\n" - "vmax.f32 d16, d2, d6 @max r0_5678,r1_5678\n" - "vmax.f32 q3, q7, q4 @max r0_1234,r1_1234\n" - "vmax.f32 d12, d16, d10 @max r0_5678,r1_5678\n" - //"vmov.f32 s7,s6 @mov s7,s6\n" - "vext.f32 q0, q3, q6, #1 @vext max_2345\n" - "vext.f32 q2, q3, q6, #2 @vext max_3456\n" - "vpmax.f32 d2, d6, d7 @pmax d4,max_1234,max_1234\n" - "vpmax.f32 d3, d0, d1 @pmax d4,max_2345,max_2345\n" - "vpmax.f32 d6, d4, d5 @pmax d6,max_3456,max_3456\n" - "vmax.f32 d8, d2, d3 @max d2,vmax_12_34,vmax_23_45\n" - "vmax.f32 d9, d3, d6 @max d2,vmax_23_45,vmax_34_56\n" - "sub %[dr0], #8 @sub w,8\n" - "sub %[dr1], #8 @sub w,8\n" - "sub %[dr2], #8 @sub w,8\n" - // swap - "vmov.f32 s0, s17 @mov\n" - "vmov.f32 s17, s18 @mov\n" - "vmov.f32 s18, s0 @mov\n" - "subs %[cnt_num], #1 @subs cnt_num,#1\n" - "vst1.f32 d8, [%[dr_out]]! @vst1 d0,dr_out\n" - "vst1.f32 d9, [%[dr_out]]! @vst1 d0,dr_out\n" - "bne 1b @bne s1_max_loop\n" - : [dr0] "+r"(dr0), - [dr1] "+r"(dr1), - [dr2] "+r"(dr2), - [dr_out] "+r"(dr_out), - [cnt_num] "+r"(cnt_num) - : "r"(dr0), "r"(dr1), "r"(dr_out), "r"(cnt_num) - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8"); - } -#endif - // remain - w = w_unroll_size; - for (int j = 0; j < w_unroll_remain; j++) { - float tmp_max = std::max(r0[j + w], r1[j + w]); - tmp_max = std::max(tmp_max, std::max(r0[j + w + 1], r1[j + w + 1])); - tmp_max = std::max(tmp_max, std::max(r0[j + w + 2], r1[j + w + 2])); - tmp_max = std::max(tmp_max, std::max(r2[j + w], r2[j + w + 1])); - tmp_max = std::max(tmp_max, r2[j + w + 2]); - dout_ch[j + w + 1] = tmp_max; - } - // right - tmp = std::max(r0[win - 2], r1[win - 2]); - tmp = std::max(tmp, std::max(r0[win - 1], r1[win - 1])); - tmp = std::max(tmp, std::max(r2[win - 2], r2[win - 1])); - dout_ch[wout - 1] = tmp; - - r0 = r1; - r1 = r2; - r2 = r1 + win; - dout_ch += wout; - } - - // the last two line - float maxr0 = std::max(r0[0], r0[1]); - float maxr1 = std::max(r1[0], r1[1]); - dout_ch[0] = std::max(maxr0, maxr1); -#ifdef __aarch64__ - w = 0; - cnt = 1; - for (; w < w_unroll_size; w += 4) { - float32x4_t vr0_1234 = vld1q_f32(&r0[w]); - float32x4_t vr1_1234 = vld1q_f32(&r1[w]); - float32x4_t vr0_5678 = vld1q_f32(&r0[w + 4]); - float32x4_t vr1_5678 = vld1q_f32(&r1[w + 4]); - float32x4_t vmax_1234 = vmaxq_f32(vr0_1234, vr1_1234); - float32x4_t vmax_5678 = vmaxq_f32(vr0_5678, vr1_5678); - - float32x4_t vmax_2345 = vextq_f32(vmax_1234, vmax_5678, 1); - float32x4_t vmax_3456 = vextq_f32(vmax_1234, vmax_5678, 2); - float32x2_t vmax_12_34 = - vpmax_f32(vget_low_f32(vmax_1234), vget_high_f32(vmax_1234)); - float32x2_t vmax_23_45 = - vpmax_f32(vget_low_f32(vmax_2345), vget_high_f32(vmax_2345)); - float32x2_t vmax_34_56 = - vpmax_f32(vget_low_f32(vmax_3456), vget_high_f32(vmax_3456)); - float32x2_t vmax_123_345 = vmax_f32(vmax_12_34, vmax_23_45); - float32x2_t vmax_234_456 = vmax_f32(vmax_23_45, vmax_34_56); - float32x4_t vmax = vdupq_n_f32(vget_lane_f32(vmax_123_345, 0)); - vmax = vsetq_lane_f32(vget_lane_f32(vmax_234_456, 0), vmax, 1); - vmax = vsetq_lane_f32(vget_lane_f32(vmax_123_345, 1), vmax, 2); - vmax = vsetq_lane_f32(vget_lane_f32(vmax_234_456, 1), vmax, 3); - vst1q_f32(&dout_ch[cnt], vmax); - cnt += 4; - } -#else - dr_out = dout_ch + 1; - dr0 = r0; - dr1 = r1; - cnt_num = w_unroll_size >> 2; - if (cnt_num > 0) { - asm volatile( - "1: @main loop\n" - "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d5,dr0\n" - "vld1.f32 {d4-d5}, [%[dr1]]! @load d4-d7,dr1\n" - "vld1.f32 {d2}, [%[dr0]]! 
@load d0-d5,dr0\n" - "vld1.f32 {d6}, [%[dr1]]! @load d4-d7,dr1\n" - "vmax.f32 q5, q0, q2 @max r0_1234,r1_1234\n" - "vmax.f32 d12, d2, d6 @max r0_5678,r1_5678\n" - //"vmov.f32 s7,s6 @mov s7,s6\n" - "vext.f32 q0, q5, q6, #1 @vext max_2345\n" - "vext.f32 q2, q5, q6, #2 @vext max_3456\n" - "vpmax.f32 d2, d10, d11 @pmax d4,max_1234,max_1234\n" - "vpmax.f32 d3, d0, d1 @pmax d4,max_2345,max_2345\n" - "vpmax.f32 d6, d4, d5 @pmax d6,max_3456,max_3456\n" - "vmax.f32 d8, d2, d3 @max d2,vmax_12_34,vmax_23_45\n" - "vmax.f32 d9, d3, d6 @max d2,vmax_23_45,vmax_34_56\n" - "sub %[dr0], #8 @sub w,8\n" - "sub %[dr1], #8 @sub w,8\n" - // swap - "vmov.f32 s0, s17 @mov\n" - "vmov.f32 s17, s18 @mov\n" - "vmov.f32 s18, s0 @mov\n" - "subs %[cnt_num], #1 @subs cnt_num,#1\n" - "vst1.f32 d8, [%[dr_out]]! @vst1 d0,dr_out\n" - "vst1.f32 d9, [%[dr_out]]! @vst1 d0,dr_out\n" - "bne 1b @bne s1_max_loop\n" - : [dr0] "+r"(dr0), - [dr1] "+r"(dr1), - [dr_out] "+r"(dr_out), - [cnt_num] "+r"(cnt_num) - : "r"(dr0), "r"(dr1), "r"(dr_out), "r"(cnt_num) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6"); - } -#endif - // remian - w = w_unroll_size; - for (int j = 0; j < w_unroll_remain; j++) { - float tmp_max = std::max(r0[j + w], r1[j + w]); - tmp_max = std::max(tmp_max, std::max(r0[j + w + 1], r1[j + w + 1])); - tmp_max = std::max(tmp_max, std::max(r0[j + w + 2], r1[j + w + 2])); - dout_ch[j + w + 1] = tmp_max; - } - tmp = std::max(r0[win - 2], r1[win - 2]); - tmp = std::max(tmp, std::max(r0[win - 1], r1[win - 1])); - dout_ch[wout - 1] = tmp; - } - } -} - -void pooling3x3s1p1_avg(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - bool exclusive) { - int kernel = 3; - int stride = 1; - int padding = 1; - int size_channel_out = wout * hout; - int size_channel_in = win * hin; - - int w_unroll_size = ((win - 2) >> 2) << 2; - int w_unroll_remain = win - 2 - w_unroll_size; - const float coef = 1.f / 9.f; - const float coef_2 = exclusive ? 1.f / 2.f : coef; - const float coef_4 = exclusive ? 1.f / 4.f : coef; - const float coef_6 = exclusive ? 
1.f / 6.f : coef; - float32x4_t vcoef = vdupq_n_f32(coef); - float32x4_t vcoef_2 = vdupq_n_f32(coef_2); - float32x4_t vcoef_4 = vdupq_n_f32(coef_4); - float32x4_t vcoef_6 = vdupq_n_f32(coef_6); - for (int n = 0; n < num; ++n) { - float* dout_batch = dout + n * chout * size_channel_out; - const float* din_batch = din + n * chin * size_channel_in; -#pragma omp parallel for - for (int c = 0; c < chout; c++) { - float* dout_ch = dout_batch + c * size_channel_out; - const float* din_ch = din_batch + c * size_channel_in; - const float* r0 = din_ch; - const float* r1 = r0 + win; - const float* r2 = r1 + win; - int cnt_num = w_unroll_size >> 2; // w_unroll_size / 4 - float* dr_out = dout_ch; - const float* dr0 = r0; - const float* dr1 = r1; - const float* dr2 = r2; - int w = 0; - int cnt = 1; - // left - dout_ch[0] = (r0[0] + r0[1] + r1[0] + r1[1]) * coef_4; -// first row with zero pad -#ifdef __aarch64__ - for (; w < w_unroll_size; w += 4) { - float32x4_t vr0_1234 = vld1q_f32(&r0[w]); - float32x4_t vr1_1234 = vld1q_f32(&r1[w]); - float32x4_t vr0_5678 = vld1q_f32(&r0[w + 4]); - float32x4_t vr1_5678 = vld1q_f32(&r1[w + 4]); - float32x4_t vsum_1234 = vaddq_f32(vr0_1234, vr1_1234); - float32x4_t vsum_5678 = vaddq_f32(vr0_5678, vr1_5678); - - float32x4_t vsum_2345 = vextq_f32(vsum_1234, vsum_5678, 1); - float32x4_t vsum_3456 = vextq_f32(vsum_1234, vsum_5678, 2); - float32x4_t vsum = vaddq_f32(vsum_1234, vsum_2345); - vsum = vaddq_f32(vsum, vsum_3456); - vsum = vmulq_f32(vsum, vcoef_6); - vst1q_f32(&dout_ch[cnt], vsum); - cnt += 4; - } -#else - dr_out = dr_out + 1; - if (cnt_num > 0) { - asm volatile( - "1: @main loop\n" - "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d5,dr0\n" - "vld1.f32 {d4-d5}, [%[dr1]]! @load d4-d7,dr1\n" - "vld1.f32 {d2}, [%[dr0]]! @load d0-d5,dr0\n" - "vld1.f32 {d6}, [%[dr1]]! @load d4-d7,dr1\n" - "vadd.f32 q5, q0, q2 @max r0_1234,r1_1234\n" - "vadd.f32 d12, d2, d6 @max r0_5678,r1_5678\n" - //"vmov.f32 s7,s6 @mov s7,s6\n" - "vext.f32 q0, q5, q6, #1 @vext max_2345\n" - "vext.f32 q2, q5, q6, #2 @vext max_3456\n" - "vadd.f32 q1, q5, q0 @add 1234+2345\n" - "vadd.f32 q1, q1, q2 @add + 3456\n" - "vmul.f32 q4, q1, %q[vcoef_6] @mul * 1/9.f\n" - "sub %[dr0], #8 @sub w,8\n" - "sub %[dr1], #8 @sub w,8\n" - "subs %[cnt_num], #1 @subs cnt_num,#1\n" - "vst1.f32 d8, [%[dr_out]]! @vst1 d0,dr_out\n" - "vst1.f32 d9, [%[dr_out]]! 
@vst1 d0,dr_out\n" - "bne 1b @bne s1_max_loop\n" - : [dr0] "+r"(dr0), - [dr1] "+r"(dr1), - [dr_out] "+r"(dr_out), - [cnt_num] "+r"(cnt_num), - [vcoef_6] "+w"(vcoef_6) - : "r"(dr0), "r"(dr1), "r"(dr_out), "r"(cnt_num) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6"); - } - -#endif - // remain - w = w_unroll_size; - for (int j = 0; j < w_unroll_remain; j++) { - float tmp_sum = r0[j + w] + r1[j + w]; - tmp_sum += (r0[j + w + 1] + r1[j + w + 1]); - tmp_sum += (r0[j + w + 2] + r1[j + w + 2]); - dout_ch[j + w + 1] = tmp_sum * coef_6; - } - // right - float tmp = r0[win - 2] + r1[win - 2]; - tmp += (r0[win - 1] + r1[win - 1]); - dout_ch[wout - 1] = tmp * coef_4; - - // r0 = r1; - // r1 = r0 + w_in; - // r2 = r1 + w_in; - dout_ch += wout; - int h = 0; - for (; h < hin - 2; h += 1) { - // deal with left pad - float maxr0 = r0[0] + r0[1]; - float maxr1 = r1[0] + r1[1]; - float maxr2 = r2[0] + r2[1]; - dout_ch[0] = (maxr0 + maxr1 + maxr2) * coef_6; -#ifdef __aarch64__ - w = 0; - cnt = 1; - for (; w < w_unroll_size; w += 4) { - float32x4_t vr0_1234 = vld1q_f32(&r0[w]); - float32x4_t vr1_1234 = vld1q_f32(&r1[w]); - float32x4_t vr2_1234 = vld1q_f32(&r2[w]); - float32x4_t vr0_5678 = vld1q_f32(&r0[w + 4]); - float32x4_t vr1_5678 = vld1q_f32(&r1[w + 4]); - float32x4_t vr2_5678 = vld1q_f32(&r2[w + 4]); - float32x4_t vsum_1234 = vaddq_f32(vr0_1234, vr1_1234); - vsum_1234 = vaddq_f32(vsum_1234, vr2_1234); - float32x4_t vsum_5678 = vaddq_f32(vr0_5678, vr1_5678); - vsum_5678 = vaddq_f32(vsum_5678, vr2_5678); - - float32x4_t vsum_2345 = vextq_f32(vsum_1234, vsum_5678, 1); - float32x4_t vsum_3456 = vextq_f32(vsum_1234, vsum_5678, 2); - float32x4_t vsum = vaddq_f32(vsum_1234, vsum_2345); - vsum = vaddq_f32(vsum, vsum_3456); - vsum = vmulq_f32(vsum, vcoef); - vst1q_f32(&dout_ch[cnt], vsum); - cnt += 4; - } -#else - dr_out = dout_ch + 1; - dr0 = r0; - dr1 = r1; - dr2 = r2; - cnt_num = w_unroll_size >> 2; - if (cnt_num > 0) { - asm volatile( - "1: @main loop\n" - "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d5,dr0\n" - "vld1.f32 {d4-d5}, [%[dr1]]! @load d4-d7,dr1\n" - "vld1.f32 {d8-d9}, [%[dr2]]! @load d4-d7,dr1\n" - "vld1.f32 {d2}, [%[dr0]]! @load d0-d5,dr0\n" - "vld1.f32 {d6}, [%[dr1]]! @load d4-d7,dr1\n" - "vld1.f32 {d10}, [%[dr2]]! @load d4-d7,dr1\n" - "vadd.f32 q7, q0, q2 @max r0_1234,r1_1234\n" - "vadd.f32 d16, d2, d6 @max r0_5678,r1_5678\n" - "vadd.f32 q3, q7, q4 @max r0_1234,r1_1234\n" - "vadd.f32 d12, d16, d10 @max r0_5678,r1_5678\n" - //"vmov.f32 s7,s6 @mov s7,s6\n" - "vext.f32 q0, q3, q6, #1 @vext max_2345\n" - "vext.f32 q2, q3, q6, #2 @vext max_3456\n" - "vadd.f32 q1, q3, q0 @add 1234+2345\n" - "vadd.f32 q1, q1, q2 @add+3456\n" - "vmul.f32 q4, q1, %q[vcoef] @mul*1/9.f\n" - "sub %[dr0], #8 @sub w,8\n" - "sub %[dr1], #8 @sub w,8\n" - "sub %[dr2], #8 @sub w,8\n" - "subs %[cnt_num], #1 @subs cnt_num,#1\n" - "vst1.f32 d8, [%[dr_out]]! @vst1 d0,dr_out\n" - "vst1.f32 d9, [%[dr_out]]! 
@vst1 d0,dr_out\n" - "bne 1b @bne s1_max_loop\n" - : [dr0] "+r"(dr0), - [dr1] "+r"(dr1), - [dr2] "+r"(dr2), - [dr_out] "+r"(dr_out), - [cnt_num] "+r"(cnt_num), - [vcoef] "+w"(vcoef) - : "r"(dr0), "r"(dr1), "r"(dr_out), "r"(cnt_num) - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8"); - } -#endif - // remain - w = w_unroll_size; - for (int j = 0; j < w_unroll_remain; j++) { - float tmp_sum = r0[j + w] + r1[j + w]; - tmp_sum += (r0[j + w + 1] + r1[j + w + 1]); - tmp_sum += (r0[j + w + 2] + r1[j + w + 2]); - tmp_sum += (r2[j + w + 1] + r2[j + w + 2]); - tmp_sum += r2[j + w]; - dout_ch[j + w + 1] = tmp_sum * coef; - } - // right - tmp = r0[win - 2] + r1[win - 2]; - tmp += (r0[win - 1] + r1[win - 1]); - tmp += (r2[win - 2] + r2[win - 1]); - dout_ch[wout - 1] = tmp * coef_6; - - r0 = r1; - r1 = r2; - r2 = r1 + win; - dout_ch += wout; - } - - // last line - float maxr0 = (r0[0] + r0[1]); - float maxr1 = (r1[0] + r1[1]); - dout_ch[0] = (maxr0 + maxr1) * coef_4; -#ifdef __aarch64__ - w = 0; - cnt = 1; - for (; w < w_unroll_size; w += 4) { - float32x4_t vr0_1234 = vld1q_f32(&r0[w]); - float32x4_t vr1_1234 = vld1q_f32(&r1[w]); - float32x4_t vr0_5678 = vld1q_f32(&r0[w + 4]); - float32x4_t vr1_5678 = vld1q_f32(&r1[w + 4]); - float32x4_t vsum_1234 = vaddq_f32(vr0_1234, vr1_1234); - float32x4_t vsum_5678 = vaddq_f32(vr0_5678, vr1_5678); - - float32x4_t vsum_2345 = vextq_f32(vsum_1234, vsum_5678, 1); - float32x4_t vsum_3456 = vextq_f32(vsum_1234, vsum_5678, 2); - float32x4_t vsum = vaddq_f32(vsum_1234, vsum_2345); - vsum = vaddq_f32(vsum, vsum_3456); - vsum = vmulq_f32(vsum, vcoef_6); - vst1q_f32(&dout_ch[cnt], vsum); - cnt += 4; - } -#else - dr_out = dout_ch + 1; - dr0 = r0; - dr1 = r1; - cnt_num = w_unroll_size >> 2; - if (cnt_num > 0) { - asm volatile( - "1: @main loop\n" - "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d5,dr0\n" - "vld1.f32 {d4-d5}, [%[dr1]]! @load d4-d7,dr1\n" - "vld1.f32 {d2}, [%[dr0]]! @load d0-d5,dr0\n" - "vld1.f32 {d6}, [%[dr1]]! @load d4-d7,dr1\n" - "vadd.f32 q5, q0, q2 @max r0_1234,r1_1234\n" - "vadd.f32 d12, d2, d6 @max r0_5678,r1_5678\n" - //"vmov.f32 s7,s6 @mov s7,s6\n" - "vext.f32 q0, q5, q6, #1 @vext max_2345\n" - "vext.f32 q2, q5, q6, #2 @vext max_3456\n" - "vadd.f32 q1, q5, q0 @add 1234+2345\n" - "vadd.f32 q1, q1, q2 @add + 3456\n" - "vmul.f32 q4, q1, %q[vcoef_6] @mul * 1/9.f\n" - "sub %[dr0], #8 @sub w,8\n" - "sub %[dr1], #8 @sub w,8\n" - "subs %[cnt_num], #1 @subs cnt_num,#1\n" - "vst1.f32 d8, [%[dr_out]]! @vst1 d0,dr_out\n" - "vst1.f32 d9, [%[dr_out]]! 
@vst1 d0,dr_out\n" - "bne 1b @bne s1_max_loop\n" - : [dr0] "+r"(dr0), - [dr1] "+r"(dr1), - [dr_out] "+r"(dr_out), - [cnt_num] "+r"(cnt_num), - [vcoef_6] "+w"(vcoef_6) - : "r"(dr0), "r"(dr1), "r"(dr_out), "r"(cnt_num) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6"); - } -#endif - // remain - w = w_unroll_size; - for (int j = 0; j < w_unroll_remain; j++) { - float tmp_sum = r0[j + w] + r1[j + w]; - tmp_sum += (r0[j + w + 1] + r1[j + w + 1]); - tmp_sum += (r0[j + w + 2] + r1[j + w + 2]); - dout_ch[j + w + 1] = tmp_sum * coef_6; - } - // right - tmp = r0[win - 2] + r1[win - 2]; - tmp += (r0[win - 1] + r1[win - 1]); - dout_ch[wout - 1] = tmp * coef_4; - } - } -} - -void pooling3x3s2p1_max(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win) { - int kernel = 3; - int stride = 2; - int padding = 1; - int size_channel_out = wout * hout; - int size_channel_in = win * hin; - - int w_needed = (wout << 1) + 1; - int h_needed = (hout << 1) + 1; - int w_limit = w_needed > win ? win : w_needed; - int h_limit = h_needed > hin ? hin : h_needed; - int w_even = (w_limit >> 1) << 1; - int h_even = (h_limit >> 1) << 1; - int w_unroll_size = ((w_even - 1) >> 3) << 3; - int w_unroll_remain = w_even - 1 - w_unroll_size; - int w_remain = w_needed - w_limit - padding; - int h_remain = h_needed - h_limit - padding; - int w_in_2 = win << 1; - float minval = std::numeric_limits::lowest(); - for (int n = 0; n < num; ++n) { - float* dout_batch = dout + n * chout * size_channel_out; - const float* din_batch = din + n * chin * size_channel_in; -#pragma omp parallel for - for (int c = 0; c < chout; c++) { - float* dout_ch = dout_batch + c * size_channel_out; - const float* din_ch = din_batch + c * size_channel_in; - const float* r0 = din_ch; - const float* r1 = r0 + win; - const float* r2 = r1 + win; - int cnt_num = w_unroll_size >> 3; - int cnt_num_remain = w_unroll_remain >> 1; - float* dr_out = dout_ch; - const float* dr0 = r0; - const float* dr1 = r1; - const float* dr2 = r2; - int w = 1; - int cnt = 1; - dout_ch[0] = std::max(std::max(r0[0], r0[1]), std::max(r1[0], r1[1])); -// first row with zero pad -#if __aarch64__ - for (; w < w_unroll_size; w += 8) { - float32x4_t vr0_1234 = vld1q_f32(&r0[w]); - float32x4_t vr0_5678 = vld1q_f32(&r0[w + 4]); - float32x4_t vr0_9101112 = vld1q_f32(&r0[w + 8]); - float32x4_t vr1_1234 = vld1q_f32(&r1[w]); - float32x4_t vr1_5678 = vld1q_f32(&r1[w + 4]); - float32x4_t vr1_9101112 = vld1q_f32(&r1[w + 8]); - float32x4_t vmax_1234 = vmaxq_f32(vr0_1234, vr1_1234); - float32x4_t vmax_5678 = vmaxq_f32(vr0_5678, vr1_5678); - float32x4_t vmax_9101112 = vmaxq_f32(vr0_9101112, vr1_9101112); - float32x4_t vmax_2345 = vextq_f32(vmax_1234, vmax_5678, 1); - float32x4_t vmax_6789 = vextq_f32(vmax_5678, vmax_9101112, 1); - float32x2_t vmax_12_34 = - vpmax_f32(vget_low_f32(vmax_1234), vget_high_f32(vmax_1234)); - float32x2_t vmax_23_45 = - vpmax_f32(vget_low_f32(vmax_2345), vget_high_f32(vmax_2345)); - float32x2_t vmax_56_78 = - vpmax_f32(vget_low_f32(vmax_5678), vget_high_f32(vmax_5678)); - float32x2_t vmax_67_89 = - vpmax_f32(vget_low_f32(vmax_6789), vget_high_f32(vmax_6789)); - float32x2_t vmax_123_345 = vmax_f32(vmax_12_34, vmax_23_45); - float32x2_t vmax_567_789 = vmax_f32(vmax_56_78, vmax_67_89); - vst1_f32(&dout_ch[cnt], vmax_123_345); - vst1_f32(&dout_ch[cnt + 2], vmax_567_789); - cnt += 4; - } - for (; w < w_even - 1; w += 2) { - float32x4_t vr0 = vld1q_f32(&r0[w]); - float32x4_t vr1 = vld1q_f32(&r1[w]); - vr0 = 
vsetq_lane_f32(minval, vr0, 3); - vr1 = vsetq_lane_f32(minval, vr1, 3); - float32x4_t vmax1 = vmaxq_f32(vr0, vr1); - float32x2_t vmax2 = - vpmax_f32(vget_low_f32(vmax1), vget_high_f32(vmax1)); - vmax2 = vpmax_f32(vmax2, vmax2); - dout_ch[cnt] = vget_lane_f32(vmax2, 0); - cnt++; - } -#else - dr0 = dr0 + 1; - dr1 = dr1 + 1; - dr_out = dr_out + 1; - // LOG(INFO) << "cnt_num: " << cnt_num << " cnt_num_remain: " << - // cnt_num_remain; - if (cnt_num > 0 || cnt_num_remain > 0) { - asm volatile( - "cmp %[cnt_num], #0 @cmp cnt_num,0\n" - "ble 3f @ble exit\n" - "1: @main loop\n" - "vld1.f32 {d0-d3}, [%[dr0]]! @load d0-d5,dr0\n" - "vld1.f32 {d6-d9}, [%[dr1]]! @load d4-d7,dr1\n" - "vld1.f32 {d4-d5}, [%[dr0]]! @load d0-d5,dr0\n" - "vld1.f32 {d10-d11}, [%[dr1]]! @load d4-d7,dr1\n" - "vmax.f32 q6, q0, q3 @max r0_1234,r1_1234\n" - "vmax.f32 q7, q1, q4 @max r0_5678,r1_5678\n" - "vmax.f32 q8, q2, q5 @max r0_9101112,r1_9101112\n" - //"vmov.f32 s7,s6 @mov s7,s6\n" - "vext.f32 q0, q6, q7, #1 @vext max_2345\n" - "vext.f32 q1, q7, q8, #1 @vext max_6789\n" - "vpmax.f32 d4, d12, d13 @pmax d4,vmax_1234,vmax_1234\n" - "vpmax.f32 d6, d14, d15 @pmax d6,vmax_5678,vmax_5678\n" - "vpmax.f32 d5, d0, d1 @pmax d5,vmax_2345,vmax_2345\n" - "vpmax.f32 d7, d2, d3 @pmax d7,vmax_6789,vmax_6789\n" - "vmax.f32 d8, d4, d5 @max d2,vmax_12_34,vmax_23_45\n" - "vmax.f32 d9, d6, d7 @max d2,vmax_56_78,vmax_67_89\n" - "sub %[dr0], #16 @add w,8\n" - "sub %[dr1], #16 @add w, 8\n" - "vst1.f32 d8, [%[dr_out]]! @vst1 d0,dr_out\n" - "vst1.f32 d9, [%[dr_out]]! @vst1 d0,dr_out\n" - "subs %[cnt_num], #1 @subs cnt_num, #1\n" - "bne 1b @bne s3_max_loop\n" - "3: @loop \n" - "cmp %[cnt_num_remain], #0 @cmp cnt_num,0\n" - "ble 4f @ble exit\n" - "2: @main loop\n" - "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d1,dr0\n" - "vld1.f32 {d2-d3}, [%[dr1]]! @load d2-d3,dr1\n" - "vmov.f32 s3,s2 @movs3,s2\n" - "vmov.f32 s7,s6 @movs7,s6\n" - "vmax.f32 q0, q0, q1 @max q0,q0,q1\n" - "vpmax.f32 d0, d0, d1 @pmax d0,d0,d1\n" - "vpmax.f32 d0, d0, d0 @pmax d0,d0,d0\n" - "vst1.f32 d0[0], [%[dr_out]]! 
@vst d0[0],dr_out\n" - "sub %[dr0], #8 @add w,6\n" - "sub %[dr1], #8 @add w,6\n" - "subs %[cnt_num_remain], #1 @subs cnt_num,#1\n" - "bne 2b @bne s3_max_loop_1\n" - "4: @exit\n" - : [dr0] "+r"(dr0), - [dr1] "+r"(dr1), - [dr_out] "+r"(dr_out), - [cnt_num] "+r"(cnt_num), - [cnt_num_remain] "+r"(cnt_num_remain) - : "r"(dr0), "r"(dr1), "r"(dr_out), "r"(cnt_num), "r"(cnt_num_remain) - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9"); - } -#endif - // int w = w_even - 1; - if (w_remain > 0) { - // deal with right pad - int wstart = (w_even >> 1) * stride - padding; - int wend = std::min(std::min(wstart + kernel, win + padding), win); - float tmp = r0[wstart]; // std::numeric_limits::min(); - for (int i = wstart; i < wend; i++) { // only run 1 or 2 times - tmp = std::max(tmp, std::max(r0[i], r1[i])); - } - dout_ch[w_even >> 1] = tmp; - // cnt ++; - } - - r0 = r1; - r1 = r0 + win; - r2 = r1 + win; - dout_ch += wout; - int h = 2; - for (; h < h_even; h += 2) { - // deal with left pad - float maxr0 = std::max(r0[0], r0[1]); - float maxr1 = std::max(r1[0], r1[1]); - float maxr2 = std::max(r2[0], r2[1]); - dout_ch[0] = std::max(std::max(maxr0, maxr1), maxr2); -#if __aarch64__ - w = 1; - cnt = 1; - for (; w < w_unroll_size; w += 8) { - float32x4_t vr0_1234 = vld1q_f32(&r0[w]); - float32x4_t vr0_5678 = vld1q_f32(&r0[w + 4]); - float32x4_t vr0_9101112 = vld1q_f32(&r0[w + 8]); - float32x4_t vr1_1234 = vld1q_f32(&r1[w]); - float32x4_t vr1_5678 = vld1q_f32(&r1[w + 4]); - float32x4_t vr1_9101112 = vld1q_f32(&r1[w + 8]); - float32x4_t vr2_1234 = vld1q_f32(&r2[w]); - float32x4_t vr2_5678 = vld1q_f32(&r2[w + 4]); - float32x4_t vr2_9101112 = vld1q_f32(&r2[w + 8]); - float32x4_t vmax_1234 = vmaxq_f32(vr0_1234, vr1_1234); - vmax_1234 = vmaxq_f32(vmax_1234, vr2_1234); - float32x4_t vmax_5678 = vmaxq_f32(vr0_5678, vr1_5678); - vmax_5678 = vmaxq_f32(vmax_5678, vr2_5678); - float32x4_t vmax_9101112 = vmaxq_f32(vr0_9101112, vr1_9101112); - vmax_9101112 = vmaxq_f32(vmax_9101112, vr2_9101112); - float32x4_t vmax_2345 = vextq_f32(vmax_1234, vmax_5678, 1); - float32x4_t vmax_6789 = vextq_f32(vmax_5678, vmax_9101112, 1); - float32x2_t vmax_12_34 = - vpmax_f32(vget_low_f32(vmax_1234), vget_high_f32(vmax_1234)); - float32x2_t vmax_23_45 = - vpmax_f32(vget_low_f32(vmax_2345), vget_high_f32(vmax_2345)); - float32x2_t vmax_56_78 = - vpmax_f32(vget_low_f32(vmax_5678), vget_high_f32(vmax_5678)); - float32x2_t vmax_67_89 = - vpmax_f32(vget_low_f32(vmax_6789), vget_high_f32(vmax_6789)); - float32x2_t vmax_123_345 = vmax_f32(vmax_12_34, vmax_23_45); - float32x2_t vmax_567_789 = vmax_f32(vmax_56_78, vmax_67_89); - vst1_f32(&dout_ch[cnt], vmax_123_345); - vst1_f32(&dout_ch[cnt + 2], vmax_567_789); - cnt += 4; - } - for (; w < w_even - 1; w += 2) { - float32x4_t vr0 = vld1q_f32(&r0[w]); - float32x4_t vr1 = vld1q_f32(&r1[w]); - float32x4_t vr2 = vld1q_f32(&r2[w]); - vr0 = vsetq_lane_f32(minval, vr0, 3); - vr1 = vsetq_lane_f32(minval, vr1, 3); - vr2 = vsetq_lane_f32(minval, vr2, 3); - float32x4_t vmax1 = vmaxq_f32(vr0, vr1); - vmax1 = vmaxq_f32(vmax1, vr2); - float32x2_t vmax2 = - vpmax_f32(vget_low_f32(vmax1), vget_high_f32(vmax1)); - float32x2_t vmax = vpmax_f32(vmax2, vmax2); - dout_ch[cnt] = vget_lane_f32(vmax, 0); - cnt++; - } -#else - dr_out = dout_ch + 1; - dr0 = (r0 + 1); - dr1 = (r1 + 1); - dr2 = (r2 + 1); - cnt_num = w_unroll_size >> 3; - cnt_num_remain = w_unroll_remain >> 1; - if (cnt_num > 0 || cnt_num_remain > 0) { - asm volatile( - "cmp %[cnt_num], #0 @cmp cnt_num,0\n" - 
"ble 3f @ble exit\n" - "1: @main loop\n" - "vld1.f32 {d0-d3}, [%[dr0]]! @load d0-d5,dr0\n" - "vld1.f32 {d6-d9}, [%[dr1]]! @load d4-d7,dr1\n" - "vld1.f32 {d12-d15}, [%[dr2]]! @load d4-d7,dr1\n" - "vld1.f32 {d4-d5}, [%[dr0]]! @load d0-d5,dr0\n" - "vld1.f32 {d10-d11}, [%[dr1]]! @load d4-d7,dr1\n" - "vld1.f32 {d16-d17}, [%[dr2]]! @load d4-d7,dr1\n" - "vmax.f32 q9, q0, q3 @max q0,q0,q2\n" - "vmax.f32 q10, q1, q4 @max q1,q1,q3\n" - "vmax.f32 q11, q2, q5 @max q1,q1,q3\n" - "vmax.f32 q0, q9, q6 @max q0,q0,q2 1234\n" - "vmax.f32 q3, q10, q7 @max q1,q1,q3 5678\n" - "vmax.f32 q1, q11, q8 @max q1,q1,q3 9101112\n" - //"vmov.f32 s7,s6 @mov s7, s6\n" - "vext.f32 q4, q0, q3, #1 @vext 2345\n" - "vext.f32 q2, q3, q1, #1 @vext 6789\n" - "vpmax.f32 d10, d0, d1 @pmax d10,vmax_1234,vmax_1234\n" - "vpmax.f32 d12, d6, d7 @pmax d12,vmax_5678,vmax_5678\n" - "vpmax.f32 d11, d8, d9 @pmax d11,vmax_2345,vmax_2345\n" - "vpmax.f32 d13, d4, d5 @pmax d13,vmax_6789,vmax_6789\n" - "vmax.f32 d0, d10, d11 @pmax d0,vmax_12_34,vmax_23_45\n" - "vmax.f32 d1, d12, d13 @pmax d1,vmax_56_78,vmax_67_89\n" - "sub %[dr0], #16 @add w,8\n" - "sub %[dr1], #16 @add w,8\n" - "sub %[dr2], #16 @add w,8\n" - "vst1.f32 d0, [%[dr_out]]! @vst1 d0,dr_out\n" - "vst1.f32 d1, [%[dr_out]]! @vst1 d0,dr_out\n" - "subs %[cnt_num], #1 @subs cnt_num,#1\n" - "bne 1b @bne s3_max_loop_mid\n" - "3: @loop \n" - "cmp %[cnt_num_remain], #0 @cmp cnt_num,0\n" - "ble 4f @ble exit1\n" - "2: @mid loop\n" - "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d1,dr0\n" - "vld1.f32 {d2-d3}, [%[dr1]]! @load d2-d3,dr1\n" - "vld1.f32 {d4-d5}, [%[dr2]]! @load d2-d3,dr1\n" - "vmov.f32 s3,s2 @movs3,s2\n" - "vmov.f32 s7,s6 @movs7,s6\n" - "vmov.f32 s11,s10 @movs11,s10\n" - "vmax.f32 q0, q0, q1 @max q0,q0,q1\n" - "vmax.f32 q0, q0, q2 @max q0,q0,q2\n" - "vpmax.f32 d0, d0, d1 @pmax d0,d0,d1\n" - "vpmax.f32 d0, d0, d0 @pmax d0, d0,d0\n" - "vst1.f32 d0[0], [%[dr_out]]! 
@vst d0[0],dr_out\n" - "sub %[dr0], #8 @add w,6\n" - "sub %[dr1], #8 @add w,6\n" - "sub %[dr2], #8 @add w,6\n" - "subs %[cnt_num_remain], #1 @subs cnt_num,#1\n" - "bne 2b @bne s3_max_loop_mid_1\n" - "4: @exit\n" - : [dr0] "+r"(dr0), - [dr1] "+r"(dr1), - [dr2] "+r"(dr2), - [dr_out] "+r"(dr_out), - [cnt_num] "+r"(cnt_num), - [cnt_num_remain] "+r"(cnt_num_remain) - : "r"(dr0), - "r"(dr1), - "r"(dr2), - "r"(dr_out), - "r"(cnt_num), - "r"(cnt_num_remain) - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12"); - } -#endif - if (w_remain > 0) { - // deal with right pad - int wstart = (w_even >> 1) * stride - padding; - int wend = std::min(std::min(wstart + kernel, win + padding), win); - float tmp = r0[wstart]; // std::numeric_limits::min(); - for (int i = wstart; i < wend; i++) { - tmp = std::max(tmp, std::max(r0[i], r1[i])); - tmp = std::max(tmp, r2[i]); - } - dout_ch[w_even >> 1] = tmp; - // cnt ++; - } - r0 = r2; - r1 = r0 + win; - r2 = r1 + win; - dout_ch += wout; - } - - if (h_remain > 0) { - // deal with bottom pad - // first row with zero pad - int hstart = (h >> 1) * stride - padding; - int hend = std::min(std::min(hstart + kernel, hin + padding), hin); - if (hstart == hend - 1) { // only one lline - dout_ch[0] = std::max(r0[0], r0[1]); -#if __aarch64__ - w = 1; - cnt = 1; - for (; w < w_unroll_size; w += 8) { - float32x4_t vmax_1234 = vld1q_f32(&r0[w]); - float32x4_t vmax_5678 = vld1q_f32(&r0[w + 4]); - float32x4_t vmax_9101112 = vld1q_f32(&r0[w + 8]); - float32x4_t vmax_2345 = vextq_f32(vmax_1234, vmax_5678, 1); - float32x4_t vmax_6789 = vextq_f32(vmax_5678, vmax_9101112, 1); - float32x2_t vmax_12_34 = - vpmax_f32(vget_low_f32(vmax_1234), vget_high_f32(vmax_1234)); - float32x2_t vmax_23_45 = - vpmax_f32(vget_low_f32(vmax_2345), vget_high_f32(vmax_2345)); - float32x2_t vmax_56_78 = - vpmax_f32(vget_low_f32(vmax_5678), vget_high_f32(vmax_5678)); - float32x2_t vmax_67_89 = - vpmax_f32(vget_low_f32(vmax_6789), vget_high_f32(vmax_6789)); - float32x2_t vmax_123_345 = vmax_f32(vmax_12_34, vmax_23_45); - float32x2_t vmax_567_789 = vmax_f32(vmax_56_78, vmax_67_89); - vst1_f32(&dout_ch[cnt], vmax_123_345); - vst1_f32(&dout_ch[cnt + 2], vmax_567_789); - cnt += 4; - } - for (; w < w_even - 1; w += 2) { - float32x4_t vr0 = vld1q_f32(&r0[w]); - vr0 = vsetq_lane_f32(minval, vr0, 3); - float32x2_t vmax = vpmax_f32(vget_low_f32(vr0), vget_high_f32(vr0)); - vmax = vpmax_f32(vmax, vmax); - dout_ch[cnt] = vget_lane_f32(vmax, 0); - cnt++; - } -#else - dr_out = dout_ch + 1; - dr0 = (r0 + 1); - cnt_num = w_unroll_size >> 3; - cnt_num_remain = w_unroll_remain >> 1; - // LOG(INFO) << "cnt_num: " << cnt_num << " cnt_num_remain: " << - // cnt_num_remain; - if (cnt_num > 0 || cnt_num_remain > 0) { - asm volatile( - "cmp %[cnt_num], #0 @cmp cnt_num,0\n" - "ble 3f @ble exit\n" - "1: @main loop\n" - "vld1.f32 {d0-d3}, [%[dr0]]! @load d0-d3,dr0\n" - "vld1.f32 {d4-d5}, [%[dr0]]! @load d0-d3,dr0\n" - "vext.f32 q4, q0, q1, #1 @vmax_2345\n" - "vext.f32 q5, q1, q2, #1 @vmax_6789\n" - "vpmax.f32 d12, d0, d1 @vmax_12_34\n" - "vpmax.f32 d14, d2, d3 @vmax_56_78\n" - "vpmax.f32 d13, d8, d9 @vmax_23_45\n" - "vpmax.f32 d15, d10, d11 @vmax_67_89\n" - "vmax.f32 d0, d12, d13 @12_34,23_45\n" - "vmax.f32 d1, d14, d15 @56_78,67_89\n" - "sub %[dr0], #16 @add w,6\n" - "vst1.f32 d0, [%[dr_out]]! @vst1 d0,dr_out\n" - "vst1.f32 d1, [%[dr_out]]! 
@vst1 d0,dr_out\n" - "subs %[cnt_num], #1 @subs cnt_num,#1\n" - "bne 1b @bne s3_max_loop_bot\n" - "3: @loop \n" - "cmp %[cnt_num_remain], #0 @cmp cnt_num,0\n" - "ble 4f @ble exit\n" - "2: @bot loop\n" - "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d1,dr0\n" - "vmov.f32 s3,s2 @movs3, s2\n" - "vpmax.f32 d0, d0, d1 @pmax d0,d0,d1\n" - "vpmax.f32 d0, d0, d0 @pmax d0,d0,d0\n" - "vst1.f32 d0[0], [%[dr_out]]! @vst d0[0],dr_out\n" - "sub %[dr0], #8 @add w,2\n" - "subs %[cnt_num_remain], #1 @subs cnt_num,#1\n" - "bne 2b @bne s3_max_loop_bot_1\n" - "4: @exit\n" - : [dr0] "+r"(dr0), - [dr1] "+r"(dr1), - [dr_out] "+r"(dr_out), - [cnt_num] "+r"(cnt_num), - [cnt_num_remain] "+r"(cnt_num_remain) - : "r"(dr0), - "r"(dr1), - "r"(dr_out), - "r"(cnt_num), - "r"(cnt_num_remain) - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8"); - } -#endif - if (w_remain > 0) { - // deal with right pad - int wstart = (w_even >> 1) * stride - padding; - int wend = std::min(std::min(wstart + kernel, win + padding), win); - float tmp = r0[wstart]; // std::numeric_limits::min(); - for (int i = wstart; i < wend; i++) { - tmp = std::max(tmp, r0[i]); - } - dout_ch[w_even >> 1] = tmp; - } - } else { // two lines - dout_ch[0] = std::max(std::max(r0[0], r0[1]), std::max(r1[0], r1[1])); -#ifdef __aarch64__ - w = 1; - cnt = 1; - for (; w < w_unroll_size; w += 8) { - float32x4_t vr0_1234 = vld1q_f32(&r0[w]); - float32x4_t vr0_5678 = vld1q_f32(&r0[w + 4]); - float32x4_t vr0_9101112 = vld1q_f32(&r0[w + 8]); - float32x4_t vr1_1234 = vld1q_f32(&r1[w]); - float32x4_t vr1_5678 = vld1q_f32(&r1[w + 4]); - float32x4_t vr1_9101112 = vld1q_f32(&r1[w + 8]); - float32x4_t vmax_1234 = vmaxq_f32(vr0_1234, vr1_1234); - float32x4_t vmax_5678 = vmaxq_f32(vr0_5678, vr1_5678); - float32x4_t vmax_9101112 = vmaxq_f32(vr0_9101112, vr1_9101112); - float32x4_t vmax_2345 = vextq_f32(vmax_1234, vmax_5678, 1); - float32x4_t vmax_6789 = vextq_f32(vmax_5678, vmax_9101112, 1); - float32x2_t vmax_12_34 = - vpmax_f32(vget_low_f32(vmax_1234), vget_high_f32(vmax_1234)); - float32x2_t vmax_23_45 = - vpmax_f32(vget_low_f32(vmax_2345), vget_high_f32(vmax_2345)); - float32x2_t vmax_56_78 = - vpmax_f32(vget_low_f32(vmax_5678), vget_high_f32(vmax_5678)); - float32x2_t vmax_67_89 = - vpmax_f32(vget_low_f32(vmax_6789), vget_high_f32(vmax_6789)); - float32x2_t vmax_123_345 = vmax_f32(vmax_12_34, vmax_23_45); - float32x2_t vmax_567_789 = vmax_f32(vmax_56_78, vmax_67_89); - vst1_f32(&dout_ch[cnt], vmax_123_345); - vst1_f32(&dout_ch[cnt + 2], vmax_567_789); - cnt += 4; - } - for (; w < w_even - 1; w += 2) { - float32x4_t vr0 = vld1q_f32(&r0[w]); - float32x4_t vr1 = vld1q_f32(&r1[w]); - vr0 = vsetq_lane_f32(minval, vr0, 3); - vr1 = vsetq_lane_f32(minval, vr1, 3); - float32x4_t vmax1 = vmaxq_f32(vr0, vr1); - float32x2_t vmax2 = - vpmax_f32(vget_low_f32(vmax1), vget_high_f32(vmax1)); - vmax2 = vpmax_f32(vmax2, vmax2); - dout_ch[cnt] = vget_lane_f32(vmax2, 0); - cnt++; - } -#else - dr_out = dout_ch + 1; - dr0 = (r0 + 1); - dr1 = (r1 + 1); - cnt_num = w_unroll_size >> 3; - cnt_num_remain = w_unroll_remain >> 1; - // LOG(INFO) << "cnt_num: " << cnt_num << " cnt_num_remain: " << - // cnt_num_remain; - if (cnt_num > 0 || cnt_num_remain > 0) { - asm volatile( - "cmp %[cnt_num], #0 @cmp cnt_num,0\n" - "ble 3f @ble exit\n" - "1: @main loop\n" - "vld1.f32 {d0-d3}, [%[dr0]]! @load d0-d5,dr0\n" - "vld1.f32 {d6-d9}, [%[dr1]]! @load d4-d7,dr1\n" - "vld1.f32 {d4-d5}, [%[dr0]]! @load d0-d3,dr0\n" - "vld1.f32 {d10-d11}, [%[dr1]]! 
@load d4-d7,dr1\n" - "vmax.f32 q6, q0, q3 @max q0,q0,q2 1234\n" - "vmax.f32 q7, q1, q4 @max q1,q1,q3 5678\n" - "vmax.f32 q8, q2, q5 @max q1,q1,q3 9101112\n" - //"vmov.f32 s7,s6 @mov s7, s6\n" - "vext.f32 q0, q6, q7, #1 @vext q0,2345\n" - "vext.f32 q1, q7, q8, #1 @vext q1,6789\n" - "vpmax.f32 d4, d12, d13 @pmax " - "d4,vmax_1234,vmax_1234\n" - "vpmax.f32 d6, d14, d15 @pmax " - "d6,vmax_5678,vmax_5678\n" - "vpmax.f32 d5, d0, d1 @pmax " - "d5,vmax_2345,vmax_2345\n" - "vpmax.f32 d7, d2, d3 @pmax " - "d7,vmax_6789,vmax_6789\n" - "vmax.f32 d8, d4, d5 @max " - "d2,vmax_12_34,vmax_23_45\n" - "vmax.f32 d9, d6, d7 @max " - "d2,vmax_56_78,vmax_67_89\n" - "sub %[dr0], #16 @add w,8\n" - "sub %[dr1], #16 @add w,8\n" - "vst1.f32 d8, [%[dr_out]]! @vst1 d0,dr_out\n" - "vst1.f32 d9, [%[dr_out]]! @vst1 d0,dr_out\n" - "subs %[cnt_num], #1 @subs cnt_num,#1\n" - "bne 1b @bne s3_max_loop_bot\n" - "3: @loop \n" - "cmp %[cnt_num_remain], #0 @cmp cnt_num,0\n" - "ble 4f @ble exit\n" - "2: @bot loop\n" - "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d1,dr0\n" - "vld1.f32 {d2-d3}, [%[dr1]]! @load d2-d3,dr1\n" - "vmov.f32 s3,s2 @movs3, s2\n" - "vmov.f32 s7,s6 @movs7, s6\n" - "vmax.f32 q0, q0, q1 @max q0,q0,q1\n" - "vpmax.f32 d0, d0, d1 @pmax d0,d0,d1\n" - "vpmax.f32 d0, d0, d0 @pmax d0,d0,d0\n" - "vst1.f32 d0[0], [%[dr_out]]! @vst d0[0],dr_out\n" - "sub %[dr0], #8 @add w,6\n" - "sub %[dr1], #8 @add w,6\n" - "subs %[cnt_num_remain], #1 @subs cnt_num,#1\n" - "bne 2b @bne s3_max_loop_bot_1\n" - "4: @exit\n" - : [dr0] "+r"(dr0), - [dr1] "+r"(dr1), - [dr_out] "+r"(dr_out), - [cnt_num] "+r"(cnt_num), - [cnt_num_remain] "+r"(cnt_num_remain) - : "r"(dr0), - "r"(dr1), - "r"(dr_out), - "r"(cnt_num), - "r"(cnt_num_remain) - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9"); - } -#endif - if (w_remain > 0) { - // deal with right pad - int wstart = (w_even >> 1) * stride - padding; - int wend = std::min(std::min(wstart + kernel, win + padding), win); - float tmp = r0[wstart]; // std::numeric_limits::min(); - for (int i = wstart; i < wend; i++) { // only run 1 or 2 times - tmp = std::max(tmp, std::max(r0[i], r1[i])); - } - dout_ch[w_even >> 1] = tmp; - } - } - } - } - } -} - -void pooling3x3s2p1_avg(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - bool exclusive) { - int kernel = 3; - int stride = 2; - int padding = 1; - int size_channel_out = wout * hout; - int size_channel_in = win * hin; - - int w_needed = (wout << 1) + 1; - int h_needed = (hout << 1) + 1; - int w_limit = w_needed > win ? win : w_needed; - int h_limit = h_needed > hin ? hin : h_needed; - int w_even = (w_limit >> 1) << 1; - int h_even = (h_limit >> 1) << 1; - int w_unroll_size = ((w_even - 1) >> 3) << 3; - int w_unroll_remain = w_even - 1 - w_unroll_size; - int w_remain = w_needed - w_limit - padding; - int h_remain = h_needed - h_limit - padding; - int w_in_2 = win << 1; - const float coef = 1.f / 9.f; - const float coef_1 = exclusive ? 1.f : coef; - const float coef_2 = exclusive ? 1.f / 2.f : coef; - const float coef_3 = exclusive ? 1.f / 3.f : coef; - const float coef_4 = exclusive ? 1.f / 4.f : coef; - const float coef_6 = exclusive ? 
1.f / 6.f : coef; - float32x4_t vcoef = vdupq_n_f32(coef); - float32x4_t vcoef_1 = vdupq_n_f32(coef_1); - float32x4_t vcoef_2 = vdupq_n_f32(coef_2); - float32x4_t vcoef_3 = vdupq_n_f32(coef_3); - float32x4_t vcoef_4 = vdupq_n_f32(coef_4); - float32x4_t vcoef_6 = vdupq_n_f32(coef_6); - for (int n = 0; n < num; ++n) { - float* dout_batch = dout + n * chout * size_channel_out; - const float* din_batch = din + n * chin * size_channel_in; -#pragma omp parallel for - for (int c = 0; c < chout; c++) { - float* dout_ch = dout_batch + c * size_channel_out; - const float* din_ch = din_batch + c * size_channel_in; - const float* r0 = din_ch; - const float* r1 = r0 + win; - const float* r2 = r1 + win; - int cnt_num = w_unroll_size >> 3; - int cnt_num_remain = w_unroll_remain >> 1; - float* dr_out = dout_ch; - const float* dr0 = r0; - const float* dr1 = r1; - const float* dr2 = r2; - int w = 1; - int cnt = 1; - float32x4_t vzero = vdupq_n_f32(0.f); - dout_ch[0] = (r0[0] + r0[1] + r1[0] + r1[1]) * coef_4; -// first row with zero pad -#ifdef __aarch64__ - for (; w < w_unroll_size; w += 8) { - float32x4_t vr0_1234 = vld1q_f32(&r0[w]); - float32x4_t vr0_5678 = vld1q_f32(&r0[w + 4]); - float32x4_t vr0_9101112 = vld1q_f32(&r0[w + 8]); - float32x4_t vr1_1234 = vld1q_f32(&r1[w]); - float32x4_t vr1_5678 = vld1q_f32(&r1[w + 4]); - float32x4_t vr1_9101112 = vld1q_f32(&r1[w + 8]); - float32x4_t vsum_1234 = vaddq_f32(vr0_1234, vr1_1234); - float32x4_t vsum_5678 = vaddq_f32(vr0_5678, vr1_5678); - float32x4_t vsum_9101112 = vaddq_f32(vr0_9101112, vr1_9101112); - - float32x4_t vsum_2345 = vextq_f32(vsum_1234, vsum_5678, 1); - float32x4_t vsum_3456 = vextq_f32(vsum_1234, vsum_5678, 2); - float32x4_t vsum_4567 = vextq_f32(vsum_1234, vsum_5678, 3); - float32x4_t vsum_6789 = vextq_f32(vsum_5678, vsum_9101112, 1); - float32x4_t vsum_123_345 = vaddq_f32(vsum_1234, vsum_2345); - vsum_123_345 = vaddq_f32(vsum_123_345, vsum_3456); - float32x4_t vsum_567_789 = vaddq_f32(vsum_4567, vsum_5678); - vsum_567_789 = vaddq_f32(vsum_567_789, vsum_6789); - vsum_123_345 = - vsetq_lane_f32(vgetq_lane_f32(vsum_123_345, 2), vsum_123_345, 1); - vsum_123_345 = - vsetq_lane_f32(vgetq_lane_f32(vsum_567_789, 1), vsum_123_345, 2); - vsum_123_345 = - vsetq_lane_f32(vgetq_lane_f32(vsum_567_789, 3), vsum_123_345, 3); - float32x4_t vrst = vmulq_f32(vsum_123_345, vcoef_6); - vst1q_f32(&dout_ch[cnt], vrst); - cnt += 4; - } - for (; w < w_even - 1; w += 2) { - float32x4_t vr0 = vld1q_f32(&r0[w]); - float32x4_t vr1 = vld1q_f32(&r1[w]); - vr0 = vsetq_lane_f32(0.f, vr0, 3); - vr1 = vsetq_lane_f32(0.f, vr1, 3); - float32x4_t vsum1 = vaddq_f32(vr0, vr1); - float32x2_t vsum2 = - vpadd_f32(vget_low_f32(vsum1), vget_high_f32(vsum1)); - vsum2 = vpadd_f32(vsum2, vsum2); - float32x2_t vrst = vmul_f32(vsum2, vget_low_f32(vcoef_6)); - dout_ch[cnt] = vget_lane_f32(vrst, 0); - cnt++; - } -#else - dr0 = dr0 + 1; - dr1 = dr1 + 1; - dr_out = dr_out + 1; - // LOG(INFO) << "cnt_num: " << cnt_num << " cnt_num_remain: " << - // cnt_num_remain; - if (cnt_num > 0 || cnt_num_remain > 0) { - asm volatile( - "cmp %[cnt_num], #0 @cmp cnt_num,0\n" - "ble 3f @ble exit\n" - "1: @main loop\n" - "vld1.f32 {d0-d3}, [%[dr0]]! @load d0-d5,dr0\n" - "vld1.f32 {d6-d9}, [%[dr1]]! @load d4-d7,dr1\n" - "vld1.f32 {d4-d5}, [%[dr0]]! @load d0-d5,dr0\n" - "vld1.f32 {d10-d11}, [%[dr1]]! 
@load d4-d7,dr1\n" - "vadd.f32 q6, q0, q3 @max r0_1234,r1_1234\n" - "vadd.f32 q7, q1, q4 @max r0_5678,r1_5678\n" - "vadd.f32 q8, q2, q5 @max r0_9101112,r1_9101112\n" - //"vmov.f32 s7,s6 @mov s7, s6\n" - "vext.f32 q0, q6, q7, #1 @vext max_2345\n" - "vext.f32 q1, q6, q7, #3 @vext max_4567\n" - "vext.f32 q2, q6, q7, #2 @vext max_3456\n" - "vext.f32 q3, q7, q8, #1 @vext max_6789\n" - "vadd.f32 q4, q6, q0 @add 1234, 2345\n" - "vadd.f32 q5, q7, q1 @add 5678, 4567\n" - "vadd.f32 q4, q4, q2 @add 3456, sum1\n" - "vadd.f32 q5, q5, q3 @add 6789, sum2\n" - "vmov.f32 s17, s18 @mov\n" - "vmov.f32 s18, s21 @mov\n" - "vmov.f32 s19, s23 @mov\n" - "vmul.f32 q4, q4, %q[vcoef_6] @mul\n" - "sub %[dr0], #16 @add w,8\n" - "sub %[dr1], #16 @add w,8\n" - "subs %[cnt_num], #1 @subs cnt_num,#1\n" - "vst1.f32 d8, [%[dr_out]]! @vst1 d0,dr_out\n" - "vst1.f32 d9, [%[dr_out]]! @vst1 d0,dr_out\n" - "bne 1b @bne s3_max_loop\n" - "3: @loop\n" - "cmp %[cnt_num_remain], #0 @cnt_num_remain<=0\n" - "ble 4f @ble exit\n" - "2: @main loop\n" - "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d1,dr0\n" - "vld1.f32 {d2-d3}, [%[dr1]]! @load d2-d3,dr1\n" - "vext.f32 q0, %q[vzero], q0, #3 @ext v0_0123\n" - "vext.f32 q1, %q[vzero], q1, #3 @ext v1_0123\n" - "vadd.f32 q0, q0, q1 @add q0,q0,q1\n" - "vpadd.f32 d0, d0, d1 @padd d0,d0,d1\n" - "vpadd.f32 d0, d0, d0 @padd d0, d0,d0\n" - "vmul.f32 d0, d0, %e[vcoef_6] @mul\n" - "sub %[dr0], #8 @add w,6\n" - "sub %[dr1], #8 @add w,6\n" - "subs %[cnt_num_remain], #1 @subs cnt_num,#1\n" - "vst1.f32 d0[0], [%[dr_out]]! @vst d0[0],dr_out\n" - "bne 2b @bne s3_max_loop_1\n" - "4: @exit\n" - : [dr0] "+r"(dr0), - [dr1] "+r"(dr1), - [dr_out] "+r"(dr_out), - [cnt_num] "+r"(cnt_num), - [cnt_num_remain] "+r"(cnt_num_remain), - [vcoef_6] "+w"(vcoef_6), - [vzero] "+w"(vzero) - : "r"(dr0), "r"(dr1), "r"(dr_out), "r"(cnt_num), "r"(cnt_num_remain) - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9"); - } -#endif - // int w = w_even - 1; - if (w_remain > 0) { - // deal with right pad - int wstart = (w_even >> 1) * stride - padding; - int wend = std::min(std::min(wstart + kernel, win + padding), win); - float tmp1 = 0.f; // std::numeric_limits::min(); - float tmp2 = exclusive ? 
1.0f / (2.f * (wend - wstart)) : coef; - for (int i = wstart; i < wend; i++) { // only run 1 or 2 times - tmp1 += (r0[i] + r1[i]); - } - dout_ch[w_even >> 1] = tmp1 * tmp2; - // cnt ++; - } - - r0 = r1; - r1 = r0 + win; - r2 = r1 + win; - dout_ch += wout; - int h = 2; - for (; h < h_even; h += 2) { - // deal with left pad - float sum0 = r0[0] + r0[1]; - float sum1 = r1[0] + r1[1]; - float sum2 = r2[0] + r2[1]; - dout_ch[0] = (sum0 + sum1 + sum2) * coef_6; -#ifdef __aarch64__ - w = 1; - cnt = 1; - for (; w < w_unroll_size; w += 8) { - float32x4_t vr0_1234 = vld1q_f32(&r0[w]); - float32x4_t vr0_5678 = vld1q_f32(&r0[w + 4]); - float32x4_t vr0_9101112 = vld1q_f32(&r0[w + 8]); - float32x4_t vr1_1234 = vld1q_f32(&r1[w]); - float32x4_t vr1_5678 = vld1q_f32(&r1[w + 4]); - float32x4_t vr1_9101112 = vld1q_f32(&r1[w + 8]); - float32x4_t vr2_1234 = vld1q_f32(&r2[w]); - float32x4_t vr2_5678 = vld1q_f32(&r2[w + 4]); - float32x4_t vr2_9101112 = vld1q_f32(&r2[w + 8]); - float32x4_t vsum_1234 = vaddq_f32(vr0_1234, vr1_1234); - float32x4_t vsum_5678 = vaddq_f32(vr0_5678, vr1_5678); - float32x4_t vsum_9101112 = vaddq_f32(vr0_9101112, vr1_9101112); - vsum_1234 = vaddq_f32(vsum_1234, vr2_1234); - vsum_5678 = vaddq_f32(vsum_5678, vr2_5678); - vsum_9101112 = vaddq_f32(vsum_9101112, vr2_9101112); - - float32x4_t vsum_2345 = vextq_f32(vsum_1234, vsum_5678, 1); - float32x4_t vsum_3456 = vextq_f32(vsum_1234, vsum_5678, 2); - float32x4_t vsum_4567 = vextq_f32(vsum_1234, vsum_5678, 3); - float32x4_t vsum_6789 = vextq_f32(vsum_5678, vsum_9101112, 1); - float32x4_t vsum_123_345 = vaddq_f32(vsum_1234, vsum_2345); - vsum_123_345 = vaddq_f32(vsum_123_345, vsum_3456); - float32x4_t vsum_567_789 = vaddq_f32(vsum_4567, vsum_5678); - vsum_567_789 = vaddq_f32(vsum_567_789, vsum_6789); - vsum_123_345 = - vsetq_lane_f32(vgetq_lane_f32(vsum_123_345, 2), vsum_123_345, 1); - vsum_123_345 = - vsetq_lane_f32(vgetq_lane_f32(vsum_567_789, 1), vsum_123_345, 2); - vsum_123_345 = - vsetq_lane_f32(vgetq_lane_f32(vsum_567_789, 3), vsum_123_345, 3); - float32x4_t vrst = vmulq_f32(vsum_123_345, vcoef); - vst1q_f32(&dout_ch[cnt], vrst); - cnt += 4; - } - for (; w < w_even - 1; w += 2) { - float32x4_t vr0 = vld1q_f32(&r0[w]); - float32x4_t vr1 = vld1q_f32(&r1[w]); - float32x4_t vr2 = vld1q_f32(&r2[w]); - vr0 = vsetq_lane_f32(0.f, vr0, 3); - vr1 = vsetq_lane_f32(0.f, vr1, 3); - vr2 = vsetq_lane_f32(0.f, vr2, 3); - float32x4_t vsum1 = vaddq_f32(vr0, vr1); - vsum1 = vaddq_f32(vsum1, vr2); - float32x2_t vsum2 = - vpadd_f32(vget_low_f32(vsum1), vget_high_f32(vsum1)); - float32x2_t vsum = vpadd_f32(vsum2, vsum2); - dout_ch[cnt] = vget_lane_f32(vsum, 0) * coef; - cnt++; - } -#else - dr_out = dout_ch + 1; - dr0 = (r0 + 1); - dr1 = (r1 + 1); - dr2 = (r2 + 1); - cnt_num = w_unroll_size >> 3; - cnt_num_remain = w_unroll_remain >> 1; - if (cnt_num > 0 || cnt_num_remain > 0) { - asm volatile( - "cmp %[cnt_num], #0 @cmp cnt_num,0\n" - "ble 3f @ble exit\n" - "1: @main loop\n" - "vld1.f32 {d0-d3}, [%[dr0]]! @load d0-d5, " - "dr0\n" - "vld1.f32 {d6-d9}, [%[dr1]]! @load d4-d7,dr1\n" - "vld1.f32 {d12-d15}, [%[dr2]]! @load d4-d7,dr1\n" - "vld1.f32 {d4-d5}, [%[dr0]]! @load d0-d5,dr0\n" - "vld1.f32 {d10-d11}, [%[dr1]]! @load d4-d7,dr1\n" - "vld1.f32 {d16-d17}, [%[dr2]]! 
@load d4-d7,dr1\n" - "vadd.f32 q9, q0, q3 @max q0,q0,q2\n" - "vadd.f32 q10, q1, q4 @max q1,q1,q3\n" - "vadd.f32 q11, q2, q5 @max q1,q1,q3\n" - "vadd.f32 q6, q9, q6 @max q0,q0,q2 1234\n" - "vadd.f32 q7, q10, q7 @max q1,q1,q3 5678\n" - "vadd.f32 q8, q11, q8 @max q1,q1,q3 9101112\n" - //"vmov.f32 s7,s6 @mov s7, s6\n" - "vext.f32 q0, q6, q7, #1 @vext max_2345\n" - "vext.f32 q1, q6, q7, #3 @vext max_4567\n" - "vext.f32 q2, q6, q7, #2 @vext max_3456\n" - "vext.f32 q3, q7, q8, #1 @vext max_6789\n" - "vadd.f32 q4, q6, q0 @add 1234,2345\n" - "vadd.f32 q5, q7, q1 @add 5678,4567\n" - "vadd.f32 q4, q4, q2 @add 3456,sum1\n" - "vadd.f32 q5, q5, q3 @add 6789,sum2\n" - "vmov.f32 s17, s18 @mov\n" - "vmov.f32 s18, s21 @mov\n" - "vmov.f32 s19, s23 @mov\n" - "vmul.f32 q4, q4, %q[vcoef] @mul\n" - "sub %[dr0], #16 @add w,8\n" - "sub %[dr1], #16 @add w,8\n" - "sub %[dr2], #16 @add w, 8\n" - "subs %[cnt_num], #1 @subs cnt_num,#1\n" - "vst1.f32 d8, [%[dr_out]]! @vst1 d0,dr_out\n" - "vst1.f32 d9, [%[dr_out]]! @vst1 d0,dr_out\n" - "bne 1b @bne s3_max_loop_mid\n" - "3: @loop\n" - "cmp %[cnt_num_remain], #0 @cnt_num_remain<=0\n" - "ble 4f @ble exit1\n" - "2: @mid loop\n" - "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d1,dr0\n" - "vld1.f32 {d2-d3}, [%[dr1]]! @load d2-d3,dr1\n" - "vld1.f32 {d4-d5}, [%[dr2]]! @load d2-d3,dr1\n" - "vext.f32 q0, %q[vzero], q0, #3 @ext v0_0123\n" - "vext.f32 q1, %q[vzero], q1, #3 @ext v1_0123\n" - "vext.f32 q2, %q[vzero], q2, #3 @ext v1_0123\n" - "vadd.f32 q0, q0, q1 @add q0,q0,q1\n" - "vadd.f32 q0, q0, q2 @add q0,q0,q1\n" - "vpadd.f32 d0, d0, d1 @padd d0,d0,d1\n" - "vpadd.f32 d0, d0, d0 @padd d0,d0,d0\n" - "vmul.f32 d0, d0, %e[vcoef] @mul\n" - "sub %[dr0], #8 @add w,6\n" - "sub %[dr1], #8 @add w,6\n" - "sub %[dr2], #8 @add w,6\n" - "subs %[cnt_num_remain], #1 @cnt_num_remain--\n" - "vst1.f32 d0[0], [%[dr_out]]! @vst d0[0],dr_out\n" - "bne 2b @bne s3_max_loop_mid_1\n" - "4: @exit\n" - : [dr0] "+r"(dr0), - [dr1] "+r"(dr1), - [dr2] "+r"(dr2), - [dr_out] "+r"(dr_out), - [cnt_num] "+r"(cnt_num), - [cnt_num_remain] "+r"(cnt_num_remain), - [vcoef] "+w"(vcoef), - [vzero] "+w"(vzero) - : "r"(dr0), - "r"(dr1), - "r"(dr2), - "r"(dr_out), - "r"(cnt_num), - "r"(cnt_num_remain) - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12"); - } -#endif - if (w_remain > 0) { - // deal with right pad - int wstart = (w_even >> 1) * stride - padding; - int wend = std::min(std::min(wstart + kernel, win + padding), win); - float tmp1 = 0.f; - float tmp2 = exclusive ? 
1.0f / (3.f * (wend - wstart)) : coef; - for (int i = wstart; i < wend; i++) { - tmp1 += (r0[i] + r1[i] + r2[i]); - } - dout_ch[w_even >> 1] = tmp1 * tmp2; - // cnt ++; - } - r0 = r2; - r1 = r0 + win; - r2 = r1 + win; - dout_ch += wout; - } - - if (h_remain > 0) { - // deal with bottom pad - // first row with zero pad - int hstart = (h >> 1) * stride - padding; - int hend = std::min(std::min(hstart + kernel, hin + padding), hin); - if (hstart == hend - 1) { // only one line - dout_ch[0] = (r0[0] + r0[1]) * coef_2; -#ifdef __aarch64__ - w = 1; - cnt = 1; - for (; w < w_unroll_size; w += 8) { - float32x4_t vsum_1234 = vld1q_f32(&r0[w]); - float32x4_t vsum_5678 = vld1q_f32(&r0[w + 4]); - float32x4_t vsum_9101112 = vld1q_f32(&r0[w + 8]); - - float32x4_t vsum_2345 = vextq_f32(vsum_1234, vsum_5678, 1); - float32x4_t vsum_3456 = vextq_f32(vsum_1234, vsum_5678, 2); - float32x4_t vsum_4567 = vextq_f32(vsum_1234, vsum_5678, 3); - float32x4_t vsum_6789 = vextq_f32(vsum_5678, vsum_9101112, 1); - float32x4_t vsum_123_345 = vaddq_f32(vsum_1234, vsum_2345); - vsum_123_345 = vaddq_f32(vsum_123_345, vsum_3456); - float32x4_t vsum_567_789 = vaddq_f32(vsum_4567, vsum_5678); - vsum_567_789 = vaddq_f32(vsum_567_789, vsum_6789); - vsum_123_345 = vsetq_lane_f32( - vgetq_lane_f32(vsum_123_345, 2), vsum_123_345, 1); - vsum_123_345 = vsetq_lane_f32( - vgetq_lane_f32(vsum_567_789, 1), vsum_123_345, 2); - vsum_123_345 = vsetq_lane_f32( - vgetq_lane_f32(vsum_567_789, 3), vsum_123_345, 3); - float32x4_t vrst = vmulq_f32(vsum_123_345, vcoef_3); - vst1q_f32(&dout_ch[cnt], vrst); - cnt += 4; - } - for (; w < w_even - 1; w += 2) { - float32x4_t vr0 = vld1q_f32(&r0[w]); - vr0 = vsetq_lane_f32(0.f, vr0, 3); - float32x2_t vsum = vpadd_f32(vget_low_f32(vr0), vget_high_f32(vr0)); - vsum = vpadd_f32(vsum, vsum); - dout_ch[cnt] = vget_lane_f32(vsum, 0) * coef_3; - cnt++; - } -#else - dr_out = dout_ch + 1; - dr0 = (r0 + 1); - cnt_num = w_unroll_size >> 3; - cnt_num_remain = w_unroll_remain >> 1; - if (cnt_num > 0 || cnt_num_remain > 0) { - asm volatile( - "cmp %[cnt_num], #0 @cmp cnt_num,0\n" - "ble 3f @ble exit\n" - "1: @main loop\n" - "vld1.f32 {d12-d15}, [%[dr0]]! @load d0-d3,dr0\n" - "vld1.f32 {d16-d17}, [%[dr0]]! @load d0-d3,dr0\n" - "vext.f32 q0, q6, q7, #1 @vext max_2345\n" - "vext.f32 q1, q6, q7, #3 @vext max_4567\n" - "vext.f32 q2, q6, q7, #2 @vext max_3456\n" - "vext.f32 q3, q7, q8, #1 @vext max_6789\n" - "vadd.f32 q4, q6, q0 @add 1234,2345\n" - "vadd.f32 q5, q7, q1 @add 5678,4567\n" - "vadd.f32 q4, q4, q2 @add 3456,sum1\n" - "vadd.f32 q5, q5, q3 @add 6789,sum2\n" - "vmov.f32 s17, s18 @mov\n" - "vmov.f32 s18, s21 @mov\n" - "vmov.f32 s19, s23 @mov\n" - "vmul.f32 q4, q4, %q[vcoef_3] @mul\n" - "sub %[dr0], #16 @add w,6\n" - "subs %[cnt_num], #1 @subs cnt_num,#1\n" - "vst1.f32 d8, [%[dr_out]]! @vst1 d0,dr_out\n" - "vst1.f32 d9, [%[dr_out]]! @vst1 d0,dr_out\n" - "bne 1b @bne s3_max_loop_bot\n" - "3: @loop\n" - "cmp %[cnt_num_remain], #0 @cnt_num_remain<=0\n" - "ble 4f @ble exit\n" - "2: @bot loop\n" - "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d1,dr0\n" - "vext.f32 q0, %q[vzero], q0, #3 @ext v0_0123\n" - "vpadd.f32 d0, d0, d1 @padd d0,d0,d1\n" - "vpadd.f32 d0, d0, d0 @padd d0,d0,d0\n" - "vmul.f32 d0, d0, %e[vcoef_3] @mul\n" - "sub %[dr0], #8 @add w,2\n" - "subs %[cnt_num_remain], #1 @cnt_num_remain--\n" - "vst1.f32 d0[0], [%[dr_out]]! 
@vst d0[0],dr_out\n" - "bne 2b @bne s3_max_loop_bot_1\n" - "4: @exit\n" - : [dr0] "+r"(dr0), - [dr1] "+r"(dr1), - [dr_out] "+r"(dr_out), - [cnt_num] "+r"(cnt_num), - [cnt_num_remain] "+r"(cnt_num_remain), - [vcoef_3] "+w"(vcoef_3), - [vzero] "+w"(vzero) - : "r"(dr0), - "r"(dr1), - "r"(dr_out), - "r"(cnt_num), - "r"(cnt_num_remain) - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8"); - } -#endif - if (w_remain > 0) { - // deal with right pad - int wstart = (w_even >> 1) * stride - padding; - int wend = std::min(std::min(wstart + kernel, win + padding), win); - float tmp1 = 0.f; - float tmp2 = exclusive ? 1.0f / (1.f * (wend - wstart)) : coef; - for (int i = wstart; i < wend; i++) { - tmp1 += r0[i]; - } - dout_ch[w_even >> 1] = tmp1 * tmp2; - } - } else { // two lines - dout_ch[0] = (r0[0] + r0[1] + r1[0] + r1[1]) * coef_4; -#ifdef __aarch64__ - w = 1; - cnt = 1; - for (; w < w_unroll_size; w += 8) { - float32x4_t vr0_1234 = vld1q_f32(&r0[w]); - float32x4_t vr0_5678 = vld1q_f32(&r0[w + 4]); - float32x4_t vr0_9101112 = vld1q_f32(&r0[w + 8]); - float32x4_t vr1_1234 = vld1q_f32(&r1[w]); - float32x4_t vr1_5678 = vld1q_f32(&r1[w + 4]); - float32x4_t vr1_9101112 = vld1q_f32(&r1[w + 8]); - - float32x4_t vsum_1234 = vaddq_f32(vr0_1234, vr1_1234); - float32x4_t vsum_5678 = vaddq_f32(vr0_5678, vr1_5678); - float32x4_t vsum_9101112 = vaddq_f32(vr0_9101112, vr1_9101112); - float32x4_t vsum_2345 = vextq_f32(vsum_1234, vsum_5678, 1); - float32x4_t vsum_3456 = vextq_f32(vsum_1234, vsum_5678, 2); - float32x4_t vsum_4567 = vextq_f32(vsum_1234, vsum_5678, 3); - float32x4_t vsum_6789 = vextq_f32(vsum_5678, vsum_9101112, 1); - float32x4_t vsum_123_345 = vaddq_f32(vsum_1234, vsum_2345); - vsum_123_345 = vaddq_f32(vsum_123_345, vsum_3456); - float32x4_t vsum_567_789 = vaddq_f32(vsum_4567, vsum_5678); - vsum_567_789 = vaddq_f32(vsum_567_789, vsum_6789); - vsum_123_345 = vsetq_lane_f32( - vgetq_lane_f32(vsum_123_345, 2), vsum_123_345, 1); - vsum_123_345 = vsetq_lane_f32( - vgetq_lane_f32(vsum_567_789, 1), vsum_123_345, 2); - vsum_123_345 = vsetq_lane_f32( - vgetq_lane_f32(vsum_567_789, 3), vsum_123_345, 3); - float32x4_t vrst = vmulq_f32(vsum_123_345, vcoef_6); - vst1q_f32(&dout_ch[cnt], vrst); - cnt += 4; - } - for (; w < w_even - 1; w += 2) { - float32x4_t vr0 = vld1q_f32(&r0[w]); - float32x4_t vr1 = vld1q_f32(&r1[w]); - vr0 = vsetq_lane_f32(0.f, vr0, 3); - vr1 = vsetq_lane_f32(0.f, vr1, 3); - float32x4_t vsum1 = vaddq_f32(vr0, vr1); - float32x2_t vsum2 = - vpadd_f32(vget_low_f32(vsum1), vget_high_f32(vsum1)); - vsum2 = vpadd_f32(vsum2, vsum2); - float32x2_t vrst = vmul_f32(vsum2, vget_low_f32(vcoef_6)); - dout_ch[cnt] = vget_lane_f32(vrst, 0); - cnt++; - } -#else - dr_out = dout_ch + 1; - dr0 = (r0 + 1); - dr1 = (r1 + 1); - cnt_num = w_unroll_size >> 3; - cnt_num_remain = w_unroll_remain >> 1; - if (cnt_num > 0 || cnt_num_remain > 0) { - asm volatile( - "cmp %[cnt_num], #0 @cmp cnt_num,0\n" - "ble 3f @ble exit\n" - "1: @main loop\n" - "vld1.f32 {d0-d3}, [%[dr0]]! @load d0-d5,dr0\n" - "vld1.f32 {d6-d9}, [%[dr1]]! @load d4-d7,dr1\n" - "vld1.f32 {d4-d5}, [%[dr0]]! @load d0-d3,dr0\n" - "vld1.f32 {d10-d11}, [%[dr1]]! 
@load d4-d7,dr1\n" - "vadd.f32 q6, q0, q3 @add q0,q0,q2 1234\n" - "vadd.f32 q7, q1, q4 @add q1,q1,q3 5678\n" - "vadd.f32 q8, q2, q5 @add q1,q1,q3 9101112\n" - //"vmov.f32 s7,s6 @mov s7,s6\n" - "vext.f32 q0, q6, q7, #1 @vext max_2345\n" - "vext.f32 q1, q6, q7, #3 @vext max_4567\n" - "vext.f32 q2, q6, q7, #2 @vext max_3456\n" - "vext.f32 q3, q7, q8, #1 @vext max_6789\n" - "vadd.f32 q4, q6, q0 @add 1234,2345\n" - "vadd.f32 q5, q7, q1 @add 5678,4567\n" - "vadd.f32 q4, q4, q2 @add 3456,sum1\n" - "vadd.f32 q5, q5, q3 @add 6789,sum2\n" - "vmov.f32 s17, s18 @mov\n" - "vmov.f32 s18, s21 @mov\n" - "vmov.f32 s19, s23 @mov\n" - "vmul.f32 q4, q4, %q[vcoef_6] @mul\n" - "sub %[dr0], #16 @add w,8\n" - "sub %[dr1], #16 @add w,8\n" - "subs %[cnt_num], #1 @subs cnt_num,#1\n" - "vst1.f32 d8, [%[dr_out]]! @vst1 d0,dr_out\n" - "vst1.f32 d9, [%[dr_out]]! @vst1 d0, dr_out\n" - "bne 1b @bne s3_max_loop_bot\n" - "3: @loop\n" - "cmp %[cnt_num_remain], #0 @cnt_num_remain<=0\n" - "ble 4f @ble exit\n" - "2: @bot loop\n" - "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d1,dr0\n" - "vld1.f32 {d2-d3}, [%[dr1]]! @load d2-d3,dr1\n" - "vext.f32 q0, %q[vzero], q0, #3 @ext v0_0123\n" - "vext.f32 q1, %q[vzero], q1, #3 @ext v1_0123\n" - "vadd.f32 q0, q0, q1 @add q0,q0,q1\n" - "vpadd.f32 d0, d0, d1 @padd d0,d0,d1\n" - "vpadd.f32 d0, d0, d0 @padd d0,d0,d0\n" - "vmul.f32 d0, d0, %e[vcoef_6] @mul\n" - "sub %[dr0], #8 @add w,6\n" - "sub %[dr1], #8 @add w,6\n" - "subs %[cnt_num_remain], #1 @cnt_num_remain--\n" - "vst1.f32 d0[0], [%[dr_out]]! @vst d0[0],dr_out\n" - "bne 2b @bne s3_max_loop_bot_1\n" - "4: @exit\n" - : [dr0] "+r"(dr0), - [dr1] "+r"(dr1), - [dr_out] "+r"(dr_out), - [cnt_num] "+r"(cnt_num), - [cnt_num_remain] "+r"(cnt_num_remain), - [vcoef_6] "+w"(vcoef_6), - [vzero] "+w"(vzero) - : "r"(dr0), - "r"(dr1), - "r"(dr_out), - "r"(cnt_num), - "r"(cnt_num_remain) - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9"); - } -#endif - if (w_remain > 0) { - // deal with right pad - int wstart = (w_even >> 1) * stride - padding; - int wend = std::min(std::min(wstart + kernel, win + padding), win); - float tmp1 = 0.f; - float tmp2 = exclusive ? 1.0f / (2.f * (wend - wstart)) : coef; - for (int i = wstart; i < wend; i++) { // only run 1 or 2 times - tmp1 += (r0[i] + r1[i]); - } - dout_ch[w_even >> 1] = tmp1 * tmp2; - } - } - } - } - } -} - -void pooling3x3s2p0_max(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win) { - int kernel = 3; - int stride = 2; - int padding = 0; - int size_channel_out = wout * hout; - int size_channel_in = win * hin; - - int w_needed = (wout << 1) + 1; - int h_needed = (hout << 1) + 1; - int w_limit = w_needed > win ? win : w_needed; - int h_limit = h_needed > hin ? 
hin : h_needed; - int w_even = ((w_limit - 1) >> 1) << 1; - int h_even = ((h_limit - 1) >> 1) << 1; - int w_unroll_size = (w_even >> 3) << 3; - int w_unroll_remain = w_even - w_unroll_size; - int w_remain = w_needed - w_limit; - int h_remain = h_needed - h_limit; - int w_in_2 = win << 1; - float minval = std::numeric_limits::lowest(); - for (int n = 0; n < num; ++n) { - float* dout_batch = dout + n * chout * size_channel_out; - const float* din_batch = din + n * chin * size_channel_in; -#pragma omp parallel for - for (int c = 0; c < chout; c++) { - float* dout_ch = dout_batch + c * size_channel_out; - const float* din_ch = din_batch + c * size_channel_in; - const float* r0 = din_ch; - const float* r1 = r0 + win; - const float* r2 = r1 + win; - // w = w_in - 8; - float* dr_out = dout_ch; - const float* dr0 = r0; - const float* dr1 = r1; - const float* dr2 = r2; - int w = 0; - int cnt = 0; - // dout_ch[0] = std::max(std::max(r0[0], r0[1]), std::max(r1[0], - // r1[1])); - // first row with zero pad - // r0 = r1; - // r1 = r0 + w_in; - // r2 = r1 + w_in; - // dout_channel += w_out; - int h = 0; - for (; h < h_even; h += 2) { - // deal with left pad - float maxr0 = std::max(r0[0], r0[1]); - float maxr1 = std::max(r1[0], r1[1]); - float maxr2 = std::max(r2[0], r2[1]); -// dout_ch[0] = std::max(std::max(maxr0, maxr1), maxr2); -#ifdef __aarch64__ - w = 0; - cnt = 0; - for (; w < w_unroll_size; w += 8) { - float32x4_t vr0_1234 = vld1q_f32(&r0[w]); - float32x4_t vr0_5678 = vld1q_f32(&r0[w + 4]); - float32x4_t vr0_9101112 = vld1q_f32(&r0[w + 8]); - float32x4_t vr1_1234 = vld1q_f32(&r1[w]); - float32x4_t vr1_5678 = vld1q_f32(&r1[w + 4]); - float32x4_t vr1_9101112 = vld1q_f32(&r1[w + 8]); - float32x4_t vr2_1234 = vld1q_f32(&r2[w]); - float32x4_t vr2_5678 = vld1q_f32(&r2[w + 4]); - float32x4_t vr2_9101112 = vld1q_f32(&r2[w + 8]); - float32x4_t vmax_1234 = vmaxq_f32(vr0_1234, vr1_1234); - vmax_1234 = vmaxq_f32(vmax_1234, vr2_1234); - float32x4_t vmax_5678 = vmaxq_f32(vr0_5678, vr1_5678); - vmax_5678 = vmaxq_f32(vmax_5678, vr2_5678); - float32x4_t vmax_9101112 = vmaxq_f32(vr0_9101112, vr1_9101112); - vmax_9101112 = vmaxq_f32(vmax_9101112, vr2_9101112); - float32x4_t vmax_2345 = vextq_f32(vmax_1234, vmax_5678, 1); - float32x4_t vmax_6789 = vextq_f32(vmax_5678, vmax_9101112, 1); - float32x2_t vmax_12_34 = - vpmax_f32(vget_low_f32(vmax_1234), vget_high_f32(vmax_1234)); - float32x2_t vmax_23_45 = - vpmax_f32(vget_low_f32(vmax_2345), vget_high_f32(vmax_2345)); - float32x2_t vmax_56_78 = - vpmax_f32(vget_low_f32(vmax_5678), vget_high_f32(vmax_5678)); - float32x2_t vmax_67_89 = - vpmax_f32(vget_low_f32(vmax_6789), vget_high_f32(vmax_6789)); - float32x2_t vmax_123_345 = vmax_f32(vmax_12_34, vmax_23_45); - float32x2_t vmax_567_789 = vmax_f32(vmax_56_78, vmax_67_89); - vst1_f32(&dout_ch[cnt], vmax_123_345); - vst1_f32(&dout_ch[cnt + 2], vmax_567_789); - cnt += 4; - } - for (; w < w_even; w += 2) { - float32x4_t vr0 = vld1q_f32(&r0[w]); - float32x4_t vr1 = vld1q_f32(&r1[w]); - float32x4_t vr2 = vld1q_f32(&r2[w]); - vr0 = vsetq_lane_f32(minval, vr0, 3); - vr1 = vsetq_lane_f32(minval, vr1, 3); - vr2 = vsetq_lane_f32(minval, vr2, 3); - float32x4_t vmax1 = vmaxq_f32(vr0, vr1); - vmax1 = vmaxq_f32(vmax1, vr2); - float32x2_t vmax2 = - vpmax_f32(vget_low_f32(vmax1), vget_high_f32(vmax1)); - float32x2_t vmax = vpmax_f32(vmax2, vmax2); - dout_ch[cnt] = vget_lane_f32(vmax, 0); - cnt++; - } -#else - dr_out = dout_ch; // + 1; - dr0 = r0; // (r0 + 1); - dr1 = r1; // (r1 + 1); - dr2 = r2; // (r2 + 1); - int cnt_num = 
w_unroll_size >> 3; - int cnt_num_remain = w_unroll_remain >> 1; - if (cnt_num > 0 || cnt_num_remain > 0) { - asm volatile( - "cmp %[cnt_num], #0 @cmp cnt_num,0\n" - "ble 3f @ble exit\n" - "1: @main loop\n" - "vld1.f32 {d0-d3}, [%[dr0]]! @load d0-d5,dr0\n" - "vld1.f32 {d6-d9}, [%[dr1]]! @load d4-d7,dr1\n" - "vld1.f32 {d12-d15}, [%[dr2]]! @load d4-d7,dr1\n" - "vld1.f32 {d4}, [%[dr0]]! @load d0-d5,dr0\n" - "vld1.f32 {d10}, [%[dr1]]! @load d4-d7,dr1\n" - "vld1.f32 {d16}, [%[dr2]]! @load d4-d7,dr1\n" - "vmax.f32 q9, q0, q3 @max q0,q0,q2\n" - "vmax.f32 q10, q1, q4 @max q1,q1,q3\n" - "vmax.f32 d22, d4, d10 @max q1,q1,q3\n" - "vmax.f32 q0, q9, q6 @max q0,q0,q2 1234\n" - "vmax.f32 q3, q10, q7 @max q1,q1,q3 5678\n" - "vmax.f32 d2, d22, d16 @max q1,q1,q3 9101112\n" - //"vmov.f32 s7,s6 @mov s7, s6\n" - "vext.f32 q4, q0, q3, #1 @vext 2345\n" - "vext.f32 q2, q3, q1, #1 @vext 6789\n" - "vpmax.f32 d10, d0, d1 @pmax " - "d10,vmax_1234,vmax_1234\n" - "vpmax.f32 d12, d6, d7 @pmax " - "d12,vmax_5678,vmax_5678\n" - "vpmax.f32 d11, d8, d9 @pmax " - "d11,vmax_2345,vmax_2345\n" - "vpmax.f32 d13, d4, d5 @pmax " - "d13,vmax_6789,vmax_6789\n" - "vmax.f32 d0, d10, d11 @pmax " - "d0,vmax_12_34,vmax_23_45\n" - "vmax.f32 d1, d12, d13 @pmax " - "d1,vmax_56_78,vmax_67_89\n" - "sub %[dr0], #8 @add w,8\n" - "sub %[dr1], #8 @add w,8\n" - "sub %[dr2], #8 @add w,8\n" - "vst1.f32 d0, [%[dr_out]]! @vst1 d0,dr_out\n" - "vst1.f32 d1, [%[dr_out]]! @vst1 d0,dr_out\n" - "subs %[cnt_num], #1 @cnt_num--\n" - "bne 1b @bne s3_max_loop_mid\n" - "3: @loop\n" - "cmp %[cnt_num_remain], #0 @cmp cnt_num_remain,0\n" - "ble 4f @ble exit1\n" - "2: @mid loop\n" - "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d1,dr0\n" - "vld1.f32 {d2-d3}, [%[dr1]]! @load d2-d3,dr1\n" - "vld1.f32 {d4-d5}, [%[dr2]]! @load d2-d3,dr1\n" - "vmov.f32 s3,s2 @movs3,s2\n" - "vmov.f32 s7,s6 @movs7,s6\n" - "vmov.f32 s11,s10 @movs11,s10\n" - "vmax.f32 q0, q0, q1 @max q0,q0,q1\n" - "vmax.f32 q0, q0, q2 @max q0,q0,q2\n" - "vpmax.f32 d0, d0, d1 @pmax d0,d0,d1\n" - "vpmax.f32 d0, d0, d0 @pmax d0,d0,d0\n" - "vst1.f32 d0[0], [%[dr_out]]! 
@vst d0[0],dr_out\n" - "sub %[dr0], #8 @add w,6\n" - "sub %[dr1], #8 @add w,6\n" - "sub %[dr2], #8 @add w,6\n" - "subs %[cnt_num_remain], #1 @cnt_num_remain--\n" - "bne 2b @bne s3_max_loop_mid_1\n" - "4: @exit\n" - : [dr0] "+r"(dr0), - [dr1] "+r"(dr1), - [dr2] "+r"(dr2), - [dr_out] "+r"(dr_out), - [cnt_num] "+r"(cnt_num), - [cnt_num_remain] "+r"(cnt_num_remain) - : "r"(dr0), - "r"(dr1), - "r"(dr2), - "r"(dr_out), - "r"(cnt_num), - "r"(cnt_num_remain) - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12"); - } -#endif - if (w_remain > 0) { - // deal with right pad - int wstart = (w_even >> 1) * stride - padding; - int wend = std::min(std::min(wstart + kernel, win + padding), win); - float tmp = r0[wstart]; // std::numeric_limits::min(); - for (int i = wstart; i < wend; i++) { - tmp = std::max(tmp, std::max(r0[i], r1[i])); - tmp = std::max(tmp, r2[i]); - } - dout_ch[w_even >> 1] = tmp; - // cnt ++; - } - r0 = r2; - r1 = r0 + win; - r2 = r1 + win; - dout_ch += wout; - } - - if (h_remain > 0) { -// deal with bottom pad -// first row with zero pad -// int hstart = (h >> 1) * stride_h - pad_h; -// int hend = std::min(std::min(hstart + kernel_h, hin + pad_h), hin); -// dout_ch[0] = std::max(std::max(r0[0], r0[1]), std::max(r1[0], -// r1[1])); -#ifdef __aarch64__ - w = 0; - cnt = 0; - for (; w < w_unroll_size; w += 8) { - float32x4_t vr0_1234 = vld1q_f32(&r0[w]); - float32x4_t vr0_5678 = vld1q_f32(&r0[w + 4]); - float32x4_t vr0_9101112 = vld1q_f32(&r0[w + 8]); - float32x4_t vr1_1234 = vld1q_f32(&r1[w]); - float32x4_t vr1_5678 = vld1q_f32(&r1[w + 4]); - float32x4_t vr1_9101112 = vld1q_f32(&r1[w + 8]); - float32x4_t vmax_1234 = vmaxq_f32(vr0_1234, vr1_1234); - float32x4_t vmax_5678 = vmaxq_f32(vr0_5678, vr1_5678); - float32x4_t vmax_9101112 = vmaxq_f32(vr0_9101112, vr1_9101112); - float32x4_t vmax_2345 = vextq_f32(vmax_1234, vmax_5678, 1); - float32x4_t vmax_6789 = vextq_f32(vmax_5678, vmax_9101112, 1); - float32x2_t vmax_12_34 = - vpmax_f32(vget_low_f32(vmax_1234), vget_high_f32(vmax_1234)); - float32x2_t vmax_23_45 = - vpmax_f32(vget_low_f32(vmax_2345), vget_high_f32(vmax_2345)); - float32x2_t vmax_56_78 = - vpmax_f32(vget_low_f32(vmax_5678), vget_high_f32(vmax_5678)); - float32x2_t vmax_67_89 = - vpmax_f32(vget_low_f32(vmax_6789), vget_high_f32(vmax_6789)); - float32x2_t vmax_123_345 = vmax_f32(vmax_12_34, vmax_23_45); - float32x2_t vmax_567_789 = vmax_f32(vmax_56_78, vmax_67_89); - vst1_f32(&dout_ch[cnt], vmax_123_345); - vst1_f32(&dout_ch[cnt + 2], vmax_567_789); - cnt += 4; - } - for (; w < w_even; w += 2) { - float32x4_t vr0 = vld1q_f32(&r0[w]); - float32x4_t vr1 = vld1q_f32(&r1[w]); - vr0 = vsetq_lane_f32(minval, vr0, 3); - vr1 = vsetq_lane_f32(minval, vr1, 3); - float32x4_t vmax1 = vmaxq_f32(vr0, vr1); - float32x2_t vmax2 = - vpmax_f32(vget_low_f32(vmax1), vget_high_f32(vmax1)); - vmax2 = vpmax_f32(vmax2, vmax2); - dout_ch[cnt] = vget_lane_f32(vmax2, 0); - cnt++; - } -#else - dr_out = dout_ch; // + 1; - dr0 = r0; // (r0 + 1); - dr1 = r1; // (r1 + 1); - int cnt_num = w_unroll_size >> 3; - int cnt_num_remain = w_unroll_remain >> 1; - if (cnt_num > 0 || cnt_num_remain > 0) { - asm volatile( - "cmp %[cnt_num], #0 @cmp cnt_num,0\n" - "ble 3f @ble exit\n" - "1: @main loop\n" - "vld1.f32 {d0-d3}, [%[dr0]]! @load d0-d5,dr0\n" - "vld1.f32 {d6-d9}, [%[dr1]]! @load d4-d7,dr1\n" - "vld1.f32 {d4}, [%[dr0]]! @load d0-d3,dr0\n" - "vld1.f32 {d10}, [%[dr1]]! 
@load d4-d7,dr1\n" - "vmax.f32 q6, q0, q3 @max q0,q0,q2 1234\n" - "vmax.f32 q7, q1, q4 @max q1,q1,q3 5678\n" - "vmax.f32 d16, d4, d10 @max q1,q1,q3 9101112\n" - //"vmov.f32 s7,s6 @mov s7,s6\n" - "vext.f32 q0, q6, q7, #1 @vext q0,2345\n" - "vext.f32 q1, q7, q8, #1 @vext q1,6789\n" - "vpmax.f32 d4, d12, d13 @pmax " - "d4,vmax_1234,vmax_1234\n" - "vpmax.f32 d6, d14, d15 @pmax " - "d6,vmax_5678,vmax_5678\n" - "vpmax.f32 d5, d0, d1 @pmax " - "d5,vmax_2345,vmax_2345\n" - "vpmax.f32 d7, d2, d3 @pmax " - "d7,vmax_6789,vmax_6789\n" - "vmax.f32 d8, d4, d5 @max " - "d2,vmax_12_34,vmax_23_45\n" - "vmax.f32 d9, d6, d7 @max " - "d2,vmax_56_78,vmax_67_89\n" - "sub %[dr0], #8 @add w,8\n" - "sub %[dr1], #8 @add w,8\n" - "vst1.f32 d8, [%[dr_out]]! @vst1 d0,dr_out\n" - "vst1.f32 d9, [%[dr_out]]! @vst1 d0,dr_out\n" - "subs %[cnt_num], #1 @subs cnt_num,#1\n" - "bne 1b @bne s3_max_loop_bot\n" - "3: @loop \n" - "cmp %[cnt_num_remain], #0 @cmp cnt_num_remain,0\n" - "ble 4f @ble exit\n" - "2: @bot loop\n" - "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d1,dr0\n" - "vld1.f32 {d2-d3}, [%[dr1]]! @load d2-d3,dr1\n" - "vmov.f32 s3,s2 @movs3,s2\n" - "vmov.f32 s7,s6 @movs7,s6\n" - "vmax.f32 q0, q0, q1 @max q0,q0,q1\n" - "vpmax.f32 d0, d0, d1 @pmax d0,d0,d1\n" - "vpmax.f32 d0, d0, d0 @pmax d0,d0,d0\n" - "vst1.f32 d0[0], [%[dr_out]]! @vst d0[0],dr_out\n" - "sub %[dr0], #8 @add w,6\n" - "sub %[dr1], #8 @add w,6\n" - "subs %[cnt_num_remain], #1 @cnt_num_remain--\n" - "bne 2b @bne s3_max_loop_bot_1\n" - "4: @exit\n" - : [dr0] "+r"(dr0), - [dr1] "+r"(dr1), - [dr_out] "+r"(dr_out), - [cnt_num] "+r"(cnt_num), - [cnt_num_remain] "+r"(cnt_num_remain) - : "r"(dr0), - "r"(dr1), - "r"(dr_out), - "r"(cnt_num), - "r"(cnt_num_remain) - : "cc", - "memory", - "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9"); - } -#endif - if (w_remain > 0) { - // deal with right pad - int wstart = (w_even >> 1) * stride - padding; - int wend = std::min(std::min(wstart + kernel, win + padding), win); - float tmp = r0[wstart]; // std::numeric_limits::min(); - for (int i = wstart; i < wend; i++) { // only run 1 or 2 times - tmp = std::max(tmp, std::max(r0[i], r1[i])); - } - dout_ch[w_even >> 1] = tmp; - } - } - } - } -} - -void pooling3x3s2p0_avg(const float* din, - float* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - bool exclusive) { - int kernel = 3; - int stride = 2; - int padding = 0; - int size_channel_out = wout * hout; - int size_channel_in = win * hin; - - int w_needed = (wout << 1) + 1; - int h_needed = (hout << 1) + 1; - int w_limit = w_needed > win ? win : w_needed; - int h_limit = h_needed > hin ? hin : h_needed; - int w_even = ((w_limit - 1) >> 1) << 1; - int h_even = ((h_limit - 1) >> 1) << 1; - int w_unroll_size = (w_even >> 3) << 3; - int w_unroll_remain = w_even - w_unroll_size; - int w_remain = w_needed - w_limit; - int h_remain = h_needed - h_limit; - int w_in_2 = win << 1; - const float coef = 1.f / 9.f; - const float coef_6 = exclusive ? 
1.f / 6.f : coef; - float32x4_t vcoef = vdupq_n_f32(coef); - float32x4_t vcoef_6 = vdupq_n_f32(coef_6); - for (int n = 0; n < num; ++n) { - float* dout_batch = dout + n * chout * size_channel_out; - const float* din_batch = din + n * chin * size_channel_in; -#pragma omp parallel for - for (int c = 0; c < chout; c++) { - float* dout_ch = dout_batch + c * size_channel_out; - const float* din_ch = din_batch + c * size_channel_in; - const float* r0 = din_ch; - const float* r1 = r0 + win; - const float* r2 = r1 + win; - // w = w_in - 8; - float* dr_out = dout_ch; - const float* dr0 = r0; - const float* dr1 = r1; - const float* dr2 = r2; - - float32x4_t vzero = vdupq_n_f32(0.f); - - int h = 0; - for (; h < h_even; h += 2) { -// LOG(INFO) << "h: " << h <<", dr0:" << r0 << ", dr1: " << r1 << -// ",dr2: " <> 3; - int cnt_num_remain = w_unroll_remain >> 1; - // LOG(INFO) << "cnt_num: " << cnt_num << " cnt_num_remain: " << - // cnt_num_remain; - if (cnt_num > 0 || cnt_num_remain > 0) { - asm volatile( - "cmp %[cnt_num], #0 @cmp cnt_num, 0\n" - "ble 3f @ble exit\n" - "s3_ave_loop_mid_p0: @main loop\n" - "vld1.f32 {d0-d3}, [%[dr0]]! @load d0-d5, dr0\n" - "vld1.f32 {d6-d9}, [%[dr1]]! @load d4-d7, dr1\n" - "vld1.f32 {d12-d15}, [%[dr2]]! @load d4-d7, dr2\n" - "vld1.f32 {d4}, [%[dr0]]! @load d0-d5, dr0\n" - "vld1.f32 {d10}, [%[dr1]]! @load d4-d7, dr1\n" - "vld1.f32 {d16}, [%[dr2]]! @load d4-d7, dr2\n" - "vadd.f32 q9, q0, q3 @max q0,q0,q2\n" - "vadd.f32 q10, q1, q4 @max q1,q1,q3\n" - "vadd.f32 d22, d4, d10 @max q1,q1,q3\n" - "vadd.f32 q6, q9, q6 @max q0,q0,q2 1234\n" - "vadd.f32 q7, q10, q7 @max q1,q1,q3 5678\n" - "vadd.f32 d16, d22, d16 @max q1,q1,q3 9101112\n" - //"vmov.f32 s7,s6 @mov s7, s6\n" - "vext.f32 q0, q6, q7, #1 @vext max_2345\n" - "vext.f32 q1, q6, q7, #3 @vext max_4567\n" - "vext.f32 q2, q6, q7, #2 @vext max_3456\n" - "vext.f32 q3, q7, q8, #1 @vext max_6789\n" - "vadd.f32 q4, q6, q0 @add 1234, 2345\n" - "vadd.f32 q5, q7, q1 @add 5678, 4567\n" - "vadd.f32 q4, q4, q2 @add 3456, sum1\n" - "vadd.f32 q5, q5, q3 @add 6789, sum2\n" - "vmov.f32 s17, s18 @mov\n" - "vmov.f32 s18, s21 @mov\n" - "vmov.f32 s19, s23 @mov\n" - "vmul.f32 q4, q4, %q[vcoef] @mul\n" - "sub %[dr0], #8 @add w,8\n" - "sub %[dr1], #8 @add w,8\n" - "sub %[dr2], #8 @add w,8\n" - "subs %[cnt_num], #1 @cnt_num--\n" - "vst1.f32 d8, [%[dr_out]]! @vst1 d0,dr_out\n" - "vst1.f32 d9, [%[dr_out]]! @vst1 d0,dr_out\n" - "bne s3_ave_loop_mid_p0 @bne s3_max_loop_mid\n" - "3: @loop\n" - "cmp %[cnt_num_remain], #0 @cmp cnt_num_remain,0\n" - "ble 4f @ble exit1\n" - "s3_ave_loop_mid_1_p0: @mid loop\n" - "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d1,dr0\n" - "vld1.f32 {d2-d3}, [%[dr1]]! @load d2-d3,dr1\n" - "vld1.f32 {d4-d5}, [%[dr2]]! @load d2-d3,dr1\n" - "vext.f32 q0, %q[vzero], q0, #3 @ext v0_0123\n" - "vext.f32 q1, %q[vzero], q1, #3 @ext v1_0123\n" - "vext.f32 q2, %q[vzero], q2, #3 @ext v1_0123\n" - "vadd.f32 q0, q0, q1 @add q0,q0,q1\n" - "vadd.f32 q0, q0, q2 @add q0,q0,q1\n" - "vpadd.f32 d0, d0, d1 @padd d0,d0,d1\n" - "vpadd.f32 d0, d0, d0 @padd d0,d0,d0\n" - "vmul.f32 d0, d0, %e[vcoef] @mul\n" - "sub %[dr0], #8 @add w,6\n" - "sub %[dr1], #8 @add w,6\n" - "sub %[dr2], #8 @add w,6\n" - "subs %[cnt_num_remain], #1 @cnt_num_remain--\n" - "vst1.f32 d0[0], [%[dr_out]]! 
@vst d0[0],dr_out\n" - "bne s3_ave_loop_mid_1_p0 @bne s3_max_loop_mid_1\n" - "4: @exit\n" - : [dr0] "+r"(dr0), - [dr1] "+r"(dr1), - [dr2] "+r"(dr2), - [dr_out] "+r"(dr_out), - [cnt_num] "+r"(cnt_num), - [cnt_num_remain] "+r"(cnt_num_remain), - [vcoef] "+w"(vcoef), - [vzero] "+w"(vzero) - : "r"(dr0), - "r"(dr1), - "r"(dr2), - "r"(dr_out), - "r"(cnt_num), - "r"(cnt_num_remain) - : "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12"); - } -#endif - if (w_remain > 0) { - // deal with right pad - int wstart = (w_even >> 1) * stride - padding; - int wend = std::min(std::min(wstart + kernel, win + padding), win); - float tmp1 = 0.f; - float tmp2 = exclusive ? 1.0f / (3.f * (wend - wstart)) : coef; - for (int i = wstart; i < wend; i++) { - tmp1 += (r0[i] + r1[i] + r2[i]); - } - dout_ch[w_even >> 1] = tmp1 * tmp2; - // cnt ++; - } - r0 = r2; - r1 = r0 + win; - r2 = r1 + win; - dout_ch += wout; - } - - if (h_remain > 0) { -// deal with bottom pad -// first row with zero pad -// int hstart = (h >> 1) * stride_h - pad_h; -// int hend = std::min(std::min(hstart + kernel_h, hin + padding_h), -// hin); data_out_channel[0] =(r0[0] + r0[1] + r0[2] + r1[0] + r1[1] + -// r1[2]) / 9.f; -#ifdef __aarch64__ - int w = 0; - int cnt = 0; - for (; w < w_unroll_size; w += 8) { - float32x4_t vr0_1234 = vld1q_f32(&r0[w]); - float32x4_t vr0_5678 = vld1q_f32(&r0[w + 4]); - float32x4_t vr0_9101112 = vld1q_f32(&r0[w + 8]); - float32x4_t vr1_1234 = vld1q_f32(&r1[w]); - float32x4_t vr1_5678 = vld1q_f32(&r1[w + 4]); - float32x4_t vr1_9101112 = vld1q_f32(&r1[w + 8]); - - float32x4_t vsum_1234 = vaddq_f32(vr0_1234, vr1_1234); - float32x4_t vsum_5678 = vaddq_f32(vr0_5678, vr1_5678); - float32x4_t vsum_9101112 = vaddq_f32(vr0_9101112, vr1_9101112); - float32x4_t vsum_2345 = vextq_f32(vsum_1234, vsum_5678, 1); - float32x4_t vsum_3456 = vextq_f32(vsum_1234, vsum_5678, 2); - float32x4_t vsum_4567 = vextq_f32(vsum_1234, vsum_5678, 3); - float32x4_t vsum_6789 = vextq_f32(vsum_5678, vsum_9101112, 1); - float32x4_t vsum_123_345 = vaddq_f32(vsum_1234, vsum_2345); - vsum_123_345 = vaddq_f32(vsum_123_345, vsum_3456); - float32x4_t vsum_567_789 = vaddq_f32(vsum_4567, vsum_5678); - vsum_567_789 = vaddq_f32(vsum_567_789, vsum_6789); - vsum_123_345 = - vsetq_lane_f32(vgetq_lane_f32(vsum_123_345, 2), vsum_123_345, 1); - vsum_123_345 = - vsetq_lane_f32(vgetq_lane_f32(vsum_567_789, 1), vsum_123_345, 2); - vsum_123_345 = - vsetq_lane_f32(vgetq_lane_f32(vsum_567_789, 3), vsum_123_345, 3); - float32x4_t vrst = vmulq_f32(vsum_123_345, vcoef_6); - vst1q_f32(&dout_ch[cnt], vrst); - cnt += 4; - } - for (; w < w_even; w += 2) { - float32x4_t vr0 = vld1q_f32(&r0[w]); - float32x4_t vr1 = vld1q_f32(&r1[w]); - vr0 = vsetq_lane_f32(0.f, vr0, 3); - vr1 = vsetq_lane_f32(0.f, vr1, 3); - float32x4_t vsum1 = vaddq_f32(vr0, vr1); - float32x2_t vsum2 = - vpadd_f32(vget_low_f32(vsum1), vget_high_f32(vsum1)); - vsum2 = vpadd_f32(vsum2, vsum2); - float32x2_t vrst = vmul_f32(vsum2, vget_low_f32(vcoef_6)); - dout_ch[cnt] = vget_lane_f32(vrst, 0); - cnt++; - } -#else - dr_out = dout_ch; // + 1; - dr0 = r0; // (r0 + 1); - dr1 = r1; // (r1 + 1); - int cnt_num = w_unroll_size >> 3; - int cnt_num_remain = w_unroll_remain >> 1; - // LOG(INFO) << "cnt_num: " << cnt_num << " cnt_num_remain: " << - // cnt_num_remain; - if (cnt_num > 0 || cnt_num_remain > 0) { - asm volatile( - "cmp %[cnt_num], #0 @cmp cnt_num,0\n" - "ble 2f @ble exit\n" - "1: @main loop\n" - "vld1.f32 {d0-d3}, [%[dr0]]! 
@load d0-d5,dr0\n" - "vld1.f32 {d6-d9}, [%[dr1]]! @load d4-d7,dr1\n" - "vld1.f32 {d4}, [%[dr0]]! @load d0-d3,dr0\n" - "vld1.f32 {d10}, [%[dr1]]! @load d4-d7,dr1\n" - "vadd.f32 q6, q0, q3 @max q0,q0,q2 1234\n" - "vadd.f32 q7, q1, q4 @max q1,q1,q3 5678\n" - "vadd.f32 d16, d4, d10 @max q1,q1,q3 9101112\n" - //"vmov.f32 s7,s6 @mov s7, s6\n" - "vext.f32 q0, q6, q7, #1 @vext max_2345\n" - "vext.f32 q1, q6, q7, #3 @vext max_4567\n" - "vext.f32 q2, q6, q7, #2 @vext max_3456\n" - "vext.f32 q3, q7, q8, #1 @vext max_6789\n" - "vadd.f32 q4, q6, q0 @add 1234,2345\n" - "vadd.f32 q5, q7, q1 @add 5678,4567\n" - "vadd.f32 q4, q4, q2 @add 3456,sum1\n" - "vadd.f32 q5, q5, q3 @add 6789,sum2\n" - "vmov.f32 s17, s18 @mov\n" - "vmov.f32 s18, s21 @mov\n" - "vmov.f32 s19, s23 @mov\n" - "vmul.f32 q4, q4, %q[vcoef_6] @mul\n" - "sub %[dr0], #8 @add w,8\n" - "sub %[dr1], #8 @add w,8\n" - "subs %[cnt_num], #1 @cnt_num--\n" - "vst1.f32 d8, [%[dr_out]]! @vst1 d0,dr_out\n" - "vst1.f32 d9, [%[dr_out]]! @vst1 d0,dr_out\n" - "bne 1b @bne s3_max_loop_bot\n" - "2: @loop\n" - "cmp %[cnt_num_remain], #0 @cmp cnt_num_remain, 0\n" - "ble 3f @ble exit\n" - "4: @bot loop\n" - "vld1.f32 {d0-d1}, [%[dr0]]! @load d0-d1,dr0\n" - "vld1.f32 {d2-d3}, [%[dr1]]! @load d2-d3,dr1\n" - "vext.f32 q0, %q[vzero], q0, #3 @ext v0_0123\n" - "vext.f32 q1, %q[vzero], q1, #3 @ext v1_0123\n" - "vadd.f32 q0, q0, q1 @add q0,q0,q1\n" - "vpadd.f32 d0, d0, d1 @padd d0,d0,d1\n" - "vpadd.f32 d0, d0, d0 @padd d0,d0,d0\n" - "vmul.f32 d0, d0, %e[vcoef_6] @mul\n" - "sub %[dr0], #8 @add w,6\n" - "sub %[dr1], #8 @add w,6\n" - "subs %[cnt_num_remain], #1 @cnt_num_remain--\n" - "vst1.f32 d0[0], [%[dr_out]]! @vst d0[0],dr_out\n" - "bne 4b @bne s3_max_loop_bot_1\n" - "3: @exit\n" - : [dr0] "+r"(dr0), - [dr1] "+r"(dr1), - [dr_out] "+r"(dr_out), - [cnt_num] "+r"(cnt_num), - [cnt_num_remain] "+r"(cnt_num_remain), - [vcoef_6] "+w"(vcoef_6), - [vzero] "+w"(vzero) - : "r"(dr0), - "r"(dr1), - "r"(dr_out), - "r"(cnt_num), - "r"(cnt_num_remain) - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9"); - } - -#endif - if (w_remain > 0) { - // deal with right pad - int wstart = (w_even >> 1) * stride - padding; - int wend = std::min(std::min(wstart + kernel, win + padding), win); - float tmp1 = 0.f; - float tmp2 = exclusive ? 1.0f / (2.f * (wend - wstart)) : coef; - for (int i = wstart; i < wend; i++) { // only run 1 or 2 times - tmp1 += (r0[i] + r1[i]); - } - dout_ch[w_even >> 1] = tmp1 * tmp2; - } - } - } - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/pooling.h b/lite/backends/arm/math/pooling.h deleted file mode 100644 index 8fc9e0c4e0..0000000000 --- a/lite/backends/arm/math/pooling.h +++ /dev/null @@ -1,154 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
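// A minimal scalar reference for the 3x3, stride-2, pad-0 max pooling
// implemented above with NEON intrinsics and inline assembly. This is a
// sketch for cross-checking the vectorized kernels, not the tuned code:
// the "_ref" name is illustrative, parameter names mirror the kernel
// signature, and windows are clipped at the input border exactly as the
// w_remain/h_remain tails above handle them.
#include <algorithm>

static void pooling3x3s2p0_max_ref(const float* din, float* dout,
                                   int num, int ch, int hout, int wout,
                                   int hin, int win) {
  for (int n = 0; n < num; ++n) {
    for (int c = 0; c < ch; ++c) {
      const float* in = din + (n * ch + c) * hin * win;
      float* out = dout + (n * ch + c) * hout * wout;
      for (int oh = 0; oh < hout; ++oh) {
        for (int ow = 0; ow < wout; ++ow) {
          const int hs = oh * 2, ws = ow * 2;    // stride 2, no padding
          const int he = std::min(hs + 3, hin);  // 3x3 window, clipped
          const int we = std::min(ws + 3, win);  // at the input border
          float m = in[hs * win + ws];
          for (int h = hs; h < he; ++h) {
            for (int w = ws; w < we; ++w) {
              m = std::max(m, in[h * win + w]);
            }
          }
          out[oh * wout + ow] = m;
        }
      }
    }
  }
}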
-
-#pragma once
-
-#include <algorithm>
-#include <string>
-#include <vector>
-#include "lite/utils/cp_logging.h"
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-// !pooling fp32 Op
-void pooling_basic(const float* din,
-                   float* dout,
-                   int num,
-                   int chout,
-                   int hout,
-                   int wout,
-                   int chin,
-                   int hin,
-                   int win,
-                   const std::vector<int>& ksize,
-                   const std::vector<int>& strides,
-                   const std::vector<int>& paddings,
-                   bool global_pooling,
-                   bool exclusive,
-                   bool adaptive,
-                   bool ceil_mode,
-                   bool use_quantizer,
-                   const std::string& pooling_type);
-
-void pooling_global_max(const float* din,
-                        float* dout,
-                        int num,
-                        int chout,
-                        int hout,
-                        int wout,
-                        int chin,
-                        int hin,
-                        int win);
-
-void pooling_global_avg(const float* din,
-                        float* dout,
-                        int num,
-                        int chout,
-                        int hout,
-                        int wout,
-                        int chin,
-                        int hin,
-                        int win);
-
-void pooling2x2s2_max(const float* din,
-                      float* dout,
-                      int num,
-                      int chout,
-                      int hout,
-                      int wout,
-                      int chin,
-                      int hin,
-                      int win);
-
-void pooling2x2s2_avg(const float* din,
-                      float* dout,
-                      int num,
-                      int chout,
-                      int hout,
-                      int wout,
-                      int chin,
-                      int hin,
-                      int win,
-                      bool exclusive);
-
-void pooling3x3s1p1_max(const float* din,
-                        float* dout,
-                        int num,
-                        int chout,
-                        int hout,
-                        int wout,
-                        int chin,
-                        int hin,
-                        int win);
-
-void pooling3x3s1p1_avg(const float* din,
-                        float* dout,
-                        int num,
-                        int chout,
-                        int hout,
-                        int wout,
-                        int chin,
-                        int hin,
-                        int win,
-                        bool exclusive);
-
-void pooling3x3s2p1_max(const float* din,
-                        float* dout,
-                        int num,
-                        int chout,
-                        int hout,
-                        int wout,
-                        int chin,
-                        int hin,
-                        int win);
-
-void pooling3x3s2p1_avg(const float* din,
-                        float* dout,
-                        int num,
-                        int chout,
-                        int hout,
-                        int wout,
-                        int chin,
-                        int hin,
-                        int win,
-                        bool exclusive);
-
-void pooling3x3s2p0_max(const float* din,
-                        float* dout,
-                        int num,
-                        int chout,
-                        int hout,
-                        int wout,
-                        int chin,
-                        int hin,
-                        int win);
-
-void pooling3x3s2p0_avg(const float* din,
-                        float* dout,
-                        int num,
-                        int chout,
-                        int hout,
-                        int wout,
-                        int chin,
-                        int hin,
-                        int win,
-                        bool exclusive);
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/power.cc b/lite/backends/arm/math/power.cc
deleted file mode 100644
index 752c63d917..0000000000
--- a/lite/backends/arm/math/power.cc
+++ /dev/null
@@ -1,96 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
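// The power op implemented just below computes out[i] = (in[i] * scale +
// shift) ^ power, with per-stage fast paths that skip the multiply, add, or
// pow when the corresponding parameter is the identity. A scalar sketch of
// the same contract (the NEON path below processes 16 floats per iteration):
#include <cmath>

static void power_ref(const float* din, float* dout, int num,
                      float scale, float shift, float power) {
  const bool do_scale = std::fabs(scale - 1.f) >= 1e-6f;
  const bool do_shift = std::fabs(shift - 0.f) >= 1e-6f;
  const bool do_power = std::fabs(power - 1.f) >= 1e-6f;
  for (int i = 0; i < num; ++i) {
    float v = din[i];
    if (do_scale) v *= scale;
    if (do_shift) v += shift;
    dout[i] = do_power ? std::pow(v, power) : v;
  }
}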
- -#include "lite/backends/arm/math/power.h" -#include "lite/backends/arm/math/funcs.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template <> -void power(const float* din, - float* dout, - const int num, - float scale_, - float shift_, - float power_) { - int cnt = num >> 4; - int remain = num % 16; - bool _do_power = true; - bool _do_scale = true; - bool _do_shift = true; - if (fabsf(power_ - 1.f) < 1e-6f) { - _do_power = false; - } - if (fabsf(scale_ - 1.f) < 1e-6f) { - _do_scale = false; - } - if (fabsf(shift_ - 0.f) < 1e-6f) { - _do_shift = false; - } - float* ptr_out = dout; - const float* ptr_in = din; - float32x4_t vscale = vdupq_n_f32(scale_); - float32x4_t vshift = vdupq_n_f32(shift_); - float32x4_t vpower = vdupq_n_f32(power_); -#pragma omp parallel for - for (int nums = 0; nums < cnt; ++nums) { - float32x4_t vr0 = vld1q_f32(ptr_in); - ptr_in += 4; - float32x4_t vr1 = vld1q_f32(ptr_in); - ptr_in += 4; - float32x4_t vr2 = vld1q_f32(ptr_in); - ptr_in += 4; - float32x4_t vr3 = vld1q_f32(ptr_in); - ptr_in += 4; - if (_do_scale) { - vr0 = vmulq_f32(vr0, vscale); - vr1 = vmulq_f32(vr1, vscale); - vr2 = vmulq_f32(vr2, vscale); - vr3 = vmulq_f32(vr3, vscale); - } - if (_do_shift) { - vr0 = vaddq_f32(vr0, vshift); - vr1 = vaddq_f32(vr1, vshift); - vr2 = vaddq_f32(vr2, vshift); - vr3 = vaddq_f32(vr3, vshift); - } - if (_do_power) { - vr0 = pow_ps(vr0, vpower); - vr1 = pow_ps(vr1, vpower); - vr2 = pow_ps(vr2, vpower); - vr3 = pow_ps(vr3, vpower); - } - vst1q_f32(ptr_out, vr0); - ptr_out += 4; - vst1q_f32(ptr_out, vr1); - ptr_out += 4; - vst1q_f32(ptr_out, vr2); - ptr_out += 4; - vst1q_f32(ptr_out, vr3); - ptr_out += 4; - } - for (int j = 0; j < remain; ++j) { - ptr_out[0] = std::pow((ptr_in[0] * scale_ + shift_), power_); - ptr_in++; - ptr_out++; - } -} - -} /* namespace math */ -} /* namespace arm */ -} /* namespace lite */ -} /* namespace paddle */ diff --git a/lite/backends/arm/math/power.h b/lite/backends/arm/math/power.h deleted file mode 100644 index 7b9074918d..0000000000 --- a/lite/backends/arm/math/power.h +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template -void power(const T* din, - T* dout, - const int num, - float scale_, - float shift_, - float power_); - -} /* namespace math */ -} /* namespace arm */ -} /* namespace lite */ -} /* namespace paddle */ diff --git a/lite/backends/arm/math/prior_box.cc b/lite/backends/arm/math/prior_box.cc deleted file mode 100644 index f262e6e1d7..0000000000 --- a/lite/backends/arm/math/prior_box.cc +++ /dev/null @@ -1,362 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/arm/math/prior_box.h" -#include -#include "lite/backends/arm/math/funcs.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -const int MALLOC_ALIGN = 64; - -void* fast_malloc(size_t size) { - size_t offset = sizeof(void*) + MALLOC_ALIGN - 1; - char* p = static_cast(malloc(offset + size)); - - if (!p) { - return nullptr; - } - - void* r = reinterpret_cast(reinterpret_cast(p + offset) & - (~(MALLOC_ALIGN - 1))); - static_cast(r)[-1] = p; - memset(r, 0, size); - return r; -} - -void fast_free(void* ptr) { - if (ptr) { - free(static_cast(ptr)[-1]); - } -} - -void density_prior_box(const lite::Tensor* input, - const lite::Tensor* image, - lite::Tensor** boxes, - lite::Tensor** variances, - const std::vector& min_size_, - const std::vector& fixed_size_, - const std::vector& fixed_ratio_, - const std::vector& density_size_, - const std::vector& max_size_, - const std::vector& aspect_ratio_, - const std::vector& variance_, - int img_w_, - int img_h_, - float step_w_, - float step_h_, - float offset_, - int prior_num_, - bool is_flip_, - bool is_clip_, - const std::vector& order_) { - // compute output shape - int win1 = input->dims()[3]; - int hin1 = input->dims()[2]; - DDim shape_out({hin1, win1, prior_num_, 4}); - (*boxes)->Resize(shape_out); - (*variances)->Resize(shape_out); - - float* _cpu_data = (*boxes)->mutable_data(); - float* _variance_data = (*variances)->mutable_data(); - - const int width = win1; - const int height = hin1; - int img_width = img_w_; - int img_height = img_h_; - if (img_width == 0 || img_height == 0) { - img_width = image->dims()[3]; - img_height = image->dims()[2]; - } - float step_w = step_w_; - float step_h = step_h_; - if (step_w == 0 || step_h == 0) { - step_w = static_cast(img_width) / width; - step_h = static_cast(img_height) / height; - } - float offset = offset_; - int step_average = static_cast((step_w + step_h) * 0.5); // add - int channel_size = height * width * prior_num_ * 4; - int idx = 0; - for (int h = 0; h < height; ++h) { - for (int w = 0; w < width; ++w) { - float center_x = (w + offset) * step_w; - float center_y = (h + offset) * step_h; - float box_width; - float box_height; - if (fixed_size_.size() > 0) { - // add - for (int s = 0; s < fixed_size_.size(); ++s) { - int fixed_size = fixed_size_[s]; - int com_idx = 0; - box_width = fixed_size; - box_height = fixed_size; - - if (fixed_ratio_.size() > 0) { - for (int r = 0; r < fixed_ratio_.size(); ++r) { - float ar = fixed_ratio_[r]; - int density = density_size_[s]; - int shift = step_average / density; - float box_width_ratio = fixed_size_[s] * sqrt(ar); - float box_height_ratio = fixed_size_[s] / sqrt(ar); - - for (int p = 0; p < density; ++p) { - for (int c = 0; c < density; ++c) { - float center_x_temp = - center_x - step_average / 2.0f + shift / 2.f + c * shift; - float center_y_temp = - center_y - step_average / 2.0f + shift / 2.f + p * shift; - // xmin - _cpu_data[idx++] = - (center_x_temp - box_width_ratio / 2.f) / img_width >= 0 - ? 
(center_x_temp - box_width_ratio / 2.f) / img_width - : 0; - // ymin - _cpu_data[idx++] = - (center_y_temp - box_height_ratio / 2.f) / img_height >= 0 - ? (center_y_temp - box_height_ratio / 2.f) / - img_height - : 0; - // xmax - _cpu_data[idx++] = - (center_x_temp + box_width_ratio / 2.f) / img_width <= 1 - ? (center_x_temp + box_width_ratio / 2.f) / img_width - : 1; - // ymax - _cpu_data[idx++] = - (center_y_temp + box_height_ratio / 2.f) / img_height <= 1 - ? (center_y_temp + box_height_ratio / 2.f) / - img_height - : 1; - } - } - } - } else { - // this code for density anchor box - if (density_size_.size() > 0) { - CHECK_EQ(fixed_size_.size(), density_size_.size()) - << "fixed_size_ should be same with density_size_"; - int density = density_size_[s]; - int shift = fixed_size_[s] / density; - - for (int r = 0; r < density; ++r) { - for (int c = 0; c < density; ++c) { - float center_x_temp = - center_x - fixed_size / 2.f + shift / 2.f + c * shift; - float center_y_temp = - center_y - fixed_size / 2.f + shift / 2.f + r * shift; - // xmin - _cpu_data[idx++] = - (center_x_temp - box_width / 2.f) / img_width >= 0 - ? (center_x_temp - box_width / 2.f) / img_width - : 0; - // ymin - _cpu_data[idx++] = - (center_y_temp - box_height / 2.f) / img_height >= 0 - ? (center_y_temp - box_height / 2.f) / img_height - : 0; - // xmax - _cpu_data[idx++] = - (center_x_temp + box_width / 2.f) / img_width <= 1 - ? (center_x_temp + box_width / 2.f) / img_width - : 1; - // ymax - _cpu_data[idx++] = - (center_y_temp + box_height / 2.f) / img_height <= 1 - ? (center_y_temp + box_height / 2.f) / img_height - : 1; - } - } - } - - // rest of priors: will never come here!!! - for (int r = 0; r < aspect_ratio_.size(); ++r) { - float ar = aspect_ratio_[r]; - - if (fabs(ar - 1.) < 1e-6) { - continue; - } - - int density = density_size_[s]; - int shift = fixed_size_[s] / density; - float box_width_ratio = fixed_size_[s] * sqrt(ar); - float box_height_ratio = fixed_size_[s] / sqrt(ar); - - for (int p = 0; p < density; ++p) { - for (int c = 0; c < density; ++c) { - float center_x_temp = - center_x - fixed_size / 2.f + shift / 2.f + c * shift; - float center_y_temp = - center_y - fixed_size / 2.f + shift / 2.f + p * shift; - // xmin - _cpu_data[idx++] = - (center_x_temp - box_width_ratio / 2.f) / img_width >= 0 - ? (center_x_temp - box_width_ratio / 2.f) / img_width - : 0; - // ymin - _cpu_data[idx++] = - (center_y_temp - box_height_ratio / 2.f) / img_height >= 0 - ? (center_y_temp - box_height_ratio / 2.f) / - img_height - : 0; - // xmax - _cpu_data[idx++] = - (center_x_temp + box_width_ratio / 2.f) / img_width <= 1 - ? (center_x_temp + box_width_ratio / 2.f) / img_width - : 1; - // ymax - _cpu_data[idx++] = - (center_y_temp + box_height_ratio / 2.f) / img_height <= 1 - ? (center_y_temp + box_height_ratio / 2.f) / - img_height - : 1; - } - } - } - } - } - } else { - float* min_buf = - reinterpret_cast(fast_malloc(sizeof(float) * 4)); - float* max_buf = - reinterpret_cast(fast_malloc(sizeof(float) * 4)); - float* com_buf = reinterpret_cast( - fast_malloc(sizeof(float) * aspect_ratio_.size() * 4)); - - for (int s = 0; s < min_size_.size(); ++s) { - int min_idx = 0; - int max_idx = 0; - int com_idx = 0; - int min_size = min_size_[s]; - // first prior: aspect_ratio = 1, size = min_size - box_width = box_height = min_size; - //! xmin - min_buf[min_idx++] = (center_x - box_width / 2.f) / img_width; - //! ymin - min_buf[min_idx++] = (center_y - box_height / 2.f) / img_height; - //! 
xmax - min_buf[min_idx++] = (center_x + box_width / 2.f) / img_width; - //! ymax - min_buf[min_idx++] = (center_y + box_height / 2.f) / img_height; - - if (max_size_.size() > 0) { - int max_size = max_size_[s]; - //! second prior: aspect_ratio = 1, size = sqrt(min_size * max_size) - box_width = box_height = sqrtf(min_size * max_size); - //! xmin - max_buf[max_idx++] = (center_x - box_width / 2.f) / img_width; - //! ymin - max_buf[max_idx++] = (center_y - box_height / 2.f) / img_height; - //! xmax - max_buf[max_idx++] = (center_x + box_width / 2.f) / img_width; - //! ymax - max_buf[max_idx++] = (center_y + box_height / 2.f) / img_height; - } - - //! rest of priors - for (int r = 0; r < aspect_ratio_.size(); ++r) { - float ar = aspect_ratio_[r]; - if (fabs(ar - 1.) < 1e-6) { - continue; - } - box_width = min_size * sqrt(ar); - box_height = min_size / sqrt(ar); - //! xmin - com_buf[com_idx++] = (center_x - box_width / 2.f) / img_width; - //! ymin - com_buf[com_idx++] = (center_y - box_height / 2.f) / img_height; - //! xmax - com_buf[com_idx++] = (center_x + box_width / 2.f) / img_width; - //! ymax - com_buf[com_idx++] = (center_y + box_height / 2.f) / img_height; - } - memcpy(_cpu_data + idx, min_buf, sizeof(float) * min_idx); - idx += min_idx; - memcpy(_cpu_data + idx, com_buf, sizeof(float) * com_idx); - idx += com_idx; - memcpy(_cpu_data + idx, max_buf, sizeof(float) * max_idx); - idx += max_idx; - } - fast_free(min_buf); - fast_free(max_buf); - fast_free(com_buf); - } - } - } - //! clip the prior's coordinate such that it is within [0, 1] - if (is_clip_) { - for (int d = 0; d < channel_size; ++d) { - _cpu_data[d] = std::min(std::max(_cpu_data[d], 0.f), 1.f); - } - } - //! set the variance. - int count = 0; - for (int h = 0; h < height; ++h) { - for (int w = 0; w < width; ++w) { - for (int i = 0; i < prior_num_; ++i) { - for (int j = 0; j < 4; ++j) { - _variance_data[count] = variance_[j]; - ++count; - } - } - } - } -} - -void prior_box(const lite::Tensor* input, - const lite::Tensor* image, - lite::Tensor** boxes, - lite::Tensor** variances, - const std::vector& min_size, - const std::vector& max_size, - const std::vector& aspect_ratio, - const std::vector& variance, - int img_w, - int img_h, - float step_w, - float step_h, - float offset, - int prior_num, - bool is_flip, - bool is_clip, - const std::vector& order) { - density_prior_box(input, - image, - boxes, - variances, - min_size, - std::vector(), - std::vector(), - std::vector(), - max_size, - aspect_ratio, - variance, - img_w, - img_h, - step_w, - step_h, - offset, - prior_num, - is_flip, - is_clip, - order); -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/prior_box.h b/lite/backends/arm/math/prior_box.h deleted file mode 100644 index ffa821b75e..0000000000 --- a/lite/backends/arm/math/prior_box.h +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
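// Both generators above emit each prior as normalized (xmin, ymin, xmax,
// ymax) around a cell center ((w + offset) * step_w, (h + offset) * step_h).
// A sketch of that corner math for one box of size (bw, bh): the density
// branch clamps inline as shown here, while the min/max-size branch defers
// to the later is_clip_ pass. Names are local to this example.
#include <algorithm>

static void prior_corners(float cx, float cy, float bw, float bh,
                          int img_w, int img_h, float out[4]) {
  out[0] = std::max((cx - 0.5f * bw) / img_w, 0.f);  // xmin
  out[1] = std::max((cy - 0.5f * bh) / img_h, 0.f);  // ymin
  out[2] = std::min((cx + 0.5f * bw) / img_w, 1.f);  // xmax
  out[3] = std::min((cy + 0.5f * bh) / img_h, 1.f);  // ymax
}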
- -#pragma once - -#include -#include -#include "lite/core/op_lite.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void density_prior_box(const lite::Tensor* input, - const lite::Tensor* image, - lite::Tensor** boxes, - lite::Tensor** variances, - const std::vector& min_size_, - const std::vector& fixed_size_, - const std::vector& fixed_ratio_, - const std::vector& density_size_, - const std::vector& max_size_, - const std::vector& aspect_ratio_, - const std::vector& variance_, - int img_w_, - int img_h_, - float step_w_, - float step_h_, - float offset_, - int prior_num_, - bool is_flip_, - bool is_clip_, - const std::vector& order_); - -void prior_box(const lite::Tensor* input, - const lite::Tensor* image, - lite::Tensor** boxes, - lite::Tensor** variances, - const std::vector& min_size, - const std::vector& max_size, - const std::vector& aspect_ratio, - const std::vector& variance, - int img_w, - int img_h, - float step_w, - float step_h, - float offset, - int prior_num, - bool is_flip, - bool is_clip, - const std::vector& order); - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/reduce_max.cc b/lite/backends/arm/math/reduce_max.cc deleted file mode 100644 index 5c75960d72..0000000000 --- a/lite/backends/arm/math/reduce_max.cc +++ /dev/null @@ -1,207 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "lite/backends/arm/math/reduce_max.h" -#include "lite/backends/arm/math/funcs.h" -#include "lite/core/tensor.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template <> -void reduce_n(const float* src, - float* dst, - int num_in, - int channel_in, - int height_in, - int width_in) { - int hw_size = height_in * width_in; - int chw_size = channel_in * hw_size; - int data_index, src_index, src_index0; - for (int c = 0; c < channel_in; ++c) { - for (int h = 0; h < height_in; ++h) { - for (int w = 0; w < width_in; ++w) { - data_index = c * hw_size + h * width_in + w; - dst[data_index] = src[data_index]; - for (int n = 1; n < num_in; ++n) { - src_index = n * chw_size + data_index; - dst[data_index] = dst[data_index] > src[src_index] ? dst[data_index] - : src[src_index]; - } - } - } - } -} - -template <> -void reduce_c(const float* src, - float* dst, - int num_in, - int channel_in, - int height_in, - int width_in) { - int hw_size = height_in * width_in; - int chw_size = hw_size * channel_in; - int data_index, src_index0, src_index; - for (int n = 0; n < num_in; ++n) { - for (int h = 0; h < height_in; ++h) { - for (int w = 0; w < width_in; ++w) { - data_index = n * hw_size + h * width_in + w; - src_index0 = n * chw_size + h * width_in + w; - dst[data_index] = src[src_index0]; - for (int c = 1; c < channel_in; ++c) { - src_index = src_index0 + c * hw_size; - dst[data_index] = dst[data_index] > src[src_index] ? 
dst[data_index] - : src[src_index]; - } - } - } - } -} - -template <> -void reduce_h(const float* src, - float* dst, - int num_in, - int channel_in, - int height_in, - int width_in) { - int cw_size = channel_in * width_in; - int chw_size = cw_size * height_in; - int hw_size = height_in * width_in; - int data_index, src_index, src_index0; - for (int n = 0; n < num_in; ++n) { - for (int c = 0; c < channel_in; ++c) { - for (int w = 0; w < width_in; ++w) { - data_index = n * cw_size + c * width_in + w; - src_index0 = n * chw_size + c * hw_size + w; - dst[data_index] = src[src_index0]; - for (int h = 1; h < height_in; ++h) { - src_index = src_index0 + h * width_in; - dst[data_index] = dst[data_index] > src[src_index] ? dst[data_index] - : src[src_index]; - } - } - } - } -} - -template <> -void reduce_w(const float* src, - float* dst, - int num_in, - int channel_in, - int height_in, - int width_in) { - int ch_size = channel_in * height_in; - int hw_size = height_in * width_in; - int chw_size = ch_size * width_in; - int data_index = 0; - int src_index0 = 0; - int src_index = 0; - for (int n = 0; n < num_in; ++n) { - for (int c = 0; c < channel_in; ++c) { - for (int h = 0; h < height_in; ++h) { - data_index = n * ch_size + c * height_in + h; - src_index0 = n * chw_size + c * hw_size + h * width_in; - dst[data_index] = src[src_index0]; - for (int w = 1; w < width_in; ++w) { - src_index = src_index0 + w; - dst[data_index] = dst[data_index] > src[src_index] ? dst[data_index] - : src[src_index]; - } - } - } - } -} - -template <> -void reduce_all(const float* src, - float* dst, - int num_in, - int channel_in, - int height_in, - int width_in) { - float max = src[0]; - int src_index; - int n_id, c_id; - for (int n = 0; n < num_in; ++n) { - n_id = n * channel_in * height_in * width_in; - for (int c = 0; c < channel_in; ++c) { - c_id = c * height_in * width_in; - for (int h = 0; h < height_in; ++h) { - for (int w = 0; w < width_in; ++w) { - src_index = n_id + c_id + h * width_in + w; - max = src[src_index] > max ? src[src_index] : max; - } - } - } - } - dst[0] = max; -} - -template <> -void reduce_nc(const float* src, - float* dst, - int num_in, - int channel_in, - int height_in, - int width_in) { - // reduce n first. 
-  DDimLite ddimA({1, channel_in, height_in, width_in});
-  lite::Tensor tensor_tmp;
-  tensor_tmp.Resize(ddimA);
-  float* tmp_out = tensor_tmp.mutable_data<float>();
-  reduce_n(src, tmp_out, num_in, channel_in, height_in, width_in);
-  reduce_c(tmp_out, dst, 1, channel_in, height_in, width_in);
-}
-
-template <>
-void reduce_ch<float>(const float* src,
-                      float* dst,
-                      int num_in,
-                      int channel_in,
-                      int height_in,
-                      int width_in) {
-  // reduce c first
-  DDimLite ddimA({num_in, 1, height_in, width_in});
-  lite::Tensor tensor_tmp;
-  tensor_tmp.Resize(ddimA);
-  float* tmp_out = tensor_tmp.mutable_data<float>();
-  reduce_c(src, tmp_out, num_in, channel_in, height_in, width_in);
-  reduce_h(tmp_out, dst, num_in, 1, height_in, width_in);
-}
-
-template <>
-void reduce_hw<float>(const float* src,
-                      float* dst,
-                      int num_in,
-                      int channel_in,
-                      int height_in,
-                      int width_in) {
-  // reduce h first
-  DDimLite ddimA({num_in, channel_in, 1, width_in});
-  lite::Tensor tensor_tmp;
-  tensor_tmp.Resize(ddimA);
-  float* tmp_out = tensor_tmp.mutable_data<float>();
-  reduce_h(src, tmp_out, num_in, channel_in, height_in, width_in);
-  reduce_w(tmp_out, dst, num_in, channel_in, 1, width_in);
-}
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/reduce_max.h b/lite/backends/arm/math/reduce_max.h
deleted file mode 100644
index dab9626182..0000000000
--- a/lite/backends/arm/math/reduce_max.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-template <typename T>
-void reduce_n(const T* src,
-              T* dst,
-              int num_in,
-              int channel_in,
-              int height_in,
-              int width_in);
-
-template <typename T>
-void reduce_c(const T* src,
-              T* dst,
-              int num_in,
-              int channel_in,
-              int height_in,
-              int width_in);
-
-template <typename T>
-void reduce_h(const T* src,
-              T* dst,
-              int num_in,
-              int channel_in,
-              int height_in,
-              int width_in);
-
-template <typename T>
-void reduce_w(const T* src,
-              T* dst,
-              int num_in,
-              int channel_in,
-              int height_in,
-              int width_in);
-
-template <typename T>
-void reduce_nc(const T* src,
-               T* dst,
-               int num_in,
-               int channel_in,
-               int height_in,
-               int width_in);
-
-template <typename T>
-void reduce_ch(const T* src,
-               T* dst,
-               int num_in,
-               int channel_in,
-               int height_in,
-               int width_in);
-
-template <typename T>
-void reduce_hw(const T* src,
-               T* dst,
-               int num_in,
-               int channel_in,
-               int height_in,
-               int width_in);
-
-template <typename T>
-void reduce_all(const T* src,
-                T* dst,
-                int num_in,
-                int channel_in,
-                int height_in,
-                int width_in);
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/reduce_mean.cc b/lite/backends/arm/math/reduce_mean.cc
deleted file mode 100644
index 56104550d8..0000000000
--- a/lite/backends/arm/math/reduce_mean.cc
+++ /dev/null
@@ -1,204 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "lite/backends/arm/math/reduce_mean.h" -#include "lite/backends/arm/math/funcs.h" -#include "lite/core/tensor.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template <> -void reduce_mean_n(const float* src, - float* dst, - int num_in, - int channel_in, - int height_in, - int width_in) { - int hw_size = height_in * width_in; - int chw_size = channel_in * hw_size; - int data_index, src_index, src_index0; - for (int c = 0; c < channel_in; ++c) { - for (int h = 0; h < height_in; ++h) { - for (int w = 0; w < width_in; ++w) { - data_index = c * hw_size + h * width_in + w; - dst[data_index] = 0.0; - for (int n = 0; n < num_in; ++n) { - src_index = n * chw_size + data_index; - dst[data_index] += static_cast(src[src_index]) / num_in; - } - } - } - } -} - -template <> -void reduce_mean_c(const float* src, - float* dst, - int num_in, - int channel_in, - int height_in, - int width_in) { - int hw_size = height_in * width_in; - int chw_size = hw_size * channel_in; - int data_index, src_index0, src_index; - for (int n = 0; n < num_in; ++n) { - for (int h = 0; h < height_in; ++h) { - for (int w = 0; w < width_in; ++w) { - data_index = n * hw_size + h * width_in + w; - src_index0 = n * chw_size + h * width_in + w; - dst[data_index] = 0.0; - for (int c = 0; c < channel_in; ++c) { - src_index = src_index0 + c * hw_size; - dst[data_index] += static_cast(src[src_index]) / channel_in; - } - } - } - } -} - -template <> -void reduce_mean_h(const float* src, - float* dst, - int num_in, - int channel_in, - int height_in, - int width_in) { - int cw_size = channel_in * width_in; - int chw_size = cw_size * height_in; - int hw_size = height_in * width_in; - int data_index, src_index, src_index0; - for (int n = 0; n < num_in; ++n) { - for (int c = 0; c < channel_in; ++c) { - for (int w = 0; w < width_in; ++w) { - data_index = n * cw_size + c * width_in + w; - src_index0 = n * chw_size + c * hw_size + w; - dst[data_index] = 0.0; - for (int h = 0; h < height_in; ++h) { - src_index = src_index0 + h * width_in; - dst[data_index] += static_cast(src[src_index]) / height_in; - } - } - } - } -} - -template <> -void reduce_mean_w(const float* src, - float* dst, - int num_in, - int channel_in, - int height_in, - int width_in) { - int ch_size = channel_in * height_in; - int hw_size = height_in * width_in; - int chw_size = ch_size * width_in; - int data_index = 0; - int src_index0 = 0; - int src_index = 0; - for (int n = 0; n < num_in; ++n) { - for (int c = 0; c < channel_in; ++c) { - for (int h = 0; h < height_in; ++h) { - data_index = n * ch_size + c * height_in + h; - src_index0 = n * chw_size + c * hw_size + h * width_in; - dst[data_index] = 0.0; - for (int w = 0; w < width_in; ++w) { - src_index = src_index0 + w; - dst[data_index] += static_cast(src[src_index]) / width_in; - } - } - } - } -} - -template <> -void reduce_mean_all(const float* src, - float* dst, - int num_in, - int channel_in, - int height_in, - int width_in) { - float mean = 0.0; - int 
src_index;
-  int n_id, c_id;
-  int all = num_in * channel_in * height_in * width_in;
-  for (int n = 0; n < num_in; ++n) {
-    n_id = n * channel_in * height_in * width_in;
-    for (int c = 0; c < channel_in; ++c) {
-      c_id = c * height_in * width_in;
-      for (int h = 0; h < height_in; ++h) {
-        for (int w = 0; w < width_in; ++w) {
-          src_index = n_id + c_id + h * width_in + w;
-          mean += src[src_index] / all;
-        }
-      }
-    }
-  }
-  dst[0] = mean;
-}
-
-template <>
-void reduce_mean_nc<float>(const float* src,
-                           float* dst,
-                           int num_in,
-                           int channel_in,
-                           int height_in,
-                           int width_in) {
-  // reduce n first.
-  DDimLite ddimA({1, channel_in, height_in, width_in});
-  lite::Tensor tensor_tmp;
-  tensor_tmp.Resize(ddimA);
-  float* tmp_out = tensor_tmp.mutable_data<float>();
-  reduce_mean_n(src, tmp_out, num_in, channel_in, height_in, width_in);
-  reduce_mean_c(tmp_out, dst, 1, channel_in, height_in, width_in);
-}
-
-template <>
-void reduce_mean_ch<float>(const float* src,
-                           float* dst,
-                           int num_in,
-                           int channel_in,
-                           int height_in,
-                           int width_in) {
-  // reduce c first
-  DDimLite ddimA({num_in, 1, height_in, width_in});
-  lite::Tensor tensor_tmp;
-  tensor_tmp.Resize(ddimA);
-  float* tmp_out = tensor_tmp.mutable_data<float>();
-  reduce_mean_c(src, tmp_out, num_in, channel_in, height_in, width_in);
-  reduce_mean_h(tmp_out, dst, num_in, 1, height_in, width_in);
-}
-
-template <>
-void reduce_mean_hw<float>(const float* src,
-                           float* dst,
-                           int num_in,
-                           int channel_in,
-                           int height_in,
-                           int width_in) {
-  // reduce h first
-  DDimLite ddimA({num_in, channel_in, 1, width_in});
-  lite::Tensor tensor_tmp;
-  tensor_tmp.Resize(ddimA);
-  float* tmp_out = tensor_tmp.mutable_data<float>();
-  reduce_mean_h(src, tmp_out, num_in, channel_in, height_in, width_in);
-  reduce_mean_w(tmp_out, dst, num_in, channel_in, 1, width_in);
-}
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/reduce_mean.h b/lite/backends/arm/math/reduce_mean.h
deleted file mode 100644
index 277ed209c0..0000000000
--- a/lite/backends/arm/math/reduce_mean.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
*/ - -#pragma once - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template -void reduce_mean_n(const T* src, - T* dst, - int num_in, - int channel_in, - int height_in, - int width_in); - -template -void reduce_mean_c(const T* src, - T* dst, - int num_in, - int channel_in, - int height_in, - int width_in); - -template -void reduce_mean_h(const T* src, - T* dst, - int num_in, - int channel_in, - int height_in, - int width_in); - -template -void reduce_mean_w(const T* src, - T* dst, - int num_in, - int channel_in, - int height_in, - int width_in); - -template -void reduce_mean_nc(const T* src, - T* dst, - int num_in, - int channel_in, - int height_in, - int width_in); - -template -void reduce_mean_ch(const T* src, - T* dst, - int num_in, - int channel_in, - int height_in, - int width_in); - -template -void reduce_mean_hw(const T* src, - T* dst, - int num_in, - int channel_in, - int height_in, - int width_in); - -template -void reduce_mean_all(const T* src, - T* dst, - int num_in, - int channel_in, - int height_in, - int width_in); - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/saturate.h b/lite/backends/arm/math/saturate.h deleted file mode 100644 index 833f0f5c1c..0000000000 --- a/lite/backends/arm/math/saturate.h +++ /dev/null @@ -1,320 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template -static inline _Tp saturate_cast(uint8_t v) { - return _Tp(v); -} -/** @overload */ -template -static inline _Tp saturate_cast(int8_t v) { - return _Tp(v); -} -/** @overload */ -template -static inline _Tp saturate_cast(uint16_t v) { - return _Tp(v); -} -/** @overload */ -template -static inline _Tp saturate_cast(int16_t v) { - return _Tp(v); -} -/** @overload */ -template -static inline _Tp saturate_cast(uint32_t v) { - return _Tp(v); -} -/** @overload */ -template -static inline _Tp saturate_cast(int32_t v) { - return _Tp(v); -} -/** @overload */ -template -static inline _Tp saturate_cast(float v) { - return _Tp(v); -} -/** @overload */ -template -static inline _Tp saturate_cast(double v) { - return _Tp(v); -} -/** @overload */ -template -static inline _Tp saturate_cast(int64_t v) { - return _Tp(v); -} -/** @overload */ -template -static inline _Tp saturate_cast(uint64_t v) { - return _Tp(v); -} - -template <> -inline uint8_t saturate_cast(int8_t v) { - return static_cast(std::max(static_cast(v), 0)); -} - -template <> -inline uint8_t saturate_cast(uint16_t v) { - return static_cast(std::min((unsigned)v, (unsigned)UCHAR_MAX)); -} - -template <> -inline uint8_t saturate_cast(int v) { - return static_cast( - ((unsigned)v <= UCHAR_MAX ? v : v > 0 ? 
UCHAR_MAX : 0)); -} - -template <> -inline uint8_t saturate_cast(int16_t v) { - return saturate_cast(static_cast(v)); -} - -template <> -inline uint8_t saturate_cast(unsigned v) { - return static_cast(std::min(v, (unsigned)UCHAR_MAX)); -} -template <> -inline uint8_t saturate_cast(float v) { - int iv = static_cast(roundf(v)); - return saturate_cast(iv); -} -template <> -inline uint8_t saturate_cast(double v) { - int iv = static_cast(round(v)); - return saturate_cast(iv); -} -template <> -inline uint8_t saturate_cast(int64_t v) { - return static_cast( - ((uint64_t)v <= (uint64_t)UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0)); -} -template <> -inline uint8_t saturate_cast(uint64_t v) { - return static_cast(std::min(v, (uint64_t)UCHAR_MAX)); -} - -template <> -inline int8_t saturate_cast(uint8_t v) { - return static_cast(std::min(static_cast(v), SCHAR_MAX)); -} -template <> -inline int8_t saturate_cast(uint16_t v) { - return static_cast(std::min((unsigned)v, (unsigned)SCHAR_MAX)); -} -template <> -inline int8_t saturate_cast(int v) { - return static_cast(((unsigned)(v - SCHAR_MIN) <= (unsigned)UCHAR_MAX - ? v - : v > 0 ? SCHAR_MAX : SCHAR_MIN)); -} -template <> -inline int8_t saturate_cast(int16_t v) { - return saturate_cast(static_cast(v)); -} -template <> -inline int8_t saturate_cast(unsigned v) { - return static_cast(std::min(v, (unsigned)SCHAR_MAX)); -} -template <> -inline int8_t saturate_cast(float v) { - int iv = static_cast(roundf(v)); - return saturate_cast(iv); -} -template <> -inline int8_t saturate_cast(double v) { - int iv = static_cast(round(v)); - return saturate_cast(iv); -} -template <> -inline int8_t saturate_cast(int64_t v) { - return static_cast( - ((uint64_t)(static_cast(v) - SCHAR_MIN) <= (uint64_t)UCHAR_MAX - ? v - : v > 0 ? SCHAR_MAX : SCHAR_MIN)); -} -template <> -inline int8_t saturate_cast(uint64_t v) { - return static_cast(std::min(v, (uint64_t)SCHAR_MAX)); -} - -template <> -inline uint16_t saturate_cast(int8_t v) { - return static_cast(std::max(static_cast(v), 0)); -} - -template <> -inline uint16_t saturate_cast(int16_t v) { - return static_cast(std::max(static_cast(v), 0)); -} -template <> -inline uint16_t saturate_cast(int v) { - return static_cast( - (unsigned)v <= (unsigned)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); -} -template <> -inline uint16_t saturate_cast(unsigned v) { - return static_cast(std::min(v, (unsigned)USHRT_MAX)); -} -template <> -inline uint16_t saturate_cast(float v) { - int iv = static_cast(roundf(v)); - return saturate_cast(iv); -} -template <> -inline uint16_t saturate_cast(double v) { - int iv = static_cast(round(v)); - return saturate_cast(iv); -} -template <> -inline uint16_t saturate_cast(int64_t v) { - return static_cast( - (uint64_t)v <= (uint64_t)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); -} -template <> -inline uint16_t saturate_cast(uint64_t v) { - return static_cast(std::min(v, (uint64_t)USHRT_MAX)); -} - -template <> -inline int16_t saturate_cast(uint16_t v) { - return static_cast(std::min(static_cast(v), SHRT_MAX)); -} -template <> -inline int16_t saturate_cast(int v) { - return static_cast((unsigned)(v - SHRT_MIN) <= (unsigned)USHRT_MAX - ? v - : v > 0 ? 
SHRT_MAX : SHRT_MIN); -} -template <> -inline int16_t saturate_cast(unsigned v) { - return (int16_t)std::min(v, (unsigned)SHRT_MAX); -} -template <> -inline int16_t saturate_cast(float v) { - int iv = static_cast(roundf(v)); - return saturate_cast(iv); -} -template <> -inline int16_t saturate_cast(double v) { - int iv = static_cast(round(v)); - return saturate_cast(iv); -} -template <> -inline int16_t saturate_cast(int64_t v) { - return static_cast((uint64_t)((int64_t)v - SHRT_MIN) <= - (uint64_t)USHRT_MAX - ? v - : v > 0 ? SHRT_MAX : SHRT_MIN); -} -template <> -inline int16_t saturate_cast(uint64_t v) { - return static_cast(std::min(v, (uint64_t)SHRT_MAX)); -} - -template <> -inline int saturate_cast(unsigned v) { - return static_cast(std::min(v, (unsigned)INT_MAX)); -} -template <> -inline int saturate_cast(int64_t v) { - return static_cast((uint64_t)(v - INT_MIN) <= (uint64_t)UINT_MAX - ? v - : v > 0 ? INT_MAX : INT_MIN); -} -template <> -inline int saturate_cast(uint64_t v) { - return static_cast(std::min(v, (uint64_t)INT_MAX)); -} -template <> -inline int saturate_cast(float v) { - return static_cast(roundf(v)); -} -template <> -inline int saturate_cast(double v) { - return static_cast(round(v)); -} - -template <> -inline unsigned saturate_cast(int8_t v) { - return static_cast(std::max(v, static_cast(0))); -} -template <> -inline unsigned saturate_cast(int16_t v) { - return static_cast(std::max(v, (int16_t)0)); -} -template <> -inline unsigned saturate_cast(int v) { - return static_cast(std::max(v, static_cast(0))); -} -template <> -inline unsigned saturate_cast(int64_t v) { - return static_cast( - (uint64_t)v <= (uint64_t)UINT_MAX ? v : v > 0 ? UINT_MAX : 0); -} -template <> -inline unsigned saturate_cast(uint64_t v) { - return static_cast(std::min(v, (uint64_t)UINT_MAX)); -} -// we intentionally do not clip negative numbers, to make -1 become 0xffffffff -// etc. -template <> -inline unsigned saturate_cast(float v) { - return static_cast(roundf(v)); -} -template <> -inline unsigned saturate_cast(double v) { - return static_cast(round(v)); -} - -template <> -inline uint64_t saturate_cast(int8_t v) { - return static_cast(std::max(v, static_cast(0))); -} - -template <> -inline uint64_t saturate_cast(int16_t v) { - return static_cast(std::max(v, (int16_t)0)); -} -template <> -inline uint64_t saturate_cast(int v) { - return static_cast(std::max(v, static_cast(0))); -} -template <> -inline uint64_t saturate_cast(int64_t v) { - return static_cast(std::max(v, (int64_t)0)); -} - -template <> -inline int64_t saturate_cast(uint64_t v) { - return static_cast(std::min(v, (uint64_t)LLONG_MAX)); -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/scale.cc b/lite/backends/arm/math/scale.cc deleted file mode 100644 index 7f2169a645..0000000000 --- a/lite/backends/arm/math/scale.cc +++ /dev/null @@ -1,177 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
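Note: `saturate_cast` in the hunk above follows the OpenCV convention — floating-point inputs are rounded to nearest, then clamped into the target integral range, which is exactly the behaviour int8 quantisation kernels rely on. A self-contained sketch of the float-to-int8 path (hypothetical helper name, written standalone so it compiles without the deleted header):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Sketch: round-to-nearest, then clamp to the int8_t range [-128, 127].
static int8_t saturate_to_int8(float v) {
  int iv = static_cast<int>(std::round(v));  // round first
  iv = std::min(127, std::max(-128, iv));    // then clamp
  return static_cast<int8_t>(iv);
}

int main() {
  printf("%d %d %d\n",
         saturate_to_int8(300.7f),   // -> 127  (clamped)
         saturate_to_int8(-1.4f),    // -> -1   (rounded)
         saturate_to_int8(-200.f));  // -> -128 (clamped)
}
```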
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/arm/math/scale.h" -#include "lite/backends/arm/math/funcs.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template <> -void scale( - const float* din, float* dout, int num, float scale, float bias) { - int cnt = num >> 4; - int remain = num % 16; - float32x4_t vscale = vdupq_n_f32(scale); - float32x4_t vbias = vdupq_n_f32(bias); -#pragma omp parallel for - for (int i = 0; i < cnt; i++) { - const float* din_ptr = din + (i << 4); - float* dout_ptr = dout + (i << 4); - - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); - float32x4_t din2 = vld1q_f32(din_ptr + 8); - float32x4_t din3 = vld1q_f32(din_ptr + 12); - - float32x4_t vsum1 = vmlaq_f32(vbias, din0, vscale); - float32x4_t vsum2 = vmlaq_f32(vbias, din1, vscale); - float32x4_t vsum3 = vmlaq_f32(vbias, din2, vscale); - float32x4_t vsum4 = vmlaq_f32(vbias, din3, vscale); - - vst1q_f32(dout_ptr, vsum1); - vst1q_f32(dout_ptr + 4, vsum2); - vst1q_f32(dout_ptr + 8, vsum3); - vst1q_f32(dout_ptr + 12, vsum4); - } - if (remain > 0) { - const float* din_ptr = din + (cnt << 4); - float* dout_ptr = dout + (cnt << 4); - for (int i = 0; i < remain; i++) { - *dout_ptr = *din_ptr * scale + bias; - dout_ptr++; - din_ptr++; - } - } -} - -template <> -void scale(const float* din, - float* dout, - int outer_dim, - int scale_dim, - int inner_dim, - const float* scale_data, - const float* bias_data) { - int cnt = inner_dim >> 4; - int remain = inner_dim % 16; - int size = inner_dim * scale_dim; - for (int n = 0; n < outer_dim; n++) { - const float* din_ptr_n = din + n * size; - float* dout_ptr_n = dout + n * size; -#pragma omp parallel for - for (int i = 0; i < scale_dim; i++) { - const float* din_ptr = din_ptr_n + i * inner_dim; - float* dout_ptr = dout_ptr_n + i * inner_dim; - float scale = scale_data[i]; - float32x4_t vscale = vdupq_n_f32(scale); - float bias = bias_data[i]; - float32x4_t vbias = vdupq_n_f32(bias); - for (int j = 0; j < cnt; j++) { - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); - float32x4_t din2 = vld1q_f32(din_ptr + 8); - float32x4_t din3 = vld1q_f32(din_ptr + 12); - - float32x4_t vsum1 = vmlaq_f32(vbias, din0, vscale); - float32x4_t vsum2 = vmlaq_f32(vbias, din1, vscale); - float32x4_t vsum3 = vmlaq_f32(vbias, din2, vscale); - float32x4_t vsum4 = vmlaq_f32(vbias, din3, vscale); - - din_ptr += 16; - vst1q_f32(dout_ptr, vsum1); - vst1q_f32(dout_ptr + 4, vsum2); - vst1q_f32(dout_ptr + 8, vsum3); - vst1q_f32(dout_ptr + 12, vsum4); - - dout_ptr += 16; - } - for (int j = 0; j < remain; j++) { - *dout_ptr = *din_ptr * scale + bias; - dout_ptr++; - din_ptr++; - } - } - } -} - -template <> -void scale(const float* din, - float* dout, - int outer_dim, - int scale_dim, - const float* scale_data, - const float* bias_data) { - int cnt = scale_dim >> 4; - int remain = scale_dim % 16; - for (int n = 0; n < outer_dim; n++) { - const float* din_ptr_n = din + n * scale_dim; - float* dout_ptr_n = dout + n * scale_dim; -#pragma omp parallel for - for (int i = 0; i < cnt; i++) { - int idx = i << 4; - const float* din_ptr = din_ptr_n + idx; - const float* scale_ptr = scale_data + idx; - const float* bias_ptr = bias_data + idx; - float* dout_ptr = dout_ptr_n + idx; - - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t vscale0 = vld1q_f32(scale_ptr); - float32x4_t vbias0 = vld1q_f32(bias_ptr); - - float32x4_t din1 = 
vld1q_f32(din_ptr + 4);
-      float32x4_t vscale1 = vld1q_f32(scale_ptr + 4);
-      float32x4_t vbias1 = vld1q_f32(bias_ptr + 4);
-
-      float32x4_t din2 = vld1q_f32(din_ptr + 8);
-      float32x4_t vscale2 = vld1q_f32(scale_ptr + 8);
-      float32x4_t vbias2 = vld1q_f32(bias_ptr + 8);
-
-      float32x4_t vsum1 = vmlaq_f32(vbias0, din0, vscale0);
-      float32x4_t vsum2 = vmlaq_f32(vbias1, din1, vscale1);
-
-      float32x4_t din3 = vld1q_f32(din_ptr + 12);
-      float32x4_t vscale3 = vld1q_f32(scale_ptr + 12);
-      float32x4_t vbias3 = vld1q_f32(bias_ptr + 12);
-
-      vst1q_f32(dout_ptr, vsum1);
-      vst1q_f32(dout_ptr + 4, vsum2);
-
-      float32x4_t vsum3 = vmlaq_f32(vbias2, din2, vscale2);
-      float32x4_t vsum4 = vmlaq_f32(vbias3, din3, vscale3);
-
-      vst1q_f32(dout_ptr + 8, vsum3);
-      vst1q_f32(dout_ptr + 12, vsum4);
-    }
-    int idx = cnt << 4;
-    const float* din_ptr = din_ptr_n + idx;
-    float* dout_ptr = dout_ptr_n + idx;
-    const float* scale_ptr = scale_data + idx;
-    const float* bias_ptr = bias_data + idx;
-    for (int j = 0; j < remain; j++) {
-      *dout_ptr = *din_ptr * (*scale_ptr) + (*bias_ptr);
-      dout_ptr++;
-      din_ptr++;
-      scale_ptr++;
-      bias_ptr++;
-    }
-  }
-}
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/scale.h b/lite/backends/arm/math/scale.h
deleted file mode 100644
index a86528c9df..0000000000
--- a/lite/backends/arm/math/scale.h
+++ /dev/null
@@ -1,45 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-template <typename T>
-void scale(const T* din, T* dout, int num, float scale, float bias);
-
-template <typename T>
-void scale(const T* din,
-           T* dout,
-           int outer_dim,
-           int scale_dim,
-           int inner_dim,
-           const float* scale_data,
-           const float* bias_data);
-
-template <typename T>
-void scale(const T* din,
-           T* dout,
-           int outer_dim,
-           int scale_dim,
-           const float* scale_data,
-           const float* bias_data);
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/sequence2batch.h b/lite/backends/arm/math/sequence2batch.h
deleted file mode 100644
index d982ad6667..0000000000
--- a/lite/backends/arm/math/sequence2batch.h
+++ /dev/null
@@ -1,210 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
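Note: all three `scale` overloads above compute `dout[i] = din[i] * scale + bias`; the NEON bodies just do it sixteen lanes per iteration (four `vmlaq_f32` multiply-accumulates on q registers) and fall back to a scalar loop for the `num % 16` tail. A scalar reference sketch of the broadcast variant:

```cpp
// Sketch: scalar reference for the vectorised scale kernel; the NEON path
// computes the same dout[i] = din[i] * scale + bias, and uses this scalar
// form as its tail path.
void scale_ref(const float* din, float* dout, int num, float scale,
               float bias) {
  for (int i = 0; i < num; ++i) dout[i] = din[i] * scale + bias;
}
```

The second and third overloads differ only in reading per-channel or per-element `scale_data`/`bias_data` instead of broadcast scalars.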
- -#pragma once - -#include -#include -#include "lite/core/tensor.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template -class CopyMatrixRowsFunctor { - public: - // If is_src_index is true, - // copy the indexed rows of input src to the output dst. - // If is_src_index is false, - // copy the input src to the indexed rows of output dst. - // The indexed rows are based on the input index. - void operator()(const Tensor& src, - std::vector index_lod, - Tensor* dst, - bool is_src_index) { - auto index = index_lod.data(); - auto src_dims = src.dims(); - auto dst_dims = dst->dims(); - CHECK_EQ(src_dims.size(), 2UL) << "The src must be matrix with rank 2."; - CHECK_EQ(dst_dims.size(), 2UL) << "The dst must be matrix with rank 2."; - CHECK_EQ(src_dims[1], dst_dims[1]) - << "The width of src and dst must be same."; - auto height = dst_dims[0]; - auto width = dst_dims[1]; - auto* src_data = src.data(); - auto* dst_data = dst->mutable_data(); - const int sz = width * sizeof(T); - if (is_src_index) { - for (int i = 0; i < height; ++i) { - TargetCopy(TARGET(kARM), - dst_data + i * width, - src_data + index[i] * width, - sz); - } - } else { - for (int i = 0; i < height; ++i) { - TargetCopy(TARGET(kARM), - dst_data + index[i] * width, - src_data + i * width, - sz); - } - } - } -}; - -template -class LoDTensor2BatchFunctor { - // Calculate the length of each sequence and - // sort sequence index by the length. - // example: sequences = {s0, s1, s2} - // s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2 - // seq_info[3] = {(4, 5, 1), (0, 4, 0), (9, 3, 2)} - // - struct SeqInfo { - SeqInfo(int start, int length, int seq_idx) - : start(start), length(length), seq_idx(seq_idx) {} - int start; - int length; - int seq_idx; - }; - - public: - void operator()(const Tensor& lod_tensor, - Tensor* batch, - bool is_cal_batch_lod, - bool is_reverse = false) const { - if (!is_cal_batch_lod) { - auto lods = batch->lod(); - CHECK_GT(lods.size(), 2UL) - << "The LoD of LoDTensor should inlcude at least 2-level " - "sequence information."; - CHECK_EQ(lods[1].size(), static_cast(lod_tensor.dims()[0])) - << "The LoD information should be consistent with the dims."; - CopyMatrixRowsFunctor to_batch; - to_batch(lod_tensor, lods[1], batch, true); - return; - } - - auto lods = lod_tensor.lod(); - CHECK_EQ(lods.size(), 1UL) << "Only support one level sequence now."; - - const auto& lod = lods[0]; - - std::vector seq_info; - for (size_t seq_id = 0; seq_id < lod.size() - 1; ++seq_id) { - int length = lod[seq_id + 1] - lod[seq_id]; - seq_info.emplace_back(lod[seq_id], length, seq_id); - } - - std::sort(seq_info.begin(), seq_info.end(), [](SeqInfo a, SeqInfo b) { - return a.length > b.length; - }); - - // Calculate the start position of each batch. - // example: sequences = {s0, s1, s2} - // s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2 - // max_seqlen = 5, - // batchIndex = {b0, b1, b2, b3, b4} - // b0: 1 0 2, b1: 1 0 2, b2: 1 0 2, b3: 1 0, b4: 1 - // batch_start_positions[6] = {0, 3, 6, 9, 11, 12} - // batch_start_positions[0] = len(b0) - // batch_start_positions[1] = len(b0) + len(b1) - // batch_start_positions[2] = len(b0) + len(b1) + len(b2) - // ... - // seq2batch_idx[12] = {4, 0, 9, - // 5, 1, 10, - // 6, 2, 11, - // 7, 3, - // 8} - // seq_order = {1, 0, 2}, the sort order. - // where 1 is the second sequence, - // 0 is the first sequence, - // 2 is the third sequence. - // The max_seqlen represents batch size after rearranging the - // input LodTensor. It is also the maximum length of input sequence. 
- - LoD batch_lods; - batch_lods.emplace_back(std::vector{0}); - batch_lods.emplace_back(std::vector{0}); - batch_lods.emplace_back(std::vector{0}); - - // batch_lods[0] is the start positions for batch LoDTensor - int max_seqlen = seq_info[0].length; - batch_lods[0].resize(static_cast(max_seqlen + 1)); - // batch_lods[1] is the raw index in the input LoDTensor - batch_lods[1].resize(static_cast(lod_tensor.dims()[0])); - // batch_lods[2] is the sort order for the input LoDTensor. - batch_lods[2].resize(seq_info.size()); - - auto batch_starts = batch_lods[0].data(); - auto seq2batch_idx = batch_lods[1].data(); - batch_starts[0] = 0; - for (int n = 0; n < max_seqlen; n++) { - auto batch_id = static_cast(batch_starts[n]); - for (size_t i = 0; i < seq_info.size(); ++i) { - int seq_len = seq_info[i].length; - int start = seq_info[i].start; - if (n < seq_len) { - seq2batch_idx[batch_id] = - is_reverse ? start + seq_len - 1 - n : start + n; - batch_id++; - } else { - break; - } - } - batch_starts[n + 1] = static_cast(batch_id); - } - auto seq_order = batch_lods[2].data(); - for (size_t i = 0; i < seq_info.size(); ++i) { - seq_order[i] = seq_info[i].seq_idx; - } - *(batch->mutable_lod()) = batch_lods; - - CopyMatrixRowsFunctor to_batch; - to_batch(lod_tensor, batch_lods[1], batch, true); - } -}; - -template -class Batch2LoDTensorFunctor { - public: - void operator()(const Tensor& batch, Tensor* lod_tensor) const { - auto in_lod = batch.lod(); - CHECK_GT(in_lod.size(), 2UL) - << "The LoD of LoDTensor should inlcude at least 2-level " - "sequence information."; - CHECK_EQ(in_lod[1].size(), static_cast(lod_tensor->dims()[0])) - << "The LoD information should be consistent with the dims."; - CopyMatrixRowsFunctor to_seq; - to_seq(batch, in_lod[1], lod_tensor, false); - } -}; - -template -inline void ReorderInitState(const Tensor& src, - const std::vector& index_lod, - Tensor* dst, - bool indexed_src) { - CopyMatrixRowsFunctor row_shuffle; - dst->Resize(src.dims()); - dst->mutable_data(); - row_shuffle(src, index_lod, dst, indexed_src); -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/sequence_expand.cc b/lite/backends/arm/math/sequence_expand.cc deleted file mode 100644 index 63a2e91793..0000000000 --- a/lite/backends/arm/math/sequence_expand.cc +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
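Note: the index construction documented in the long `LoDTensor2BatchFunctor` comment above can be checked with a few lines of plain C++; for `lod = {0, 4, 9, 12}` this prints the documented `seq2batch_idx` order `4 0 9 5 1 10 6 2 11 7 3 8` (illustrative sketch, not the library code):

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

// Sketch: sort sequences by descending length, then emit one element from
// each still-active sequence per batch step, as the functor above does.
int main() {
  std::vector<int> lod = {0, 4, 9, 12};  // s0 len 4, s1 len 5, s2 len 3
  struct Seq { int start, length; };
  std::vector<Seq> seqs;
  for (size_t i = 0; i + 1 < lod.size(); ++i)
    seqs.push_back({lod[i], lod[i + 1] - lod[i]});
  std::sort(seqs.begin(), seqs.end(),
            [](const Seq& a, const Seq& b) { return a.length > b.length; });
  int max_seqlen = seqs[0].length;
  for (int n = 0; n < max_seqlen; ++n)   // one batch step per iteration
    for (const Seq& s : seqs)            // longest sequences first
      if (n < s.length) printf("%d ", s.start + n);
  printf("\n");
}
```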
- -#include "lite/backends/arm/math/sequence_expand.h" -#include -#include -#include "lite/core/tensor.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template <> -void SequenceExpandImpl(const float* x_data, - const LoD& x_lod, - int width, - const std::vector& ref_lod, - lite::Tensor* output) { - float* output_data = output->mutable_data(); - if (x_lod.size() == 0) { - for (int i = 0; i < ref_lod.size() - 1; i++) { - for (int j = ref_lod[i]; j < ref_lod[i + 1]; j++) { - memcpy( - output_data + j * width, x_data + i * width, sizeof(float) * width); - } - } - (output->mutable_lod())->push_back(ref_lod); - } else { - std::vector out_lod; - out_lod.push_back(0); - uint64_t out_offset = 0; - uint64_t len = 0; - for (int i = 0; i < ref_lod.size() - 1; i++) { - auto x_seq_len = x_lod[0][i + 1] - x_lod[0][i]; - for (int j = ref_lod[i]; j < ref_lod[i + 1]; j++) { - memcpy(output_data + out_offset * width, - x_data + len * width, - width * sizeof(float) * x_seq_len); - out_offset += x_seq_len; - out_lod.push_back(out_offset); - } - len += x_seq_len; - } - (output->mutable_lod())->push_back(out_lod); - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/sequence_expand.h b/lite/backends/arm/math/sequence_expand.h deleted file mode 100644 index d3b19a4c62..0000000000 --- a/lite/backends/arm/math/sequence_expand.h +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include "lite/core/tensor.h" - -#pragma once - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template -void SequenceExpandImpl(const T* x_data, - const LoD& x_lod, - int width, - const std::vector& ref_lod, - lite::Tensor* output); - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/sequence_pool.cc b/lite/backends/arm/math/sequence_pool.cc deleted file mode 100644 index b8f9ab0a1a..0000000000 --- a/lite/backends/arm/math/sequence_pool.cc +++ /dev/null @@ -1,224 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
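Note: in `SequenceExpandImpl` above, when the input carries no LoD each input row `i` is copied once for every output position in `[ref_lod[i], ref_lod[i+1])`; with an input LoD, whole sub-sequences are repeated instead. A sketch of the simpler branch (standalone, assuming the usual `uint64_t` LoD offsets):

```cpp
#include <cstdint>
#include <cstring>
#include <vector>

// Sketch: row i of x is replicated (ref_lod[i+1] - ref_lod[i]) times
// into the output buffer, each row being `width` floats.
void expand_rows(const float* x, int width,
                 const std::vector<uint64_t>& ref_lod, float* out) {
  for (size_t i = 0; i + 1 < ref_lod.size(); ++i)
    for (uint64_t j = ref_lod[i]; j < ref_lod[i + 1]; ++j)
      std::memcpy(out + j * width, x + i * width, sizeof(float) * width);
}
```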
- -#include "lite/backends/arm/math/sequence_pool.h" -#include -#include -#include -#include -#include "lite/backends/arm/math/funcs.h" -#include "lite/core/op_registry.h" -#include "lite/core/tensor.h" -#include "lite/core/type_system.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template <> -void seq_pool_sum(const float* din, - float* dout, - const std::vector lod, - int64_t width) { - for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { - const float* din_ptr = din + lod[i] * width; - float* dout_ptr = dout + i * width; - int64_t height = static_cast(lod[i + 1] - lod[i]); - if (width == 1) { - float sum = 0.f; - for (int h = 0; h < height; ++h) { - sum += din_ptr[h]; - } - *dout_ptr = sum; - } else { - memcpy(dout_ptr, din_ptr, width * sizeof(float)); - din_ptr += width; - height = height - 1; - for (int h = 0; h < height; h++) { - for (int w = 0; w < width; ++w) { - dout_ptr[w] += din_ptr[w]; - } - din_ptr += width; - } - } - } -} - -template <> -void seq_pool_average(const float* din, - float* dout, - const std::vector lod, - int64_t width) { - for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { - const float* din_ptr = din + lod[i] * width; - float* dout_ptr = dout + i * width; - int64_t height = static_cast(lod[i + 1] - lod[i]); - if (height > 0) { - if (width == 1) { - float sum = 0.f; - for (int h = 0; h < height; ++h) { - sum += din_ptr[h]; - } - *dout_ptr = sum / height; - } else { - memcpy(dout_ptr, din_ptr, width * sizeof(float)); - din_ptr += width; - int remain_h = height - 1; - for (int h = 0; h < remain_h; h++) { - for (int w = 0; w < width; ++w) { - dout_ptr[w] += din_ptr[w]; - } - din_ptr += width; - } - for (int w = 0; w < width; ++w) { - dout_ptr[w] /= height; - } - } - } - } -} - -template <> -void seq_pool_sqrt(const float* din, - float* dout, - const std::vector lod, - int64_t width) { - for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { - const float* din_ptr = din + lod[i] * width; - float* dout_ptr = dout + i * width; - int64_t height = static_cast(lod[i + 1] - lod[i]); - if (height > 0) { - float sqrt_len = sqrtf(height); - if (width == 1) { - float sum = 0.f; - for (int h = 0; h < height; ++h) { - sum += din_ptr[h]; - } - *dout_ptr = sum / sqrt_len; - } else { - memcpy(dout_ptr, din_ptr, width * sizeof(float)); - din_ptr += width; - int remain_h = height - 1; - for (int h = 0; h < remain_h; h++) { - for (int w = 0; w < width; ++w) { - dout_ptr[w] += din_ptr[w]; - } - din_ptr += width; - } - for (int w = 0; w < width; ++w) { - dout_ptr[w] /= sqrt_len; - } - } - } - } -} - -template <> -void seq_pool_max(const float* din, - float* dout, - const std::vector lod, - int64_t width) { - for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { - const float* din_ptr = din + lod[i] * width; - float* dout_ptr = dout + i * width; - int64_t height = static_cast(lod[i + 1] - lod[i]); - if (height > 0) { - if (width == 1) { - float max = -std::numeric_limits::max(); - for (int h = 0; h < height; ++h) { - max = std::max(max, din_ptr[h]); - } - *dout_ptr = max; - } else { - memcpy(dout_ptr, din_ptr, width * sizeof(float)); - din_ptr += width; - int remain_h = height - 1; - for (int h = 0; h < remain_h; h++) { - for (int w = 0; w < width; w++) { - dout_ptr[w] = std::max(dout_ptr[w], din_ptr[w]); - } - din_ptr += width; - } - } - } - } -} - -template <> -void seq_pool_min(const float* din, - float* dout, - const std::vector lod, - int64_t width) { - for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { - const float* din_ptr = din 
+ lod[i] * width; - float* dout_ptr = dout + i * width; - int64_t height = static_cast(lod[i + 1] - lod[i]); - if (height > 0) { - if (width == 1) { - float min = std::numeric_limits::max(); - for (int h = 0; h < height; ++h) { - min = std::min(min, din_ptr[h]); - } - *dout_ptr = min; - } else { - memcpy(dout_ptr, din_ptr, width * sizeof(float)); - din_ptr += width; - int remain_h = height - 1; - for (int h = 0; h < remain_h; h++) { - for (int w = 0; w < width; w++) { - dout_ptr[w] = std::min(dout_ptr[w], din_ptr[w]); - } - din_ptr += width; - } - } - } - } -} - -template <> -void seq_pool_first(const float* din, - float* dout, - const std::vector lod, - int64_t width) { - for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { - int64_t height = lod[i + 1] - lod[i]; - const float* din_ptr = din + width * lod[i]; - float* dout_ptr = dout + i * width; - if (height > 0) { - memcpy(dout_ptr, din_ptr, width * sizeof(float)); - } - } -} - -template <> -void seq_pool_last(const float* din, - float* dout, - const std::vector lod, - int64_t width) { - for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { - int64_t height = lod[i + 1] - lod[i]; - int64_t seq_len = static_cast(lod[i + 1] - lod[0]); - const float* din_ptr = din + width * seq_len; - float* dout_ptr = dout + i * width; - if (height > 0) { - memcpy(dout_ptr, din_ptr - width, width * sizeof(float)); - } - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/sequence_pool.h b/lite/backends/arm/math/sequence_pool.h deleted file mode 100644 index 6cbcd7d6d6..0000000000 --- a/lite/backends/arm/math/sequence_pool.h +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include "lite/core/tensor.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template -void seq_pool_sum(const T* din, - T* dout, - const std::vector lod, - int64_t width); - -template -void seq_pool_average(const T* din, - T* dout, - const std::vector lod, - int64_t width); - -template -void seq_pool_sqrt(const T* din, - T* dout, - const std::vector lod, - int64_t width); - -template -void seq_pool_max(const T* din, - T* dout, - const std::vector lod, - int64_t width); - -template -void seq_pool_min(const T* din, - T* dout, - const std::vector lod, - int64_t width); - -template -void seq_pool_first(const T* din, - T* dout, - const std::vector lod, - int64_t width); - -template -void seq_pool_last(const T* din, - T* dout, - const std::vector lod, - int64_t width); - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/sequence_softmax.cc b/lite/backends/arm/math/sequence_softmax.cc deleted file mode 100644 index fcbb1a353d..0000000000 --- a/lite/backends/arm/math/sequence_softmax.cc +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
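Note: every pooling mode in the hunk above walks the same structure — output row `i` aggregates input rows `[lod[i], lod[i+1])` of width `width`, with a fast path for `width == 1` and a memcpy of the first row to seed the accumulator. A scalar sketch of the average mode (illustrative, mirroring `seq_pool_average`):

```cpp
#include <cstdint>
#include <cstring>
#include <vector>

// Sketch: copy the first row of each sequence, accumulate the remaining
// rows element-wise, then divide by the sequence height.
void seq_avg_ref(const float* din, float* dout,
                 const std::vector<uint64_t>& lod, int64_t width) {
  for (size_t i = 0; i + 1 < lod.size(); ++i) {
    const float* in = din + lod[i] * width;
    float* out = dout + i * width;
    int64_t height = lod[i + 1] - lod[i];
    if (height <= 0) continue;
    std::memcpy(out, in, width * sizeof(float));
    for (int64_t h = 1; h < height; ++h)
      for (int64_t w = 0; w < width; ++w) out[w] += in[h * width + w];
    for (int64_t w = 0; w < width; ++w) out[w] /= height;
  }
}
```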
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/backends/arm/math/sequence_softmax.h"
-#include <algorithm>
-#include <cmath>
-#include "lite/utils/cp_logging.h"
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-bool sequence_softmax(const float* input,
-                      const std::vector<int>& seq_offset,
-                      float* out,
-                      Context<TARGET(kARM)>* ctx) {
-  int seq_num = seq_offset.size() - 1;
-  for (int i = 0; i < seq_num; i++) {
-    float seq_max = input[seq_offset[i]];
-    float exp_sum = 0.f;
-    for (int j = seq_offset[i]; j < seq_offset[i + 1]; j++) {
-      seq_max = std::max(seq_max, input[j]);
-    }
-    for (int j = seq_offset[i]; j < seq_offset[i + 1]; j++) {
-      exp_sum += expf(input[j] - seq_max);
-    }
-    for (int j = seq_offset[i]; j < seq_offset[i + 1]; j++) {
-      out[j] = expf(input[j] - seq_max) / exp_sum;
-    }
-  }
-  return true;
-}
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/sequence_softmax.h b/lite/backends/arm/math/sequence_softmax.h
deleted file mode 100644
index 2923039b0c..0000000000
--- a/lite/backends/arm/math/sequence_softmax.h
+++ /dev/null
@@ -1,34 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <vector>
-#include "lite/core/context.h"
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-bool sequence_softmax(const float* input,
-                      const std::vector<int>& seq_offset,
-                      float* out,
-                      Context<TARGET(kARM)>* ctx);
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/sgemm.cc b/lite/backends/arm/math/sgemm.cc
deleted file mode 100644
index 93f64445e2..0000000000
--- a/lite/backends/arm/math/sgemm.cc
+++ /dev/null
@@ -1,68 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
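Note: `sequence_softmax` above subtracts the per-sequence max before exponentiating — the standard trick to keep `expf` in range. A standalone reference of the same three passes (hypothetical helper name):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Sketch: per-sequence softmax in three passes — max, exp-sum, normalise.
void seq_softmax_ref(const float* in, const std::vector<uint64_t>& offset,
                     float* out) {
  for (size_t i = 0; i + 1 < offset.size(); ++i) {
    float mx = in[offset[i]];
    for (uint64_t j = offset[i]; j < offset[i + 1]; ++j)
      mx = std::max(mx, in[j]);
    float sum = 0.f;
    for (uint64_t j = offset[i]; j < offset[i + 1]; ++j)
      sum += std::exp(in[j] - mx);
    for (uint64_t j = offset[i]; j < offset[i + 1]; ++j)
      out[j] = std::exp(in[j] - mx) / sum;
  }
}
```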
- -#include "lite/backends/arm/math/sgemm.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void sgemm(bool is_transA, - bool is_transB, - int M, - int N, - int K, - float alpha, - const float* A, - int lda, - const float* B, - int ldb, - float beta, - float* C, - int ldc, - const float* bias, - bool is_bias, - bool is_relu, - ARMContext* ctx) { - auto arch = ctx->arch(); - int hblock = get_hblock(arch); - int m_roundup = hblock * ((M + hblock - 1) / hblock); - - auto packed_A = static_cast( - TargetMalloc(TargetType::kARM, m_roundup * K * sizeof(float))); - - prepackA(packed_A, A, alpha, lda, 0, M, 0, K, is_transA, ctx); - - sgemm_prepack(is_transB, - M, - N, - K, - packed_A, - B, - ldb, - beta, - C, - ldc, - bias, - is_bias, - is_relu, - ctx); - TargetFree(TargetType::kARM, packed_A); -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/sgemm.h b/lite/backends/arm/math/sgemm.h deleted file mode 100644 index 08f68fb3d4..0000000000 --- a/lite/backends/arm/math/sgemm.h +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include "lite/backends/arm/math/packed_sgemm.h" -#include "lite/core/context.h" -#include "lite/core/device_info.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void sgemm(bool is_transA, - bool is_transB, - int M, - int N, - int K, - float alpha, - const float* A, - int lda, - const float* B, - int ldb, - float beta, - float* C, - int ldc, - const float* bias, - bool is_bias, - bool is_relu, - ARMContext* ctx); - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/sgemv.cc b/lite/backends/arm/math/sgemv.cc deleted file mode 100644 index 506451932d..0000000000 --- a/lite/backends/arm/math/sgemv.cc +++ /dev/null @@ -1,1054 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
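Note: `sgemm` above is a thin wrapper — it rounds `M` up to the architecture's packing block (`hblock`), packs `A` once with `prepackA`, runs `sgemm_prepack`, and frees the buffer. The round-up is plain ceiling division:

```cpp
#include <cassert>

// Sketch: the m_roundup computation in sgemm — round M up to a multiple
// of the packing block so prepackA can fill whole row panels.
int round_up(int m, int hblock) { return hblock * ((m + hblock - 1) / hblock); }

int main() {
  assert(round_up(13, 8) == 16);  // 13 rows padded to two 8-row panels
  assert(round_up(16, 8) == 16);  // already aligned
}
```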
- -#include "lite/backends/arm/math/sgemv.h" -#include -#include "lite/utils/cp_logging.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void sgemv(const bool transA, - const int M, - const int N, - const float *A, - const float *x, - float *y); - -void sgemv_relu(const bool transA, - const int M, - const int N, - const float *A, - const float *x, - float *y); - -void sgemv_bias(const bool transA, - const int M, - const int N, - const float *A, - const float *x, - float *y, - const float *bias); - -void sgemv_bias_relu(const bool transA, - const int M, - const int N, - const float *A, - const float *x, - float *y, - const float *bias); - -bool sgemv(const float *A, - const float *x, - float *y, - bool transA, - int M, - int N, - bool is_bias, - const float *bias, - bool is_relu) { - if (transA) { - LOG(ERROR) << " sgemv, transA is not supported now"; - return false; - } - if (is_bias) { - //! with bias - if (is_relu) { - //! with relu - sgemv_bias_relu(transA, M, N, A, x, y, bias); - } else { - //! without relu - sgemv_bias(transA, M, N, A, x, y, bias); - } - } else { - //! without bias - if (is_relu) { - //! with relu - sgemv_relu(transA, M, N, A, x, y); - } else { - //! without relu - sgemv(transA, M, N, A, x, y); - } - } - return true; -} - -//! define compute kernel -#ifdef __aarch64__ -#define SGEMV_IN_8 \ - "prfm pldl1keep, [%[in]] \n" /* preload din */ \ - "prfm pldl1keep, [%[w0]] \n" /* preload w0 */ \ - "prfm pldl1keep, [%[w1]] \n" /* preload w1 */ \ - "prfm pldl1keep, [%[w2]] \n" /* preload w2 */ \ - "prfm pldl1keep, [%[w3]] \n" /* preload w3 */ \ - "prfm pldl1keep, [%[w4]] \n" /* preload w4 */ \ - "prfm pldl1keep, [%[w5]] \n" /* preload w5 */ \ - "prfm pldl1keep, [%[w6]] \n" /* preload w6 */ \ - "prfm pldl1keep, [%[w7]] \n" /* preload w7 */ \ - "movi v0.4s, #0 \n" /* set out0 to 0 */ \ - "movi v1.4s, #0 \n" /* set out1 to 0 */ \ - "movi v2.4s, #0 \n" /* set out2 to 0 */ \ - "movi v3.4s, #0 \n" /* set out3 to 0 */ \ - "movi v4.4s, #0 \n" /* set out4 to 0 */ \ - "movi v5.4s, #0 \n" /* set out5 to 0 */ \ - "movi v6.4s, #0 \n" /* set out6 to 0 */ \ - "movi v7.4s, #0 \n" /* set out7 to 0 */ - -#define SGEMV_IN_8_BIAS \ - "ldp q8, q9, [%[bias_ptr]]\n" /* load bias to q8, q9*/ \ - "prfm pldl1keep, [%[in]] \n" /* preload din */ \ - "prfm pldl1keep, [%[w0]] \n" /* preload w0 */ \ - "prfm pldl1keep, [%[w1]] \n" /* preload w1 */ \ - "prfm pldl1keep, [%[w2]] \n" /* preload w2 */ \ - "prfm pldl1keep, [%[w3]] \n" /* preload w3 */ \ - "prfm pldl1keep, [%[w4]] \n" /* preload w4 */ \ - "prfm pldl1keep, [%[w5]] \n" /* preload w5 */ \ - "prfm pldl1keep, [%[w6]] \n" /* preload w6 */ \ - "prfm pldl1keep, [%[w7]] \n" /* preload w7 */ \ - "movi v0.4s, #0 \n" /* set out0 to 0 */ \ - "movi v1.4s, #0 \n" /* set out1 to 0 */ \ - "movi v2.4s, #0 \n" /* set out2 to 0 */ \ - "movi v3.4s, #0 \n" /* set out3 to 0 */ \ - "movi v4.4s, #0 \n" /* set out4 to 0 */ \ - "movi v5.4s, #0 \n" /* set out5 to 0 */ \ - "movi v6.4s, #0 \n" /* set out6 to 0 */ \ - "movi v7.4s, #0 \n" /* set out7 to 0 */ \ - "ins v0.s[0], v8.s[0] \n" /* out0 = bias0 */ \ - "ins v1.s[0], v8.s[1] \n" /* out1 = bias1 */ \ - "ins v2.s[0], v8.s[2] \n" /* out2 = bias2 */ \ - "ins v3.s[0], v8.s[3] \n" /* out3 = bias3 */ \ - "ins v4.s[0], v9.s[0] \n" /* out4 = bias4 */ \ - "ins v5.s[0], v9.s[1] \n" /* out5 = bias5 */ \ - "ins v6.s[0], v9.s[2] \n" /* out6 = bias6 */ \ - "ins v7.s[0], v9.s[3] \n" /* out7 = bias7 */ - -#define SGEMV_IN_1 \ - "prfm pldl1keep, [%[in]] \n" /* preload din */ \ - "prfm pldl1keep, [%[w0]] \n" 
/* preload w0 */ \ - "movi v0.4s, #0 \n" /* set out0 to 0 */ \ - "movi v1.4s, #0 \n" /* set out0 to 0 */ - -#define SGEMV_IN_1_BIAS \ - "prfm pldl1keep, [%[in]] \n" /* preload din */ \ - "prfm pldl1keep, [%[w0]] \n" /* preload w0 */ \ - "movi v0.4s, #0 \n" /* set out0 to 0 */ \ - "movi v1.4s, #0 \n" /* set out0 to 0 */ \ - "fmov s0, %w[bias0] \n" /* set out0 = bias0 */ - -#define SGEMV_KERNEL_8 \ - /* check main loop */ \ - "cmp %w[cnt], #1 \n" /* check whether has main loop */ \ - "blt 2f \n" /* jump to tail */ /* main loop */ \ - "1: \n" /* main loop */ \ - "ldp q8, q9, [%[in]], #32 \n" /* load input 8 float */ \ - "ldp q10, q11, [%[w0]], #32 \n" /* load w0 8 float */ \ - "ldp q12, q13, [%[w1]], #32 \n" /* load w1 8 float */ \ - "ldp q14, q15, [%[w2]], #32 \n" /* load w2 8 float */ \ - "ldp q16, q17, [%[w3]], #32 \n" /* load w3 8 float */ \ - "ldp q18, q19, [%[w4]], #32 \n" /* load w4 8 float */ \ - "ldp q20, q21, [%[w5]], #32 \n" /* load w5 8 float */ \ - "ldp q22, q23, [%[w6]], #32 \n" /* load w6 8 float */ \ - "ldp q24, q25, [%[w7]], #32 \n" /* load w7 8 float */ \ - "fmla v0.4s, v8.4s, v10.4s \n" /* mul + add*/ \ - "fmla v1.4s, v8.4s, v12.4s \n" /* mul + add*/ \ - "fmla v2.4s, v8.4s, v14.4s \n" /* mul + add*/ \ - "fmla v3.4s, v8.4s, v16.4s \n" /* mul + add*/ \ - "fmla v4.4s, v8.4s, v18.4s \n" /* mul + add*/ \ - "fmla v5.4s, v8.4s, v20.4s \n" /* mul + add*/ \ - "fmla v6.4s, v8.4s, v22.4s \n" /* mul + add*/ \ - "fmla v7.4s, v8.4s, v24.4s \n" /* mul + add*/ \ - "subs %w[cnt], %w[cnt], #1 \n" /* sub main loop count */ \ - "fmla v0.4s, v9.4s, v11.4s \n" /* mul + add*/ \ - "fmla v1.4s, v9.4s, v13.4s \n" /* mul + add*/ \ - "fmla v2.4s, v9.4s, v15.4s \n" /* mul + add*/ \ - "fmla v3.4s, v9.4s, v17.4s \n" /* mul + add*/ \ - "fmla v4.4s, v9.4s, v19.4s \n" /* mul + add*/ \ - "fmla v5.4s, v9.4s, v21.4s \n" /* mul + add*/ \ - "fmla v6.4s, v9.4s, v23.4s \n" /* mul + add*/ \ - "fmla v7.4s, v9.4s, v25.4s \n" /* mul + add*/ \ - "bne 1b \n" /* jump to main loop */ /* pair add to final \ - result */ \ - "2: \n" /* reduce to scale */ \ - "faddp v16.4s, v0.4s, v0.4s\n" /* pair add to vector */ \ - "faddp s8, v16.2s \n" /* pair add to scale */ \ - "faddp v17.4s, v1.4s, v1.4s\n" /* pair add to vector */ \ - "faddp s9, v17.2s \n" /* pair add to scale */ \ - "faddp v18.4s, v2.4s, v2.4s\n" /* pair add to vector */ \ - "faddp s10, v18.2s \n" /* pair add to scale */ \ - "faddp v19.4s, v3.4s, v3.4s\n" /* pair add to vector */ \ - "faddp s11, v19.2s \n" /* pair add to scale */ \ - "faddp v20.4s, v4.4s, v4.4s\n" /* pair add to vector */ \ - "faddp s12, v20.2s \n" /* pair add to scale */ \ - "faddp v21.4s, v5.4s, v5.4s\n" /* pair add to vector */ \ - "faddp s13, v21.2s \n" /* pair add to scale */ \ - "faddp v22.4s, v6.4s, v6.4s\n" /* pair add to vector */ \ - "faddp s14, v22.2s \n" /* pair add to scale */ \ - "faddp v23.4s, v7.4s, v7.4s\n" /* pair add to vector */ \ - "faddp s15, v23.2s \n" /* pair add to scale */ \ - "cmp %w[tail], #1 \n" /* check whether has tail */ \ - "blt 4f \n" /* jump to end */ \ - "3: \n" /* tail loop */ \ - "ldr s16, [%[in]], #4 \n" /* load in, 1 float */ \ - "ldr s17, [%[w0]], #4 \n" /* load w0, 1 float */ \ - "ldr s18, [%[w1]], #4 \n" /* load w1, 1 float */ \ - "ldr s19, [%[w2]], #4 \n" /* load w2, 1 float */ \ - "ldr s20, [%[w3]], #4 \n" /* load w3, 1 float */ \ - "ldr s21, [%[w4]], #4 \n" /* load w4, 1 float */ \ - "ldr s22, [%[w5]], #4 \n" /* load w5, 1 float */ \ - "ldr s23, [%[w6]], #4 \n" /* load w6, 1 float */ \ - "ldr s24, [%[w7]], #4 \n" /* load w7, 1 float */ \ - "fmadd s8, s16, 
s17, s8 \n" /* mul + add */ \ - "fmadd s9, s16, s18, s9 \n" /* mul + add */ \ - "fmadd s10, s16, s19, s10 \n" /* mul + add */ \ - "fmadd s11, s16, s20, s11 \n" /* mul + add */ \ - "fmadd s12, s16, s21, s12 \n" /* mul + add */ \ - "fmadd s13, s16, s22, s13 \n" /* mul + add */ \ - "fmadd s14, s16, s23, s14 \n" /* mul + add */ \ - "fmadd s15, s16, s24, s15 \n" /* mul + add */ \ - "subs %w[tail], %w[tail], #1\n" /* sub tail loop count */ \ - "bne 3b \n" /* jump to tail loop */ - -#define SGEMV_KERNEL_1 \ - /* check main loop */ \ - "cmp %w[cnt], #1 \n" /* check whether has main loop */ \ - "blt 2f \n" /* jump to tail */ /* main loop */ \ - "1: \n" /* main loop */ \ - "ldp q8, q9, [%[in]], #32 \n" /* load input 8 float */ \ - "ldp q10, q11, [%[w0]], #32 \n" /* load w0 8 float */ \ - "fmla v0.4s, v8.4s, v10.4s \n" /* mul + add*/ \ - "subs %w[cnt], %w[cnt], #1 \n" /* sub main loop count */ \ - "fmla v1.4s, v9.4s, v11.4s \n" /* mul + add*/ \ - "bne 1b \n" /* jump to main loop */ /* pair add to final \ - result */ \ - "2: \n" /* reduce to scale */ \ - "fadd v9.4s, v0.4s, v1.4s \n" /* add 2 vector */ \ - "faddp v10.4s, v9.4s, v9.4s\n" /* pair add to vector */ \ - "faddp s8, v10.2s \n" /* pair add to scale */ /* check tails */ \ - "cmp %w[tail], #1 \n" /* check whether has tail */ \ - "blt 4f \n" /* jump to end */ \ - "3: \n" /* tail loop */ \ - "ldr s16, [%[in]], #4 \n" /* load in, 1 float */ \ - "ldr s17, [%[w0]], #4 \n" /* load w0, 1 float */ \ - "fmadd s8, s16, s17, s8 \n" /* mul + add */ \ - "subs %w[tail], %w[tail], #1\n" /* sub tail loop count */ \ - "bne 3b \n" /* jump to tail loop */ - -#define SGEMV_OUT_8 \ - /* end */ \ - "4: \n" /* end */ \ - "stp s8, s9, [%[out]] \n" /* save result */ \ - "stp s10, s11, [%[out], #8] \n" /* save result */ \ - "stp s12, s13, [%[out], #16]\n" /* save result */ \ - "stp s14, s15, [%[out], #24]\n" /* save result */ - -#define SGEMV_OUT_8_RELU \ - /* end */ \ - "4: \n" /* end */ \ - "movi d0, #0 \n" /* zero data for relu */ \ - "fmax s8, s8, s0 \n" /* relu */ \ - "fmax s9, s9, s0 \n" /* relu */ \ - "fmax s10, s10, s0 \n" /* relu */ \ - "fmax s11, s11, s0 \n" /* relu */ \ - "fmax s12, s12, s0 \n" /* relu */ \ - "fmax s13, s13, s0 \n" /* relu */ \ - "fmax s14, s14, s0 \n" /* relu */ \ - "fmax s15, s15, s0 \n" /* relu */ \ - "stp s8, s9, [%[out]] \n" /* save result */ \ - "stp s10, s11, [%[out], #8] \n" /* save result */ \ - "stp s12, s13, [%[out], #16]\n" /* save result */ \ - "stp s14, s15, [%[out], #24]\n" /* save result */ - -#define SGEMV_OUT_1 \ - /* end */ \ - "4: \n" /* end */ \ - "str s8, [%[out]] \n" /* save result */ - -#define SGEMV_OUT_1_RELU \ - /* end */ \ - "4: \n" /* end */ \ - "movi d0, #0 \n" /* zero data for relu */ \ - "fmax s8, s8, s0 \n" /* relu */ \ - "str s8, [%[out]] \n" /* save result */ - -#else //__aarch64__ - -#define SGEMV_IN_4 \ - "pld [%[in]] @ preload cache line, input\n" \ - "pld [%[w0]] @ preload cache line, weights r0\n" \ - "pld [%[w1]] @ preload cache line, weights r1\n" \ - "pld [%[w2]] @ preload cache line, weights r2\n" \ - "pld [%[w3]] @ preload cache line, weights r3\n" \ - "vmov.u32 q0, #0 @ set q0 to 0\n" \ - "vmov.u32 q1, #0 @ set q1 to 0\n" \ - "vmov.u32 q2, #0 @ set q2 to 0\n" \ - "vmov.u32 q3, #0 @ set q3 to 0\n" \ - "pld [%[w0], #64] @ preload cache line, weights r0\n" \ - "pld [%[w1], #64] @ preload cache line, weights r1\n" \ - "pld [%[w2], #64] @ preload cache line, weights r2\n" \ - "pld [%[w3], #64] @ preload cache line, weights r3\n" - -#define SGEMV_IN_4_BIAS \ - "pld [%[in]] @ preload cache line, input\n" 
\ - "pld [%[w0]] @ preload cache line, weights r0\n" \ - "pld [%[w1]] @ preload cache line, weights r1\n" \ - "pld [%[w2]] @ preload cache line, weights r2\n" \ - "pld [%[w3]] @ preload cache line, weights r3\n" \ - "vmov.u32 q0, #0 @ set q0 to 0\n" \ - "vmov.u32 q1, #0 @ set q1 to 0\n" \ - "vmov.u32 q2, #0 @ set q2 to 0\n" \ - "vmov.u32 q3, #0 @ set q3 to 0\n" \ - "vmov s0, %[bias0] @ set q0 to bias0\n" \ - "vmov s4, %[bias1] @ set q1 to bias1\n" \ - "vmov s8, %[bias2] @ set q2 to bias2\n" \ - "vmov s12,%[bias3] @ set q3 to bias3\n" \ - "pld [%[w0], #64] @ preload cache line, weights r0\n" \ - "pld [%[w1], #64] @ preload cache line, weights r1\n" \ - "pld [%[w2], #64] @ preload cache line, weights r2\n" \ - "pld [%[w3], #64] @ preload cache line, weights r3\n" - -#define SGEMV_IN_1 \ - "pld [%[in]] @ preload cache line, input\n" \ - "pld [%[w0]] @ preload cache line, weights r0\n" \ - "vmov.u32 q0, #0 @ set q0 to 0\n" - -#define SGEMV_IN_1_BIAS \ - "pld [%[in]] @ preload cache line, input\n" \ - "pld [%[w0]] @ preload cache line, weights r0\n" \ - "vmov.u32 q0, #0 @ set q0 to 0\n" \ - "vmov s0, %[bias0] @ set q0 to 0\n" - -#define SGEMV_KERNEL_4 \ - /* check main loop */ \ - "cmp %[cnt], #1 @ check whether has main loop\n" \ - "blt 2f @ jump to tail\n" \ - "1: @ main loop\n" \ - "vld1.32 {d8-d11}, [%[in]]! @ load input, q4, q5\n" \ - "vld1.32 {d12-d15}, [%[w0]]! @ load weights r0, q6,q7\n" \ - "vld1.32 {d16-d19}, [%[w1]]! @ load weights r1, q8,q9\n" \ - "vld1.32 {d20-d23}, [%[w2]]! @ load weights r2, q10,q11\n" \ - "vld1.32 {d24-d27}, [%[w3]]! @ load weights r3, q12,q13\n" \ - "vmla.f32 q0, q4, q6 @ mul add\n" \ - "vmla.f32 q1, q4, q8 @ mul add\n" \ - "vmla.f32 q2, q4, q10 @ mul add\n" \ - "vmla.f32 q3, q4, q12 @ mul add\n" \ - "subs %[cnt], #1 @ sub loop count \n" \ - "vmla.f32 q0, q5, q7 @ mul add\n" \ - "vmla.f32 q1, q5, q9 @ mul add\n" \ - "vmla.f32 q2, q5, q11 @ mul add\n" \ - "vmla.f32 q3, q5, q13 @ mul add\n" \ - "bne 1b @ jump to main loop\n" /* pair add to final \ - result */ \ - "2: @ pair add \n" \ - "vpadd.f32 d8, d0, d1 @ pair add, first step\n" \ - "vpadd.f32 d9, d2, d3 @ pair add, first step\n" \ - "vpadd.f32 d10, d4, d5 @ pair add, first step\n" \ - "vpadd.f32 d11, d6, d7 @ pair add, first step\n" \ - "vpadd.f32 d0, d8, d9 @ pair add, second step\n" \ - "vpadd.f32 d1, d10, d11 @ pair add, second step\n" /* check tails */ \ - "cmp %[tail], #1 @ check whether has tail\n" \ - "blt 4f @ jump to end\n" \ - "3: @ tail loop\n" \ - "vldm %[in]!, {s16} @ load 1 float\n" \ - "vldm %[w0]!, {s17} @ load 1 float\n" \ - "vldm %[w1]!, {s18} @ load 1 float\n" \ - "vldm %[w2]!, {s19} @ load 1 float\n" \ - "vldm %[w3]!, {s20} @ load 1 float\n" \ - "vmla.f32 s0, s16, s17 @ mul + add\n" \ - "vmla.f32 s1, s16, s18 @ mul + add\n" \ - "vmla.f32 s2, s16, s19 @ mul + add\n" \ - "vmla.f32 s3, s16, s20 @ mul + add\n" \ - "subs %[tail], #1 @ sub loop count \n" \ - "bne 3b @ jump to tail loop\n" - -#define SGEMV_KERNEL_1 \ - "cmp %[cnt], #1 @ check whether has main loop\n" \ - "blt 2f @ jump to tail\n" \ - "1: @ main loop\n" \ - "vld1.32 {d24-d27}, [%[in]]! @ load input, q12,q13\n" \ - "vld1.32 {d28-d31}, [%[w0]]! 
@ load weights r0, q14, q15\n" \ - "vmla.f32 q0, q12, q14 @ mul add\n" \ - "vmla.f32 q0, q13, q15 @ mul add\n" \ - "subs %[cnt] , #1 @ sub loop count \n" \ - "bne 1b @ jump to main loop\n" /* pair add to \ - final result \ - */ \ - "2: @ end processing\n" \ - "vpadd.f32 d2, d0, d1 @ pair add, first step\n" \ - "vpadd.f32 d0, d2, d2 @ pair add, final step\n" /* check tails \ - */ \ - "cmp %[tail], #1 @ check whether has mid cols\n" \ - "blt 4f @ jump to end\n" \ - "3: @ tail loop\n" \ - "vldm %[in]!, {s16} @ load 1 float\n" \ - "vldm %[w0]!, {s17} @ load 1 float\n" \ - "vmla.f32 s0, s16, s17 @ mul + add\n" \ - "subs %[tail], #1 @ sub loop count \n" \ - "bne 3b @ jump to tail loop\n" - -#define SGEMV_OUT_4 \ - /* end */ \ - "4: @ end\n" \ - "vst1.32 {d0-d1}, [%[out]] @ save result\n" - -#define SGEMV_OUT_4_RELU \ - /* end */ \ - "4: @ end\n" \ - "vmov.i32 q1, #0 @ zero for relu\n" \ - "vmax.f32 q0, q0, q1 @ relu\n" \ - "vst1.32 {d0-d1}, [%[out]] @ save result\n" - -#define SGEMV_OUT_1 \ - /* end */ \ - "4: @ end\n" \ - "vst1.32 {d0[0]}, [%[out]] @ save result\n" - -#define SGEMV_OUT_1_RELU \ - /* end */ \ - "4: @ end\n" \ - "vmov.i32 d1, #0 @ zero for relu\n" \ - "vmax.f32 d0, d0, d1 @ relu\n" \ - "vst1.32 {d0[0]}, [%[out]] @ save result\n" -#endif - -void sgemv(const bool transA, - const int M, - const int N, - const float *A, - const float *x, - float *y) { - float *data_out = y; - const float *data_in = x; - const float *weights_ptr = A; - - int cnt = N >> 3; - int tail = N & 7; - -#ifdef __aarch64__ - int out_cnt = M >> 3; - -#pragma omp parallel for - for (int j = 0; j < out_cnt; j++) { - int out_idx = j * 8; - float *ptr_out = data_out + out_idx; - const float *ptr_in = data_in; - const float *ptr_w0 = weights_ptr + (N * out_idx); - const float *ptr_w1 = ptr_w0 + N; - const float *ptr_w2 = ptr_w1 + N; - const float *ptr_w3 = ptr_w2 + N; - const float *ptr_w4 = ptr_w3 + N; - const float *ptr_w5 = ptr_w4 + N; - const float *ptr_w6 = ptr_w5 + N; - const float *ptr_w7 = ptr_w6 + N; - int cnt_loop = cnt; - int tail_loop = tail; - asm volatile(SGEMV_IN_8 SGEMV_KERNEL_8 SGEMV_OUT_8 - : [in] "+r"(ptr_in), - [w0] "+r"(ptr_w0), - [w1] "+r"(ptr_w1), - [w2] "+r"(ptr_w2), - [w3] "+r"(ptr_w3), - [w4] "+r"(ptr_w4), - [w5] "+r"(ptr_w5), - [w6] "+r"(ptr_w6), - [w7] "+r"(ptr_w7), - [cnt] "+r"(cnt_loop), - [tail] "+r"(tail_loop) - : [out] "r"(ptr_out) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "cc", - "memory"); - } -//! 
deal with remains -#pragma omp parallel for - for (int j = out_cnt * 8; j < M; ++j) { - float *ptr_out = data_out + j; - const float *ptr_in = data_in; - const float *ptr_w0 = weights_ptr + (N * j); - int cnt_loop = cnt; - int tail_loop = tail; - float tmp[4]; - float tmp1[4]; - float tmp2[4]; - float tmp3[4]; - float tmp4[4]; - asm volatile( - SGEMV_IN_1 SGEMV_KERNEL_1 SGEMV_OUT_1 - : [in] "+r"(ptr_in), - [w0] "+r"(ptr_w0), - [cnt] "+r"(cnt_loop), - [tail] "+r"(tail_loop) - : [out] "r"(ptr_out), - [tmp] "r"(tmp), - [tmp1] "r"(tmp1), - [tmp2] "r"(tmp2), - [tmp3] "r"(tmp3), - [tmp4] "r"(tmp4) - : "v0", "v1", "v8", "v9", "v10", "v11", "v16", "v17", "cc", "memory"); - } -#else //__aarch64__ - int out_cnt = M >> 2; -#pragma omp parallel for - for (int j = 0; j < out_cnt; j++) { - int out_idx = j * 4; - float *ptr_out = data_out + out_idx; - const float *ptr_in = data_in; - const float *ptr_w0 = weights_ptr + (N * out_idx); - const float *ptr_w1 = ptr_w0 + N; - const float *ptr_w2 = ptr_w1 + N; - const float *ptr_w3 = ptr_w2 + N; - - int cnt_loop = cnt; - int tail_loop = tail; - asm volatile(SGEMV_IN_4 SGEMV_KERNEL_4 SGEMV_OUT_4 - : [in] "+r"(ptr_in), - [w0] "+r"(ptr_w0), - [w1] "+r"(ptr_w1), - [w2] "+r"(ptr_w2), - [w3] "+r"(ptr_w3), - [cnt] "+r"(cnt_loop), - [tail] "+r"(tail_loop) - : [out] "r"(ptr_out) - : "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "cc", - "memory"); - } -//! deal with remains -#pragma omp parallel for - for (int j = out_cnt * 4; j < M; ++j) { - float *ptr_out = data_out + j; - const float *ptr_in = data_in; - const float *ptr_w0 = weights_ptr + (N * j); - int cnt_loop = cnt; - int tail_loop = tail; - asm volatile(SGEMV_IN_1 SGEMV_KERNEL_1 SGEMV_OUT_1 - : [in] "+r"(ptr_in), - [w0] "+r"(ptr_w0), - [cnt] "+r"(cnt_loop), - [tail] "+r"(tail_loop) - : [out] "r"(ptr_out) - : "q0", "q1", "q12", "q13", "q14", "q15", "cc", "memory"); - } -#endif //__aarch64__ -} - -void sgemv_relu(const bool transA, - const int M, - const int N, - const float *A, - const float *x, - float *y) { - float *data_out = y; - const float *data_in = x; - const float *weights_ptr = A; - - int cnt = N >> 3; - int tail = N & 7; - -#ifdef __aarch64__ - int out_cnt = M >> 3; -#pragma omp parallel for - for (int j = 0; j < out_cnt; j++) { - int out_idx = j * 8; - float *ptr_out = data_out + out_idx; - const float *ptr_in = data_in; - const float *ptr_w0 = weights_ptr + (N * out_idx); - const float *ptr_w1 = ptr_w0 + N; - const float *ptr_w2 = ptr_w1 + N; - const float *ptr_w3 = ptr_w2 + N; - const float *ptr_w4 = ptr_w3 + N; - const float *ptr_w5 = ptr_w4 + N; - const float *ptr_w6 = ptr_w5 + N; - const float *ptr_w7 = ptr_w6 + N; - int cnt_loop = cnt; - int tail_loop = tail; - asm volatile(SGEMV_IN_8 SGEMV_KERNEL_8 SGEMV_OUT_8_RELU - : [in] "+r"(ptr_in), - [w0] "+r"(ptr_w0), - [w1] "+r"(ptr_w1), - [w2] "+r"(ptr_w2), - [w3] "+r"(ptr_w3), - [w4] "+r"(ptr_w4), - [w5] "+r"(ptr_w5), - [w6] "+r"(ptr_w6), - [w7] "+r"(ptr_w7), - [cnt] "+r"(cnt_loop), - [tail] "+r"(tail_loop) - : [out] "r"(ptr_out) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "cc", - "memory"); - } -//! 
deal with remains -#pragma omp parallel for - for (int j = out_cnt * 8; j < M; ++j) { - float *ptr_out = data_out + j; - const float *ptr_in = data_in; - const float *ptr_w0 = weights_ptr + (N * j); - int cnt_loop = cnt; - int tail_loop = tail; - asm volatile( - SGEMV_IN_1 SGEMV_KERNEL_1 SGEMV_OUT_1_RELU - : [in] "+r"(ptr_in), - [w0] "+r"(ptr_w0), - [cnt] "+r"(cnt_loop), - [tail] "+r"(tail_loop) - : [out] "r"(ptr_out) - : "v0", "v1", "v8", "v9", "v10", "v11", "v16", "v17", "cc", "memory"); - } -#else //__aarch64__ - int out_cnt = M >> 2; -#pragma omp parallel for - for (int j = 0; j < out_cnt; j++) { - int out_idx = j * 4; - float *ptr_out = data_out + out_idx; - const float *ptr_in = data_in; - const float *ptr_w0 = weights_ptr + (N * out_idx); - const float *ptr_w1 = ptr_w0 + N; - const float *ptr_w2 = ptr_w1 + N; - const float *ptr_w3 = ptr_w2 + N; - - int cnt_loop = cnt; - int tail_loop = tail; - asm volatile(SGEMV_IN_4 SGEMV_KERNEL_4 SGEMV_OUT_4_RELU - : [in] "+r"(ptr_in), - [w0] "+r"(ptr_w0), - [w1] "+r"(ptr_w1), - [w2] "+r"(ptr_w2), - [w3] "+r"(ptr_w3), - [cnt] "+r"(cnt_loop), - [tail] "+r"(tail_loop) - : [out] "r"(ptr_out) - : "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "cc", - "memory"); - } -//! deal with remains -#pragma omp parallel for - for (int j = out_cnt * 4; j < M; ++j) { - float *ptr_out = data_out + j; - const float *ptr_in = data_in; - const float *ptr_w0 = weights_ptr + (N * j); - int cnt_loop = cnt; - int tail_loop = tail; - asm volatile(SGEMV_IN_1 SGEMV_KERNEL_1 SGEMV_OUT_1_RELU - : [in] "+r"(ptr_in), - [w0] "+r"(ptr_w0), - [cnt] "+r"(cnt_loop), - [tail] "+r"(tail_loop) - : [out] "r"(ptr_out) - : "q0", "q1", "q12", "q13", "q14", "q15", "cc", "memory"); - } -#endif //__aarch64__ -} - -void sgemv_bias(const bool transA, - const int M, - const int N, - const float *A, - const float *x, - float *y, - const float *bias) { - float *data_out = y; - const float *data_in = x; - const float *weights_ptr = A; - - int cnt = N >> 3; - int tail = N & 7; - -#ifdef __aarch64__ - int out_cnt = M >> 3; -#pragma omp parallel for - for (int j = 0; j < out_cnt; j++) { - int out_idx = j * 8; - float *ptr_out = data_out + out_idx; - const float *ptr_in = data_in; - const float *ptr_w0 = weights_ptr + (N * out_idx); - const float *ptr_w1 = ptr_w0 + N; - const float *ptr_w2 = ptr_w1 + N; - const float *ptr_w3 = ptr_w2 + N; - const float *ptr_w4 = ptr_w3 + N; - const float *ptr_w5 = ptr_w4 + N; - const float *ptr_w6 = ptr_w5 + N; - const float *ptr_w7 = ptr_w6 + N; - const float *bias_ptr = bias + out_idx; - int cnt_loop = cnt; - int tail_loop = tail; - asm volatile(SGEMV_IN_8_BIAS SGEMV_KERNEL_8 SGEMV_OUT_8 - : [in] "+r"(ptr_in), - [w0] "+r"(ptr_w0), - [w1] "+r"(ptr_w1), - [w2] "+r"(ptr_w2), - [w3] "+r"(ptr_w3), - [w4] "+r"(ptr_w4), - [w5] "+r"(ptr_w5), - [w6] "+r"(ptr_w6), - [w7] "+r"(ptr_w7), - [cnt] "+r"(cnt_loop), - [tail] "+r"(tail_loop) - : [out] "r"(ptr_out), [bias_ptr] "r"(bias_ptr) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25", - "cc", - "memory"); - } -//! 
deal with remains -#pragma omp parallel for - for (int j = out_cnt * 8; j < M; ++j) { - float *ptr_out = data_out + j; - const float *ptr_in = data_in; - const float *ptr_w0 = weights_ptr + (N * j); - int cnt_loop = cnt; - int tail_loop = tail; - float bias0 = bias[j]; - asm volatile( - SGEMV_IN_1_BIAS SGEMV_KERNEL_1 SGEMV_OUT_1 - : [in] "+r"(ptr_in), - [w0] "+r"(ptr_w0), - [cnt] "+r"(cnt_loop), - [tail] "+r"(tail_loop) - : [out] "r"(ptr_out), [bias0] "r"(bias0) - : "v0", "v1", "v8", "v9", "v10", "v11", "v16", "v17", "cc", "memory"); - } -#else //__aarch64__ - int out_cnt = M >> 2; -#pragma omp parallel for - for (int j = 0; j < out_cnt; j++) { - int out_idx = j * 4; - float *ptr_out = data_out + out_idx; - const float *ptr_in = data_in; - const float *ptr_w0 = weights_ptr + (N * out_idx); - const float *ptr_w1 = ptr_w0 + N; - const float *ptr_w2 = ptr_w1 + N; - const float *ptr_w3 = ptr_w2 + N; - float bias0 = bias[out_idx]; - float bias1 = bias[out_idx + 1]; - float bias2 = bias[out_idx + 2]; - float bias3 = bias[out_idx + 3]; - - int cnt_loop = cnt; - int tail_loop = tail; - asm volatile(SGEMV_IN_4_BIAS SGEMV_KERNEL_4 SGEMV_OUT_4 - : [in] "+r"(ptr_in), - [w0] "+r"(ptr_w0), - [w1] "+r"(ptr_w1), - [w2] "+r"(ptr_w2), - [w3] "+r"(ptr_w3), - [cnt] "+r"(cnt_loop), - [tail] "+r"(tail_loop) - : [out] "r"(ptr_out), - [bias0] "r"(bias0), - [bias1] "r"(bias1), - [bias2] "r"(bias2), - [bias3] "r"(bias3) - : "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "cc", - "memory"); - } -//! deal with remains -#pragma omp parallel for - for (int j = out_cnt * 4; j < M; ++j) { - float *ptr_out = data_out + j; - const float *ptr_in = data_in; - const float *ptr_w0 = weights_ptr + (N * j); - int cnt_loop = cnt; - int tail_loop = tail; - float bias0 = bias[j]; - asm volatile(SGEMV_IN_1_BIAS SGEMV_KERNEL_1 SGEMV_OUT_1 - : [in] "+r"(ptr_in), - [w0] "+r"(ptr_w0), - [cnt] "+r"(cnt_loop), - [tail] "+r"(tail_loop) - : [out] "r"(ptr_out), [bias0] "r"(bias0) - : "q0", "q1", "q12", "q13", "q14", "q15", "cc", "memory"); - } -#endif //__aarch64__ -} - -void sgemv_bias_relu(const bool transA, - const int M, - const int N, - const float *A, - const float *x, - float *y, - const float *bias) { - float *data_out = y; - const float *data_in = x; - const float *weights_ptr = A; - int cnt = N >> 3; - int tail = N & 7; -#ifdef __aarch64__ - int out_cnt = M >> 3; -#pragma omp parallel for - for (int j = 0; j < out_cnt; j++) { - int out_idx = j * 8; - float *ptr_out = data_out + out_idx; - const float *ptr_in = data_in; - const float *ptr_w0 = weights_ptr + (N * out_idx); - const float *ptr_w1 = ptr_w0 + N; - const float *ptr_w2 = ptr_w1 + N; - const float *ptr_w3 = ptr_w2 + N; - const float *ptr_w4 = ptr_w3 + N; - const float *ptr_w5 = ptr_w4 + N; - const float *ptr_w6 = ptr_w5 + N; - const float *ptr_w7 = ptr_w6 + N; - const float *bias_ptr = bias + out_idx; - int cnt_loop = cnt; - int tail_loop = tail; - asm volatile(SGEMV_IN_8_BIAS SGEMV_KERNEL_8 SGEMV_OUT_8_RELU - : [in] "+r"(ptr_in), - [w0] "+r"(ptr_w0), - [w1] "+r"(ptr_w1), - [w2] "+r"(ptr_w2), - [w3] "+r"(ptr_w3), - [w4] "+r"(ptr_w4), - [w5] "+r"(ptr_w5), - [w6] "+r"(ptr_w6), - [w7] "+r"(ptr_w7), - [cnt] "+r"(cnt_loop), - [tail] "+r"(tail_loop) - : [out] "r"(ptr_out), [bias_ptr] "r"(bias_ptr) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - 
"v25", - "cc", - "memory"); - } -//! deal with remains -#pragma omp parallel for - for (int j = out_cnt * 8; j < M; ++j) { - float *ptr_out = data_out + j; - const float *ptr_in = data_in; - const float *ptr_w0 = weights_ptr + (N * j); - int cnt_loop = cnt; - int tail_loop = tail; - float bias0 = bias[j]; - asm volatile( - SGEMV_IN_1_BIAS SGEMV_KERNEL_1 SGEMV_OUT_1_RELU - : [in] "+r"(ptr_in), - [w0] "+r"(ptr_w0), - [cnt] "+r"(cnt_loop), - [tail] "+r"(tail_loop) - : [out] "r"(ptr_out), [bias0] "r"(bias0) - : "v0", "v1", "v8", "v9", "v10", "v11", "v16", "v17", "cc", "memory"); - } -#else //__aarch64__ - int out_cnt = M >> 2; -#pragma omp parallel for - for (int j = 0; j < out_cnt; j++) { - int out_idx = j * 4; - float *ptr_out = data_out + out_idx; - const float *ptr_in = data_in; - const float *ptr_w0 = weights_ptr + (N * out_idx); - const float *ptr_w1 = ptr_w0 + N; - const float *ptr_w2 = ptr_w1 + N; - const float *ptr_w3 = ptr_w2 + N; - float bias0 = bias[out_idx]; - float bias1 = bias[out_idx + 1]; - float bias2 = bias[out_idx + 2]; - float bias3 = bias[out_idx + 3]; - - int cnt_loop = cnt; - int tail_loop = tail; - asm volatile(SGEMV_IN_4_BIAS SGEMV_KERNEL_4 SGEMV_OUT_4_RELU - : [in] "+r"(ptr_in), - [w0] "+r"(ptr_w0), - [w1] "+r"(ptr_w1), - [w2] "+r"(ptr_w2), - [w3] "+r"(ptr_w3), - [cnt] "+r"(cnt_loop), - [tail] "+r"(tail_loop) - : [out] "r"(ptr_out), - [bias0] "r"(bias0), - [bias1] "r"(bias1), - [bias2] "r"(bias2), - [bias3] "r"(bias3) - : "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "cc", - "memory"); - } -//! deal with remains -#pragma omp parallel for - for (int j = out_cnt * 4; j < M; ++j) { - float *ptr_out = data_out + j; - const float *ptr_in = data_in; - const float *ptr_w0 = weights_ptr + (N * j); - int cnt_loop = cnt; - int tail_loop = tail; - float bias0 = bias[j]; - asm volatile(SGEMV_IN_1_BIAS SGEMV_KERNEL_1 SGEMV_OUT_1_RELU - : [in] "+r"(ptr_in), - [w0] "+r"(ptr_w0), - [cnt] "+r"(cnt_loop), - [tail] "+r"(tail_loop) - : [out] "r"(ptr_out), [bias0] "r"(bias0) - : "q0", "q1", "q12", "q13", "q14", "q15", "cc", "memory"); - } -#endif //__aarch64__ -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/sgemv.h b/lite/backends/arm/math/sgemv.h deleted file mode 100644 index 4d74006f93..0000000000 --- a/lite/backends/arm/math/sgemv.h +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-
-#pragma once
-
-#include
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-// TODO(xxx): fixme now only support transA = false
-bool sgemv(const float* A,
-           const float* x,
-           float* y,
-           bool transA,
-           int M,
-           int N,
-           bool is_bias = false,
-           const float* bias = nullptr,
-           bool is_relu = false);
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/shuffle_channel.cc b/lite/backends/arm/math/shuffle_channel.cc
deleted file mode 100644
index 7c4564aa00..0000000000
--- a/lite/backends/arm/math/shuffle_channel.cc
+++ /dev/null
@@ -1,81 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/backends/arm/math/shuffle_channel.h"
-#include <string.h>
-#include "lite/backends/arm/math/funcs.h"
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-template <typename Dtype>
-void shuffle_kernel(
-    Dtype* output, const Dtype* input, int group_row, int group_col, int len) {
-  for (int i = 0; i < group_row; ++i) {
-    for (int j = 0; j < group_col; ++j) {
-      const Dtype* p_i = input + (i * group_col + j) * len;
-      Dtype* p_o = output + (j * group_row + i) * len;
-      memcpy(p_o, p_i, len * sizeof(Dtype));
-    }
-  }
-}
-
-template <>
-void shuffle_channel<float>(const float* inputs,
-                            float* outputs,
-                            int group,
-                            int num,
-                            int channel,
-                            int height,
-                            int width) {
-  int fea_size = channel * height * width;
-  int spatial_size = height * width;
-  int group_row = group;
-  int group_col = channel / group;
-  for (int i = 0; i < num; ++i) {
-    shuffle_kernel(outputs + i * fea_size,
-                   inputs + i * fea_size,
-                   group_row,
-                   group_col,
-                   spatial_size);
-  }
-}
-
-template <>
-void shuffle_channel<char>(const char* inputs,
-                           char* outputs,
-                           int group,
-                           int num,
-                           int channel,
-                           int height,
-                           int width) {
-  int fea_size = channel * height * width;
-  int spatial_size = height * width;
-  int group_row = group;
-  int group_col = channel / group;
-  for (int i = 0; i < num; ++i) {
-    shuffle_kernel(outputs + i * fea_size,
-                   inputs + i * fea_size,
-                   group_row,
-                   group_col,
-                   spatial_size);
-  }
-}
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/shuffle_channel.h b/lite/backends/arm/math/shuffle_channel.h
deleted file mode 100644
index d0c8b7b81e..0000000000
--- a/lite/backends/arm/math/shuffle_channel.h
+++ /dev/null
@@ -1,34 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-template <typename T>
-void shuffle_channel(const T* inputs,
-                     T* outputs,
-                     int group,
-                     int num,
-                     int channel,
-                     int height,
-                     int width);
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/slice.cc b/lite/backends/arm/math/slice.cc
deleted file mode 100644
index 8b9a769050..0000000000
--- a/lite/backends/arm/math/slice.cc
+++ /dev/null
@@ -1,93 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/backends/arm/math/slice.h"
-#include
-#include
-#include
-#include "lite/utils/cp_logging.h"
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-template <typename Dtype>
-void slice(const Dtype* input,
-           std::vector<int64_t> in_dims,
-           std::vector<int> axes,
-           std::vector<int> starts,
-           std::vector<int> ends,
-           Dtype* out,
-           Context<TARGET(kARM)>* ctx) {
-  auto out_dims = in_dims;
-  std::vector<int> real_starts(in_dims.size(), 0);
-  std::vector<int> real_ends(in_dims.size(), 0);
-  std::vector<int> real_step(in_dims.size(), 0);
-  for (int i = 0; i < in_dims.size(); i++) {
-    real_ends[i] = in_dims[i];
-  }
-  for (int i = 0; i < axes.size(); i++) {
-    int dim_value = in_dims[axes[i]];
-    if (dim_value > 0) {
-      int start = starts[i] < 0 ? (starts[i] + dim_value) : starts[i];
-      int end = ends[i] < 0 ? (ends[i] + dim_value) : ends[i];
-      start = std::max(start, 0);
-      end = std::max(end, 0);
-      end = std::min(end, dim_value);
-      out_dims[axes[i]] = end - start;
-      real_starts[axes[i]] = start;
-      real_ends[axes[i]] = end;
-    }
-  }
-  const int LEN = in_dims.size();
-  int dst_step[LEN];
-  for (int i = 0; i < in_dims.size(); ++i) {
-    dst_step[i] = 1;
-  }
-  int src_step[LEN];
-  for (int i = 0; i < in_dims.size(); ++i) {
-    src_step[i] = 1;
-  }
-  int out_num = out_dims[in_dims.size() - 1];
-  for (int i = in_dims.size() - 2; i >= 0; i--) {
-    dst_step[i] = out_dims[i + 1] * dst_step[i + 1];
-    src_step[i] = in_dims[i + 1] * src_step[i + 1];
-    out_num *= out_dims[i];
-  }
-
-  for (int dst_id = 0; dst_id < out_num; dst_id++) {
-    int src_id = 0;
-    int index_id = dst_id;
-    for (int j = 0; j < out_dims.size(); j++) {
-      int cur_id = index_id / dst_step[j];
-      index_id = index_id % dst_step[j];
-      src_id += (cur_id + real_starts[j]) * src_step[j];
-    }
-    out[dst_id] = input[src_id];
-  }
-}
-
-template void slice(const int* input,
-                    std::vector<int64_t> dims,
-                    std::vector<int> axes,
-                    std::vector<int> starts,
-                    std::vector<int> ends,
-                    int* out,
-                    Context<TARGET(kARM)>* ctx);
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/slice.h b/lite/backends/arm/math/slice.h
deleted file mode 100644
index 86172d28a7..0000000000
--- a/lite/backends/arm/math/slice.h
+++ /dev/null
@@ -1,38 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include
-#include <vector>
-#include "lite/core/context.h"
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-template <typename Dtype>
-void slice(const Dtype* input,
-           std::vector<int64_t> dims,
-           std::vector<int> axes,
-           std::vector<int> starts,
-           std::vector<int> ends,
-           Dtype* out,
-           Context<TARGET(kARM)>* ctx);
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/softmax.cc b/lite/backends/arm/math/softmax.cc
deleted file mode 100644
index 65d41b0491..0000000000
--- a/lite/backends/arm/math/softmax.cc
+++ /dev/null
@@ -1,616 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/backends/arm/math/softmax.h"
-#include
-#include "lite/backends/arm/math/funcs.h"
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-template <>
-void softmax_basic<float>(const float* din,
-                          float* dout,
-                          const int axis_size,
-                          const int inner_num,
-                          const int outer_num) {
-  int compute_size = inner_num * outer_num;
-#pragma omp parallel for
-  for (int i = 0; i < compute_size; ++i) {
-    int idx_inner = i % inner_num;
-    int idx_outer = (i / inner_num) * axis_size;
-    int real_index = idx_outer * inner_num + idx_inner;
-
-    float max_data = din[real_index];
-    // get max
-    for (int j = 1; j < axis_size; ++j) {
-      real_index += inner_num;
-      max_data = din[real_index] > max_data ?
din[real_index] : max_data; - } - - real_index = idx_outer * inner_num + idx_inner; - // sub, exp and sum - dout[real_index] = expf(din[real_index] - max_data); - float sum_data = dout[real_index]; - for (int j = 1; j < axis_size; ++j) { - real_index += inner_num; - dout[real_index] = expf(din[real_index] - max_data); - sum_data += dout[real_index]; - } - - float sum_inv = 1.f / sum_data; - real_index = idx_outer * inner_num + idx_inner; - // get softmax result - for (int j = 0; j < axis_size; ++j) { - dout[real_index] *= sum_inv; - real_index += inner_num; - } - } -} - -template <> -void softmax_inner8_axis4(const float* din, - float* dout, - const int axis_size, - const int inner_num, - const int outer_num) { - int compute_size = inner_num * outer_num; - int cmp_cnt = compute_size >> 3; - int remain = compute_size % 8; - float32x4_t vone = vdupq_n_f32(1.0f); - -#pragma omp parallel for - for (int c = 0; c < cmp_cnt; ++c) { - int i = c * 8; - int idx_inner = i % inner_num; - int idx_outer = (i / inner_num) * axis_size; - int real_index = idx_outer * inner_num + idx_inner; - - // get max axis_size == 4 - const float* din_ptr = din + real_index; - const float* din_ptr1 = din_ptr + inner_num; - const float* din_ptr2 = din_ptr1 + inner_num; - const float* din_ptr3 = din_ptr2 + inner_num; - float32x4_t vdata0 = vld1q_f32(din_ptr); - float32x4_t vdata1 = vld1q_f32(din_ptr1); - float32x4_t vdata2 = vld1q_f32(din_ptr2); - float32x4_t vdata3 = vld1q_f32(din_ptr3); - - float32x4_t vdata01 = vld1q_f32(din_ptr + 4); - float32x4_t vdata11 = vld1q_f32(din_ptr1 + 4); - float32x4_t vdata21 = vld1q_f32(din_ptr2 + 4); - float32x4_t vdata31 = vld1q_f32(din_ptr3 + 4); - - float* dout_ptr0 = dout + real_index; - float* dout_ptr1 = dout_ptr0 + inner_num; - float32x4_t vmax1 = vmaxq_f32(vdata0, vdata1); - float32x4_t vmax2 = vmaxq_f32(vdata2, vdata3); - float32x4_t vmax11 = vmaxq_f32(vdata01, vdata11); - float32x4_t vmax21 = vmaxq_f32(vdata21, vdata31); - float* dout_ptr2 = dout_ptr1 + inner_num; - float* dout_ptr3 = dout_ptr2 + inner_num; - float32x4_t vmax = vmaxq_f32(vmax1, vmax2); - float32x4_t vmax_1 = vmaxq_f32(vmax11, vmax21); - - // sub, exp and sum - float32x4_t vsum0 = exp_ps(vsubq_f32(vdata0, vmax)); - float32x4_t vsum1 = exp_ps(vsubq_f32(vdata1, vmax)); - float32x4_t vsum2 = exp_ps(vsubq_f32(vdata2, vmax)); - float32x4_t vsum3 = exp_ps(vsubq_f32(vdata3, vmax)); - - float32x4_t vsum01 = exp_ps(vsubq_f32(vdata01, vmax_1)); - float32x4_t vsum11 = exp_ps(vsubq_f32(vdata11, vmax_1)); - float32x4_t vsum21 = exp_ps(vsubq_f32(vdata21, vmax_1)); - float32x4_t vsum31 = exp_ps(vsubq_f32(vdata31, vmax_1)); - - float32x4_t vsum_1 = vaddq_f32(vsum0, vsum1); - float32x4_t vsum_2 = vaddq_f32(vsum2, vsum3); - float32x4_t vsum_11 = vaddq_f32(vsum01, vsum11); - float32x4_t vsum_21 = vaddq_f32(vsum21, vsum31); - - float32x4_t vsum = vaddq_f32(vsum_1, vsum_2); - float32x4_t vsum111 = vaddq_f32(vsum_11, vsum_21); - - float32x4_t vinf = div_ps(vone, vsum); - float32x4_t vinf1 = div_ps(vone, vsum111); - - vsum0 = vmulq_f32(vsum0, vinf); - vsum1 = vmulq_f32(vsum1, vinf); - vsum2 = vmulq_f32(vsum2, vinf); - vsum3 = vmulq_f32(vsum3, vinf); - - vsum01 = vmulq_f32(vsum01, vinf1); - vsum11 = vmulq_f32(vsum11, vinf1); - vsum21 = vmulq_f32(vsum21, vinf1); - vsum31 = vmulq_f32(vsum31, vinf1); - - vst1q_f32(dout_ptr0, vsum0); - vst1q_f32(dout_ptr1, vsum1); - vst1q_f32(dout_ptr2, vsum2); - vst1q_f32(dout_ptr3, vsum3); - - vst1q_f32(dout_ptr0 + 4, vsum01); - vst1q_f32(dout_ptr1 + 4, vsum11); - vst1q_f32(dout_ptr2 + 4, vsum21); - 
vst1q_f32(dout_ptr3 + 4, vsum31); - } - - int i = cmp_cnt * 8; - - if (remain > 4) { - int idx_inner = i % inner_num; - int idx_outer = (i / inner_num) * axis_size; - int real_index = idx_outer * inner_num + idx_inner; - // get max axis_size == 4 - const float* din_ptr = din + real_index; - const float* din_ptr1 = din_ptr + inner_num; - const float* din_ptr2 = din_ptr1 + inner_num; - const float* din_ptr3 = din_ptr2 + inner_num; - float32x4_t vdata0 = vld1q_f32(din_ptr); - float32x4_t vdata1 = vld1q_f32(din_ptr1); - float32x4_t vdata2 = vld1q_f32(din_ptr2); - float32x4_t vdata3 = vld1q_f32(din_ptr3); - - float* dout_ptr0 = dout + real_index; - float* dout_ptr1 = dout_ptr0 + inner_num; - float32x4_t vmax1 = vmaxq_f32(vdata0, vdata1); - float32x4_t vmax2 = vmaxq_f32(vdata2, vdata3); - float* dout_ptr2 = dout_ptr1 + inner_num; - float* dout_ptr3 = dout_ptr2 + inner_num; - float32x4_t vmax = vmaxq_f32(vmax1, vmax2); - - // sub, exp and sum - float32x4_t vsum0 = exp_ps(vsubq_f32(vdata0, vmax)); - float32x4_t vsum1 = exp_ps(vsubq_f32(vdata1, vmax)); - float32x4_t vsum2 = exp_ps(vsubq_f32(vdata2, vmax)); - float32x4_t vsum3 = exp_ps(vsubq_f32(vdata3, vmax)); - - float32x4_t vsum_1 = vaddq_f32(vsum0, vsum1); - float32x4_t vsum_2 = vaddq_f32(vsum2, vsum3); - - float32x4_t vsum = vaddq_f32(vsum_1, vsum_2); - - float32x4_t vone = vdupq_n_f32(1.0f); - float32x4_t vinf = div_ps(vone, vsum); - - vsum0 = vmulq_f32(vsum0, vinf); - vsum1 = vmulq_f32(vsum1, vinf); - vsum2 = vmulq_f32(vsum2, vinf); - vsum3 = vmulq_f32(vsum3, vinf); - - vst1q_f32(dout_ptr0, vsum0); - vst1q_f32(dout_ptr1, vsum1); - vst1q_f32(dout_ptr2, vsum2); - vst1q_f32(dout_ptr3, vsum3); - - i += 4; - } - for (; i < compute_size; i++) { - int idx_inner = i % inner_num; - int idx_outer = (i / inner_num) * axis_size; - int real_index = idx_outer * inner_num + idx_inner; - - float max_data = din[real_index]; - // get max - for (int j = 1; j < axis_size; ++j) { - real_index += inner_num; - max_data = din[real_index] > max_data ? 
din[real_index] : max_data; - } - - real_index = idx_outer * inner_num + idx_inner; - // sub, exp and sum - dout[real_index] = expf(din[real_index] - max_data); - float sum_data = dout[real_index]; - for (int j = 1; j < axis_size; ++j) { - real_index += inner_num; - dout[real_index] = expf(din[real_index] - max_data); - sum_data += dout[real_index]; - } - - float sum_inv = 1.f / sum_data; - real_index = idx_outer * inner_num + idx_inner; - // get softmax result - for (int j = 0; j < axis_size; ++j) { - dout[real_index] *= sum_inv; - real_index += inner_num; - } - } -} - -template <> -void softmax_inner4_axis4(const float* din, - float* dout, - const int axis_size, - const int inner_num, - const int outer_num) { - int compute_size = inner_num * outer_num; - int cmp_cnt = compute_size >> 2; - int remain = compute_size % 4; - float32x4_t vone = vdupq_n_f32(1.0f); - -#pragma omp parallel for - for (int c = 0; c < cmp_cnt; ++c) { - int i = c * 4; - int idx_inner = i % inner_num; - int idx_outer = (i / inner_num) * axis_size; - int real_index = idx_outer * inner_num + idx_inner; - - // get max axis_size == 4 - const float* din_ptr = din + real_index; - const float* din_ptr1 = din_ptr + inner_num; - const float* din_ptr2 = din_ptr1 + inner_num; - const float* din_ptr3 = din_ptr2 + inner_num; - float32x4_t vdata0 = vld1q_f32(din_ptr); - float32x4_t vdata1 = vld1q_f32(din_ptr1); - float32x4_t vdata2 = vld1q_f32(din_ptr2); - float32x4_t vdata3 = vld1q_f32(din_ptr3); - - float* dout_ptr0 = dout + real_index; - float* dout_ptr1 = dout_ptr0 + inner_num; - float32x4_t vmax1 = vmaxq_f32(vdata0, vdata1); - float32x4_t vmax2 = vmaxq_f32(vdata2, vdata3); - float* dout_ptr2 = dout_ptr1 + inner_num; - float* dout_ptr3 = dout_ptr2 + inner_num; - float32x4_t vmax = vmaxq_f32(vmax1, vmax2); - - // sub, exp and sum - float32x4_t vsum0 = exp_ps(vsubq_f32(vdata0, vmax)); - float32x4_t vsum1 = exp_ps(vsubq_f32(vdata1, vmax)); - float32x4_t vsum2 = exp_ps(vsubq_f32(vdata2, vmax)); - float32x4_t vsum3 = exp_ps(vsubq_f32(vdata3, vmax)); - - float32x4_t vsum_1 = vaddq_f32(vsum0, vsum1); - float32x4_t vsum_2 = vaddq_f32(vsum2, vsum3); - - float32x4_t vsum = vaddq_f32(vsum_1, vsum_2); - - float32x4_t vinf = div_ps(vone, vsum); - - vsum0 = vmulq_f32(vsum0, vinf); - vsum1 = vmulq_f32(vsum1, vinf); - vsum2 = vmulq_f32(vsum2, vinf); - vsum3 = vmulq_f32(vsum3, vinf); - - vst1q_f32(dout_ptr0, vsum0); - vst1q_f32(dout_ptr1, vsum1); - vst1q_f32(dout_ptr2, vsum2); - vst1q_f32(dout_ptr3, vsum3); - } - - int i = cmp_cnt * 8; - for (; i < compute_size; i++) { - int idx_inner = i % inner_num; - int idx_outer = (i / inner_num) * axis_size; - int real_index = idx_outer * inner_num + idx_inner; - - float max_data = din[real_index]; - // get max - for (int j = 1; j < axis_size; ++j) { - real_index += inner_num; - max_data = din[real_index] > max_data ? 
din[real_index] : max_data; - } - - real_index = idx_outer * inner_num + idx_inner; - // sub, exp and sum - dout[real_index] = expf(din[real_index] - max_data); - float sum_data = dout[real_index]; - for (int j = 1; j < axis_size; ++j) { - real_index += inner_num; - dout[real_index] = expf(din[real_index] - max_data); - sum_data += dout[real_index]; - } - - float sum_inv = 1.f / sum_data; - real_index = idx_outer * inner_num + idx_inner; - // get softmax result - for (int j = 0; j < axis_size; ++j) { - dout[real_index] *= sum_inv; - real_index += inner_num; - } - } -} - -template <> -void softmax_inner8(const float* din, - float* dout, - const int axis_size, - const int inner_num, - const int outer_num) { - int compute_size = inner_num * outer_num; - int cmp_cnt = compute_size >> 3; -#pragma omp parallel for - for (int c = 0; c < cmp_cnt; ++c) { - int i = c * 8; - int idx_inner = i % inner_num; - int idx_outer = (i / inner_num) * axis_size; - int real_index = idx_outer * inner_num + idx_inner; - - const float* din_ptr = din + real_index; - float32x4_t vmax = vld1q_f32(din_ptr); - float32x4_t vmax2 = vld1q_f32(din_ptr + 4); - // get max - for (int j = 1; j < axis_size; ++j) { - din_ptr += inner_num; - float32x4_t vdata = vld1q_f32(din_ptr); - float32x4_t vdata2 = vld1q_f32(din_ptr + 4); - vmax = vmaxq_f32(vmax, vdata); - vmax2 = vmaxq_f32(vmax2, vdata2); - } - - // sub, exp and sum - din_ptr = din + real_index; - float* dout_ptr = dout + real_index; - float32x4_t vdata = vld1q_f32(din_ptr); - float32x4_t vdata2 = vld1q_f32(din_ptr + 4); - float32x4_t vsum = exp_ps(vsubq_f32(vdata, vmax)); - float32x4_t vsum2 = exp_ps(vsubq_f32(vdata2, vmax2)); - din_ptr += inner_num; - vst1q_f32(dout_ptr, vsum); - vst1q_f32(dout_ptr + 4, vsum2); - dout_ptr += inner_num; - for (int j = 1; j < axis_size; ++j) { - float32x4_t vdata0 = vld1q_f32(din_ptr); - float32x4_t vdata1 = vld1q_f32(din_ptr + 4); - vdata0 = exp_ps(vsubq_f32(vdata0, vmax)); - vdata1 = exp_ps(vsubq_f32(vdata1, vmax2)); - din_ptr += inner_num; - vsum = vaddq_f32(vsum, vdata0); - vsum2 = vaddq_f32(vsum2, vdata1); - vst1q_f32(dout_ptr, vdata0); - vst1q_f32(dout_ptr + 4, vdata1); - dout_ptr += inner_num; - } - - float32x4_t vone = vdupq_n_f32(1.0f); - float32x4_t vinf = div_ps(vone, vsum); - float32x4_t vinf2 = div_ps(vone, vsum2); - dout_ptr = dout + real_index; - // get softmax result - for (int j = 0; j < axis_size; ++j) { - float32x4_t vdata0 = vld1q_f32(dout_ptr); - float32x4_t vdata1 = vld1q_f32(dout_ptr + 4); - vdata0 = vmulq_f32(vdata0, vinf); - vdata1 = vmulq_f32(vdata1, vinf2); - vst1q_f32(dout_ptr, vdata0); - vst1q_f32(dout_ptr + 4, vdata1); - dout_ptr += inner_num; - } - } - - for (int i = cmp_cnt * 8; i < compute_size; i++) { - int idx_inner = i % inner_num; - int idx_outer = (i / inner_num) * axis_size; - int real_index = idx_outer * inner_num + idx_inner; - - float max_data = din[real_index]; - // get max - for (int j = 1; j < axis_size; ++j) { - real_index += inner_num; - max_data = din[real_index] > max_data ? 
din[real_index] : max_data; - } - - real_index = idx_outer * inner_num + idx_inner; - // sub, exp and sum - dout[real_index] = expf(din[real_index] - max_data); - float sum_data = dout[real_index]; - for (int j = 1; j < axis_size; ++j) { - real_index += inner_num; - dout[real_index] = expf(din[real_index] - max_data); - sum_data += dout[real_index]; - } - - float sum_inv = 1.f / sum_data; - real_index = idx_outer * inner_num + idx_inner; - // get softmax result - for (int j = 0; j < axis_size; ++j) { - dout[real_index] *= sum_inv; - real_index += inner_num; - } - } -} - -template <> -void softmax_inner4(const float* din, - float* dout, - const int axis_size, - const int inner_num, - const int outer_num) { - int compute_size = inner_num * outer_num; - int cmp_cnt = compute_size >> 2; -#pragma omp parallel for - for (int c = 0; c < cmp_cnt; ++c) { - int i = c * 4; - int idx_inner = i % inner_num; - int idx_outer = (i / inner_num) * axis_size; - int real_index = idx_outer * inner_num + idx_inner; - - // float max_data = din[real_index]; - const float* din_ptr = din + real_index; - float32x4_t vmax = vld1q_f32(din_ptr); - // get max - for (int j = 1; j < axis_size; ++j) { - din_ptr += inner_num; - float32x4_t vdata = vld1q_f32(din_ptr); - vmax = vmaxq_f32(vmax, vdata); - } - // sub, exp and sum - din_ptr = din + real_index; - float* dout_ptr = dout + real_index; - float32x4_t vdata = vld1q_f32(din_ptr); - float32x4_t vsum = exp_ps(vsubq_f32(vdata, vmax)); - din_ptr += inner_num; - vst1q_f32(dout_ptr, vsum); - dout_ptr += inner_num; - for (int j = 1; j < axis_size; ++j) { - // real_index += inner_num; - float32x4_t vdata0 = vld1q_f32(din_ptr); - vdata0 = exp_ps(vsubq_f32(vdata0, vmax)); - din_ptr += inner_num; - vsum = vaddq_f32(vsum, vdata0); - vst1q_f32(dout_ptr, vdata0); - dout_ptr += inner_num; - } - - float32x4_t vone = vdupq_n_f32(1.0f); - float32x4_t vinf = div_ps(vone, vsum); - dout_ptr = dout + real_index; - // get softmax result - for (int j = 0; j < axis_size; ++j) { - float32x4_t vdata0 = vld1q_f32(dout_ptr); - vdata0 = vmulq_f32(vdata0, vinf); - vst1q_f32(dout_ptr, vdata0); - dout_ptr += inner_num; - } - } - - for (int i = cmp_cnt * 4; i < compute_size; i++) { - int idx_inner = i % inner_num; - int idx_outer = (i / inner_num) * axis_size; - int real_index = idx_outer * inner_num + idx_inner; - - float max_data = din[real_index]; - // get max - for (int j = 1; j < axis_size; ++j) { - real_index += inner_num; - max_data = din[real_index] > max_data ? 
din[real_index] : max_data; - } - - real_index = idx_outer * inner_num + idx_inner; - // sub, exp and sum - dout[real_index] = expf(din[real_index] - max_data); - float sum_data = dout[real_index]; - for (int j = 1; j < axis_size; ++j) { - real_index += inner_num; - dout[real_index] = expf(din[real_index] - max_data); - sum_data += dout[real_index]; - } - - float sum_inv = 1.f / sum_data; - real_index = idx_outer * inner_num + idx_inner; - // get softmax result - for (int j = 0; j < axis_size; ++j) { - dout[real_index] *= sum_inv; - real_index += inner_num; - } - } -} - -template <> -void softmax_inner1_large_axis(const float* din, - float* dout, - const int outer_size, - const int axis_size) { -#pragma omp parallel for - for (int i = 0; i < outer_size; ++i) { - const float* din_ptr = din + i * axis_size; - float* dout_ptr = dout + i * axis_size; - - const float* din_max_ptr = din_ptr; - int nn = axis_size >> 2; - - // get max - float32x4_t vmax = vld1q_f32(din_max_ptr); - din_max_ptr += 4; - int j = 1; - for (; j < nn; ++j) { - vmax = vmaxq_f32(vmax, vld1q_f32(din_max_ptr)); - din_max_ptr += 4; - } - float32x2_t vhmax = vmax_f32(vget_high_f32(vmax), vget_low_f32(vmax)); - float max_data = std::max(vget_lane_f32(vhmax, 0), vget_lane_f32(vhmax, 1)); - for (j = 4 * j; j < axis_size; ++j) { - max_data = std::max(max_data, din_max_ptr[0]); - din_max_ptr++; - } - - // sub, exp and sum - const float* din_sum_ptr = din_ptr; - float* dout_sum_ptr = dout_ptr; - vmax = vdupq_n_f32(max_data); - float32x4_t vsub_exp = exp_ps(vsubq_f32(vld1q_f32(din_sum_ptr), vmax)); - float32x4_t vsum = vsub_exp; - vst1q_f32(dout_sum_ptr, vsub_exp); - din_sum_ptr += 4; - dout_sum_ptr += 4; - - j = 1; - for (; j < nn; ++j) { - vsub_exp = exp_ps(vsubq_f32(vld1q_f32(din_sum_ptr), vmax)); - vst1q_f32(dout_sum_ptr, vsub_exp); - vsum = vaddq_f32(vsum, vsub_exp); - din_sum_ptr += 4; - dout_sum_ptr += 4; - } - float32x2_t vhsum = vadd_f32(vget_high_f32(vsum), vget_low_f32(vsum)); - float sum_data = vget_lane_f32(vhsum, 0) + vget_lane_f32(vhsum, 1); - - for (j = 4 * j; j < axis_size; ++j) { - dout_sum_ptr[0] = expf(din_sum_ptr[0] - max_data); - sum_data += dout_sum_ptr[0]; - din_sum_ptr++; - dout_sum_ptr++; - } - - float sum_inv = 1.f / sum_data; - float* dout_res_ptr = dout_ptr; - float32x4_t vinv = vdupq_n_f32(sum_inv); - // get softmax result - j = 0; - for (; j < nn; ++j) { - float32x4_t vout = vld1q_f32(dout_res_ptr); - float32x4_t vres = vmulq_f32(vout, vinv); - vst1q_f32(dout_res_ptr, vres); - dout_res_ptr += 4; - } - for (j = nn * 4; j < axis_size; ++j) { - dout_ptr[j] *= sum_inv; - } - } -} - -template <> -void softmax_inner1_small_axis(const float* din, - float* dout, - const int outer_size, - const int axis_size) { -#pragma omp parallel for - for (int i = 0; i < outer_size; ++i) { - const float* din_ptr = din + i * axis_size; - float* dout_ptr = dout + i * axis_size; - // get max - float max_data = din_ptr[0]; - for (int j = 1; j < axis_size; ++j) { - max_data = std::max(max_data, din_ptr[j]); - } - - // sub, exp and sum - float sum_data = 0.f; - for (int j = 0; j < axis_size; ++j) { - dout_ptr[j] = expf(din_ptr[j] - max_data); - sum_data += dout_ptr[j]; - } - - float sum_inv = 1.f / sum_data; - for (int j = 0; j < axis_size; ++j) { - dout_ptr[j] *= sum_inv; - } - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/softmax.h b/lite/backends/arm/math/softmax.h deleted file mode 100644 index cc1957a73e..0000000000 --- 
a/lite/backends/arm/math/softmax.h
+++ /dev/null
@@ -1,71 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-template <typename T>
-void softmax_basic(const T* din,
-                   T* dout,
-                   const int axis_size,
-                   const int inner_num,
-                   const int outer_num);
-
-template <typename T>
-void softmax_inner8_axis4(const T* din,
-                          T* dout,
-                          const int axis_size,
-                          const int inner_num,
-                          const int outer_num);
-
-template <typename T>
-void softmax_inner4_axis4(const T* din,
-                          T* dout,
-                          const int axis_size,
-                          const int inner_num,
-                          const int outer_num);
-template <typename T>
-void softmax_inner8(const T* din,
-                    T* dout,
-                    const int axis_size,
-                    const int inner_num,
-                    const int outer_num);
-
-template <typename T>
-void softmax_inner4(const T* din,
-                    T* dout,
-                    const int axis_size,
-                    const int inner_num,
-                    const int outer_num);
-
-template <typename T>
-void softmax_inner1_large_axis(const T* din,
-                               T* dout,
-                               const int outer_size,
-                               const int axis_size);
-
-template <typename T>
-void softmax_inner1_small_axis(const T* din,
-                               T* dout,
-                               const int outer_size,
-                               const int axis_size);
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/split.cc b/lite/backends/arm/math/split.cc
deleted file mode 100644
index 54ea7e62c2..0000000000
--- a/lite/backends/arm/math/split.cc
+++ /dev/null
@@ -1,85 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
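All of the softmax variants declared in softmax.h above share one indexing scheme: the softmax axis has axis_size elements spaced inner_num apart, and there are outer_num * inner_num independent lanes. The inner8/inner4 variants process 8 or 4 adjacent lanes per NEON iteration, and the inner1 variants specialize the contiguous (inner_num == 1) case. A scalar sketch of a single lane (hypothetical helper, equivalent to one iteration of softmax_basic shown earlier):

#include <algorithm>
#include <cmath>

// One softmax lane: elements at in[0], in[stride], ..., in[(axis_size-1)*stride].
static void softmax_lane(const float* in, float* out, int axis_size, int stride) {
  float max_v = in[0];
  for (int j = 1; j < axis_size; ++j)  // subtract the max for numerical stability
    max_v = std::max(max_v, in[j * stride]);
  float sum = 0.f;
  for (int j = 0; j < axis_size; ++j) {  // exponentiate and accumulate the sum
    out[j * stride] = std::exp(in[j * stride] - max_v);
    sum += out[j * stride];
  }
  float inv = 1.f / sum;  // one division, then scale (matches the div_ps/vinf trick)
  for (int j = 0; j < axis_size; ++j) out[j * stride] *= inv;
}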
- -#include "lite/backends/arm/math/split.h" -#include -#include "lite/backends/arm/math/funcs.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template <> -void split_cpy(const float* din, float* dout, int num) { - int cnt = num >> 4; - int remain = num % 16; -#pragma omp parallel for - for (int i = 0; i < cnt; i++) { - const float* din_ptr = din + (i << 4); - float* dout_ptr = dout + (i << 4); - - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); - float32x4_t din2 = vld1q_f32(din_ptr + 8); - float32x4_t din3 = vld1q_f32(din_ptr + 12); - - vst1q_f32(dout_ptr, din0); - vst1q_f32(dout_ptr + 4, din1); - vst1q_f32(dout_ptr + 8, din2); - vst1q_f32(dout_ptr + 12, din3); - } - if (remain > 0) { - const float* din_ptr = din + (cnt << 4); - float* dout_ptr = dout + (cnt << 4); - for (int i = 0; i < remain; i++) { - *dout_ptr = *din_ptr; - dout_ptr++; - din_ptr++; - } - } -} - -template <> -void split(const float* din, - const std::vector& dout, - const int axis, - const std::vector& in_strides) { - int input_offset = 0; - for (auto out : dout) { - auto out_dim = out->dims(); - std::vector out_strides(out_dim.size()); - out_strides[out_dim.size() - 1] = out_dim[out_dim.size() - 1]; - for (int i = out_dim.size() - 2; i >= 0; --i) { - out_strides[i] = out_strides[i + 1] * out_dim[i]; - } - - float* out_data = out->mutable_data(); - int before = out_strides[0] / out_strides[axis]; - int in_after = in_strides[axis]; - int out_after = out_strides[axis]; - - for (int i = 0; i < before; ++i) { - split_cpy(din + input_offset + i * in_after, - out_data + i * out_after, - out_after); - } - input_offset += out_strides[axis]; - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/split.h b/lite/backends/arm/math/split.h deleted file mode 100644 index 2c6f392cc5..0000000000 --- a/lite/backends/arm/math/split.h +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include "lite/core/op_lite.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template -void split_cpy(const T* din, T* dout, int num); - -template -void split(const T* din, - const std::vector& dout, - const int axis, - const std::vector& in_strides); - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/stack.cc b/lite/backends/arm/math/stack.cc deleted file mode 100644 index e017a8d01e..0000000000 --- a/lite/backends/arm/math/stack.cc +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/backends/arm/math/stack.h"
-#include <cstring>
-#include <vector>
-#include "lite/backends/arm/math/funcs.h"
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-void stack(std::vector<lite::Tensor *> x, lite::Tensor *y, int axis) {
-  if (axis < 0) axis += (x[0]->dims().size() + 1);
-  int n = x.size();
-  auto *y_data = y->mutable_data<float>();
-  std::vector<const float *> x_datas(n);
-  for (int i = 0; i < n; i++) x_datas[i] = x[i]->data<float>();
-
-  int pre = 1, post = 1;
-  auto &dim = x[0]->dims();
-  for (auto i = 0; i < axis; ++i) pre *= dim[i];
-  for (auto i = axis; i < dim.size(); ++i) post *= dim[i];
-
-  auto x_data_arr = x_datas.data();
-
-  size_t x_offset = 0;
-  size_t y_offset = 0;
-  for (int i = 0; i < pre; i++) {
-    for (int j = 0; j < n; j++) {
-      std::memcpy(
-          y_data + y_offset, x_data_arr[j] + x_offset, post * sizeof(float));
-      y_offset += post;
-    }
-    x_offset += post;
-  }
-}
-
-} /* namespace math */
-} /* namespace arm */
-} /* namespace lite */
-} /* namespace paddle */
diff --git a/lite/backends/arm/math/stack.h b/lite/backends/arm/math/stack.h
deleted file mode 100644
index 2000b3da60..0000000000
--- a/lite/backends/arm/math/stack.h
+++ /dev/null
@@ -1,30 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-
-#include
-#include <vector>
-#include "lite/core/tensor.h"
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-void stack(std::vector<lite::Tensor*> x, lite::Tensor* out, int axis);
-
-} /* namespace math */
-} /* namespace arm */
-} /* namespace lite */
-} /* namespace paddle */
diff --git a/lite/backends/arm/math/topk.cc b/lite/backends/arm/math/topk.cc
deleted file mode 100644
index c9239134e1..0000000000
--- a/lite/backends/arm/math/topk.cc
+++ /dev/null
@@ -1,53 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
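The stack routine deleted above reduces to a two-level copy: each input tensor is viewed as a pre x post matrix, and rows of length post from the n inputs are interleaved into the output, which realizes concatenation along a newly inserted axis. A standalone sketch of the copy pattern on raw buffers (hypothetical names, float-only as in the deleted code):

#include <cstring>
#include <vector>

// Interleave n buffers, each of logical shape (pre, post), into y so that
// y[i][j][:] = x[j][i][:] -- the same memcpy loop as the deleted stack().
static void stack_rows(const std::vector<const float*>& x, float* y,
                       int pre, int post) {
  size_t x_off = 0, y_off = 0;
  for (int i = 0; i < pre; ++i) {
    for (size_t j = 0; j < x.size(); ++j) {
      std::memcpy(y + y_off, x[j] + x_off, post * sizeof(float));
      y_off += post;
    }
    x_off += post;  // advance to row i+1 of every input
  }
}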
- -#include "lite/backends/arm/math/topk.h" -#include -#include -#include "lite/backends/arm/math/funcs.h" -namespace paddle { -namespace lite { -namespace arm { -namespace math { -bool comp_func(std::pair a, std::pair b) { - return (a.first > b.first); -} - -void topk(const float* in_data, - float* out_val, - int* out_ind, - int m, - int n, - int k, - Context* ctx) { - for (int i = 0; i < m; i++) { - const float* in_tmp = in_data + i * n; - float* out_val_tmp = out_val + i * k; - int* out_ind_tmp = out_ind + i * k; - std::vector> vec; - for (int j = 0; j < n; j++) { - vec.push_back(std::make_pair(in_tmp[j], j)); - } - std::partial_sort(vec.begin(), vec.begin() + k, vec.end(), comp_func); - for (int q = 0; q < k; q++) { - out_val_tmp[q] = vec[q].first; - out_ind_tmp[q] = vec[q].second; - } - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/topk.h b/lite/backends/arm/math/topk.h deleted file mode 100644 index 5bf472e1af..0000000000 --- a/lite/backends/arm/math/topk.h +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include "lite/core/context.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void topk(const float* din, - float* out_val, - int* out_ind, - int m, - int n, - int k, - Context* ctx); - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/type_trans.cc b/lite/backends/arm/math/type_trans.cc deleted file mode 100644 index 6ded50e752..0000000000 --- a/lite/backends/arm/math/type_trans.cc +++ /dev/null @@ -1,919 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/backends/arm/math/type_trans.h" -#include -#include -#include -#include "lite/backends/arm/math/saturate.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template -void int32_to_dtype(const int* din, - dtype* dout, - const float* scale, - int axis_size, - int64_t outer_size, - int64_t inner_size); - -void fp32_to_int8(const float* din, - int8_t* dout, - const float* scale, - int axis_size, - int64_t outer_size, - int64_t inner_size) { - int cnt = inner_size / 16; - int remain = inner_size & 15; - int64_t loop_size = outer_size * axis_size; - -#pragma omp parallel for - for (int j = 0; j < loop_size; ++j) { - float inv_scale = 1.f / scale[j % axis_size]; - float32x4_t vzero = vdupq_n_f32(0.f); - float32x4_t vscale = vdupq_n_f32(inv_scale); - float32x4_t vpoff = vdupq_n_f32(0.5f); - float32x4_t vnoff = vdupq_n_f32(-0.5f); - const float* din_c = din + j * inner_size; - signed char* dout_c = dout + j * inner_size; - if (cnt > 0) { - int cnt_loop = cnt; - const float* din_ptr = din_c; - signed char* dout_ptr = dout_c; -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[in]], #32 \n" - "ldp q2, q3, [%[in]], #32 \n" - "0: \n" /* main loop */ - "fmul v4.4s, v0.4s, %[scale].4s \n" - "fmul v5.4s, v1.4s, %[scale].4s \n" - "fmul v6.4s, v2.4s, %[scale].4s \n" - "fmul v7.4s, v3.4s, %[scale].4s \n" - "ldp q0, q1, [%[in]], #32 \n" - "subs %[cnt], %[cnt], #1 \n" - "FCVTAS v8.4s, v4.4s \n" - "FCVTAS v9.4s, v5.4s \n" - "FCVTAS v10.4s, v6.4s \n" - "FCVTAS v11.4s, v7.4s \n" - "ldp q2, q3, [%[in]], #32 \n" - "sqxtn v4.4h, v8.4s \n" - "sqxtn2 v4.8h, v9.4s \n" - "sqxtn v5.4h, v10.4s \n" - "sqxtn2 v5.8h, v11.4s \n" - "sqxtn v8.8b, v4.8h \n" - "sqxtn2 v8.16b, v5.8h \n" - "str q8, [%[out]], #16 \n" - "bne 0b \n" - : [in] "+r"(din_ptr), [out] "+r"(dout_ptr), [cnt] "+r"(cnt_loop) - : [scale] "w"(vscale) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11"); -#else - asm volatile( - "vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n" - "vld1.32 {d4-d7}, [%[din]]! @ load in8~in16\n" - "0: @ main loop\n" - "vand.i32 q4, %q[vpoff], %q[vpoff] @ set offset, 0.5\n" - "vand.i32 q5, q4, q4 @ set offset, 0.5\n" - "vand.i32 q6, q4, q4 @ set offset, 0.5\n" - "vand.i32 q7, q4, q4 @ set offset, 0.5\n" - "vcgt.f32 q8, q0, %q[vzero] @ get mask > 0, in0\n" - "vcgt.f32 q9, q1, %q[vzero] @ get mask > 0, in1\n" - "vcgt.f32 q10, q2, %q[vzero] @ get mask > 0, in2\n" - "vcgt.f32 q11, q3, %q[vzero] @ get mask > 0, in3\n" - "vbif.f32 q4, %q[vnoff], q8 @ get right offset\n" - "vbif.f32 q5, %q[vnoff], q9 @ get right offset\n" - "vbif.f32 q6, %q[vnoff], q10 @ get right offset\n" - "vbif.f32 q7, %q[vnoff], q11 @ get right offset\n" - "vmla.f32 q4, q0, %q[vscale] @ mul scale\n" - "vmla.f32 q5, q1, %q[vscale] @ mul scale\n" - "vmla.f32 q6, q2, %q[vscale] @ mul scale\n" - "vmla.f32 q7, q3, %q[vscale] @ mul scale\n" - "vcvt.s32.f32 q0, q4 @ cvt to int32\n" - "vcvt.s32.f32 q1, q5 @ cvt to int32\n" - "vcvt.s32.f32 q2, q6 @ cvt to int32\n" - "vcvt.s32.f32 q3, q7 @ cvt to int32\n" - "vqmovn.s32 d8, q0 @ cnt to int16\n" - "vqmovn.s32 d9, q1 @ cnt to int16\n" - "vqmovn.s32 d10, q2 @ cnt to int16\n" - "vqmovn.s32 d11, q3 @ cnt to int16\n" - "vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n" - "vqmovn.s16 d12, q4 @ cnt to int8\n" - "vqmovn.s16 d13, q5 @ cnt to int8\n" - "vld1.32 {d4-d7}, [%[din]]! @ load in8~in16\n" - "vst1.32 {d12-d13}, [%[dout]]! 
@ write to output\n" - "subs %[cnt], #1 @ loop count -1\n" - "bne 0b @ to main loop\n" - - : [dout] "+r"(dout_ptr), [din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) - : [vscale] "w"(vscale), - [vpoff] "w"(vpoff), - [vnoff] "w"(vnoff), - [vzero] "w"(vzero) - : "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11"); -#endif - } - const float* din_r = din_c + 16 * cnt; - signed char* dout_r = dout_c + 16 * cnt; - for (int i = 0; i < remain; ++i) { - dout_r[i] = saturate_cast(roundf(inv_scale * din_r[i])); - } - } -} - -void fp32_to_int16(const float* din, - int16_t* dout, - const float* scale, - int axis_size, - int64_t outer_size, - int64_t inner_size) { - int cnt = inner_size / 8; - int remain = inner_size & 7; - int64_t loop_size = outer_size * axis_size; - -#pragma omp parallel for - for (int j = 0; j < loop_size; ++j) { - float inv_scale = 1.f / scale[j % axis_size]; - float32x4_t vzero = vdupq_n_f32(0.f); - float32x4_t vscale = vdupq_n_f32(inv_scale); - float32x4_t vpoff = vdupq_n_f32(0.5f); - float32x4_t vnoff = vdupq_n_f32(-0.5f); - const float* din_c = din + j * inner_size; - int16_t* dout_c = dout + j * inner_size; - if (cnt > 0) { - int cnt_loop = cnt; - const float* din_ptr = din_c; - int16_t* dout_ptr = dout_c; -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[in]], #32 \n" - "0: \n" /* main loop */ - "fmul v4.4s, v0.4s, %[scale].4s \n" - "fmul v5.4s, v1.4s, %[scale].4s \n" - "ldp q0, q1, [%[in]], #32 \n" - "subs %[cnt], %[cnt], #1 \n" - "FCVTAS v8.4s, v4.4s \n" - "FCVTAS v9.4s, v5.4s \n" - "sqxtn v4.4h, v8.4s \n" - "sqxtn2 v4.8h, v9.4s \n" - "str q4, [%[out]], #16 \n" - "bne 0b \n" - : [in] "+r"(din_ptr), [out] "+r"(dout_ptr), [cnt] "+r"(cnt_loop) - : [scale] "w"(vscale) - : "v0", "v1", "v4", "v5", "v8", "v9"); -#else - asm volatile( - "vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n" - "0: @ main loop\n" - "vand.i32 q4, %q[vpoff], %q[vpoff] @ set offset, 0.5\n" - "vand.i32 q5, q4, q4 @ set offset, 0.5\n" - "vand.i32 q6, q4, q4 @ set offset, 0.5\n" - "vand.i32 q7, q4, q4 @ set offset, 0.5\n" - "vcgt.f32 q8, q0, %q[vzero] @ get mask > 0, in0\n" - "vcgt.f32 q9, q1, %q[vzero] @ get mask > 0, in1\n" - "vbif.f32 q4, %q[vnoff], q8 @ get right offset\n" - "vbif.f32 q5, %q[vnoff], q9 @ get right offset\n" - "vmla.f32 q4, q0, %q[vscale] @ mul scale\n" - "vmla.f32 q5, q1, %q[vscale] @ mul scale\n" - "vcvt.s32.f32 q0, q4 @ cvt to int32\n" - "vcvt.s32.f32 q1, q5 @ cvt to int32\n" - "vqmovn.s32 d8, q0 @ cnt to int16\n" - "vqmovn.s32 d9, q1 @ cnt to int16\n" - "vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n" - "vst1.32 {d8-d9}, [%[dout]]! 
@ write to output\n" - "subs %[cnt], #1 @ loop count -1\n" - "bne 0b @ to main loop\n" - - : [dout] "+r"(dout_ptr), [din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) - : [vscale] "w"(vscale), - [vpoff] "w"(vpoff), - [vnoff] "w"(vnoff), - [vzero] "w"(vzero) - : "q0", "q1", "q4", "q5", "q6", "q7", "q8", "q9"); -#endif - } - const float* din_r = din_c + 8 * cnt; - int16_t* dout_r = dout_c + 8 * cnt; - for (int i = 0; i < remain; ++i) { - dout_r[i] = saturate_cast(roundf(inv_scale * din_r[i])); - } - } -} - -void int8_to_fp32(const int8_t* in, - float* out, - const float* scale, - int axis_size, - int64_t outer_size, - int64_t inner_size) { - int cnt = inner_size / 16; - int remain = inner_size & 15; - int64_t loop_size = axis_size * outer_size; -#pragma omp parallel for - for (int64_t n = 0; n < loop_size; ++n) { - float in_scale = scale[n % axis_size]; - const signed char* din_c = in + n * inner_size; - float* dout_c = out + n * inner_size; - float32x4_t vscale = vdupq_n_f32(in_scale); - if (cnt > 0) { - int loop = cnt; - const signed char* din_ptr = din_c; - float* dout_ptr = dout_c; -#ifdef __aarch64__ - asm volatile( - "ldp d0, d1, [%[in]], #16 \n" /* load 16 int8*/ - "0: \n" /* main loop */ - "sshll v2.8h, v0.8b, #0 \n" /* trans to int16*/ - "sshll v3.8h, v1.8b, #0 \n" /* trans to int16*/ - - "sshll v4.4s, v2.4h, #0 \n" /* trans to int32*/ - "sshll2 v5.4s, v2.8h, #0 \n" /* trans to int32*/ - "sshll v6.4s, v3.4h, #0 \n" /* trans to int32*/ - "sshll2 v7.4s, v3.8h, #0 \n" /* trans to int32*/ - - "ldp d0, d1, [%[in]], #16 \n" /* load 16 int8*/ - - "scvtf v8.4s, v4.4s \n" /* trans to fp32*/ - "scvtf v9.4s, v5.4s \n" /* trans to fp32*/ - "scvtf v10.4s, v6.4s \n" /* trans to fp32*/ - "scvtf v11.4s, v7.4s \n" /* trans to fp32*/ - - "subs %[loop], %[loop], #1 \n" - - "fmul v4.4s, v8.4s, %[scale].4s \n" /* mul with scale*/ - "fmul v5.4s, v9.4s, %[scale].4s \n" /* mul with scale*/ - "fmul v6.4s, v10.4s, %[scale].4s \n" /* mul with scale*/ - "fmul v7.4s, v11.4s, %[scale].4s \n" /* mul with scale*/ - - "stp q4, q5, [%[out]], #32 \n" /* write to memory*/ - "stp q6, q7, [%[out]], #32 \n" /* write to memory*/ - - "bne 0b \n" - : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) - : [scale] "w"(vscale) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11"); -#else - asm volatile( - "vld1.32 {d0-d1}, [%[in]]! @ load 16 int8\n" - "0: @ main loop\n" - "vmovl.s8 q2, d0 @ trans to int16\n" - "vmovl.s8 q3, d1 @ trans to int16\n" - "vmovl.s16 q4, d4 @ trans to int32\n" - "vmovl.s16 q5, d5 @ trans to int32\n" - "vmovl.s16 q6, d6 @ trans to int32\n" - "vmovl.s16 q7, d7 @ trans to int32\n" - "vcvt.f32.s32 q0, q4 @ trans to fp32\n" - "vcvt.f32.s32 q1, q5 @ trans to fp32\n" - "vcvt.f32.s32 q2, q6 @ trans to fp32\n" - "vcvt.f32.s32 q3, q7 @ trans to fp32\n" - "vmul.f32 q4, q0, %q[scale] @ mul with scale\n" - "vmul.f32 q5, q1, %q[scale] @ mul with scale\n" - "vmul.f32 q6, q2, %q[scale] @ mul with scale\n" - "vmul.f32 q7, q3, %q[scale] @ mul with scale\n" - - "vld1.32 {d0-d1}, [%[in]]! @ load 16 int8\n" - - "subs %[loop], #1 \n" - - "vst1.f32 {d8-d11}, [%[out]]! @ write to memory\n" - "vst1.f32 {d12-d15}, [%[out]]! 
@ write to memory\n" - - "bne 0b \n" - : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) - : [scale] "w"(vscale) - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); -#endif // __aarch64__ - } - const signed char* din_r = din_c + 16 * cnt; - float* dout_r = dout_c + 16 * cnt; - for (int i = 0; i < remain; ++i) { - dout_r[i] = in_scale * din_r[i]; - } - } -} - -void int16_to_fp32(const int16_t* in, - float* out, - const float* scale, - int axis_size, - int64_t outer_size, - int64_t inner_size) { - int cnt = inner_size / 16; - int remain = inner_size & 15; - int64_t loop_size = axis_size * outer_size; -#pragma omp parallel for - for (int64_t n = 0; n < loop_size; ++n) { - float in_scale = scale[n % axis_size]; - const int16_t* din_c = in + n * inner_size; - float* dout_c = out + n * inner_size; - float32x4_t vscale = vdupq_n_f32(in_scale); - if (cnt > 0) { - int loop = cnt; - const int16_t* din_ptr = din_c; - float* dout_ptr = dout_c; -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[in]], #32 \n" /* load 16 int16*/ - "0: \n" /* main loop */ - "sshll v4.4s, v0.4h, #0 \n" /* trans to int32*/ - "sshll2 v5.4s, v0.8h, #0 \n" /* trans to int32*/ - "sshll v6.4s, v1.4h, #0 \n" /* trans to int32*/ - "sshll2 v7.4s, v1.8h, #0 \n" /* trans to int32*/ - - "ldp q0, q1, [%[in]], #32 \n" /* load 16 int16*/ - - "scvtf v8.4s, v4.4s \n" /* trans to fp32*/ - "scvtf v9.4s, v5.4s \n" /* trans to fp32*/ - "scvtf v10.4s, v6.4s \n" /* trans to fp32*/ - "scvtf v11.4s, v7.4s \n" /* trans to fp32*/ - - "subs %[loop], %[loop], #1 \n" - - "fmul v4.4s, v8.4s, %[scale].4s \n" /* mul with scale*/ - "fmul v5.4s, v9.4s, %[scale].4s \n" /* mul with scale*/ - "fmul v6.4s, v10.4s, %[scale].4s \n" /* mul with scale*/ - "fmul v7.4s, v11.4s, %[scale].4s \n" /* mul with scale*/ - - "stp q4, q5, [%[out]], #32 \n" /* write to memory*/ - "stp q6, q7, [%[out]], #32 \n" /* write to memory*/ - - "bne 0b \n" - : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) - : [scale] "w"(vscale) - : "v0", "v1", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11"); -#else - asm volatile( - "vld1.32 {d0-d3}, [%[in]]! @ load 16 int16\n" - "0: @ main loop\n" - "vmovl.s16 q4, d0 @ trans to int32\n" - "vmovl.s16 q5, d1 @ trans to int32\n" - "vmovl.s16 q6, d2 @ trans to int32\n" - "vmovl.s16 q7, d3 @ trans to int32\n" - "vcvt.f32.s32 q0, q4 @ trans to fp32\n" - "vcvt.f32.s32 q1, q5 @ trans to fp32\n" - "vcvt.f32.s32 q2, q6 @ trans to fp32\n" - "vcvt.f32.s32 q3, q7 @ trans to fp32\n" - "vmul.f32 q4, q0, %q[scale] @ mul with scale\n" - "vmul.f32 q5, q1, %q[scale] @ mul with scale\n" - "vmul.f32 q6, q2, %q[scale] @ mul with scale\n" - "vmul.f32 q7, q3, %q[scale] @ mul with scale\n" - - "vld1.32 {d0-d3}, [%[in]]! @ load 16 int8\n" - - "subs %[loop], #1 \n" - - "vst1.f32 {d8-d11}, [%[out]]! @ write to memory\n" - "vst1.f32 {d12-d15}, [%[out]]! 
@ write to memory\n" - - "bne 0b \n" - : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) - : [scale] "w"(vscale) - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); -#endif // __aarch64__ - } - const int16_t* din_r = din_c + 16 * cnt; - float* dout_r = dout_c + 16 * cnt; - for (int i = 0; i < remain; ++i) { - dout_r[i] = in_scale * din_r[i]; - } - } -} - -void int32_to_fp32(const int* din, - float* dout, - const float* scale, - int axis_size, - int64_t outer_size, - int64_t inner_size) { - int cnt = inner_size / 16; - int remain = inner_size & 15; - int64_t loop_size = axis_size * outer_size; -#pragma omp parallel for - for (int64_t n = 0; n < loop_size; ++n) { - float in_scale = scale[n % axis_size]; - const int* din_c = din + n * inner_size; - float* dout_c = dout + n * inner_size; - float32x4_t vscale = vdupq_n_f32(in_scale); - if (cnt > 0) { - int loop = cnt; - const int* din_ptr = din_c; - float* dout_ptr = dout_c; -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[in]], #32 \n" - "ldp q2, q3, [%[in]], #32 \n" - "0: \n" - "scvtf v4.4s, v0.4s \n" - "scvtf v5.4s, v1.4s \n" - "scvtf v6.4s, v2.4s \n" - "scvtf v7.4s, v3.4s \n" - "ldp q0, q1, [%[in]], #32 \n" - "fmul v8.4s, v4.4s, %[scale].4s \n" - "fmul v9.4s, v5.4s, %[scale].4s \n" - "fmul v10.4s, v6.4s, %[scale].4s \n" - "fmul v11.4s, v7.4s, %[scale].4s \n" - "ldp q2, q3, [%[in]], #32 \n" - "stp q8, q9, [%[out]], #32 \n" - "stp q10, q11, [%[out]], #32 \n" - "subs %[loop], %[loop], #1 \n" - "bne 0b \n" - : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) - : [scale] "w"(vscale) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11"); -#else - asm volatile( - "vld1.s32 {d0-d3}, [%[in]]! \n" - "vld1.s32 {d4-d7}, [%[in]]! \n" - "0: \n" - "vcvt.f32.s32 q4, q0 \n" - "vcvt.f32.s32 q5, q1 \n" - "vcvt.f32.s32 q6, q2 \n" - "vcvt.f32.s32 q7, q3 \n" - "vld1.s32 {d0-d3}, [%[in]]! \n" - "vmul.f32 q8, q4, %q[scale] \n" - "vmul.f32 q9, q5, %q[scale] \n" - "vmul.f32 q10, q6, %q[scale] \n" - "vmul.f32 q11, q7, %q[scale] \n" - "vld1.s32 {d4-d7}, [%[in]]! \n" - "subs %[loop], #1 \n" - "vst1.f32 {d16-d19}, [%[out]]! \n" - "vst1.f32 {d20-d23}, [%[out]]! 
\n" - "bne 0b \n" - : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) - : [scale] "w"(vscale) - : "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11"); -#endif // __aarch64__ - } - const int* din_r = din_c + 16 * cnt; - float* dout_r = dout_c + 16 * cnt; - for (int i = 0; i < remain; ++i) { - dout_r[i] = in_scale * din_r[i]; - } - } -} - -void int32_to_int8(const int* din, - int8_t* dout, - const float* scale, - int axis_size, - int64_t outer_size, - int64_t inner_size) { - int cnt = inner_size / 16; - int remain = inner_size & 15; - int64_t loop_size = outer_size * axis_size; -#pragma omp parallel for - for (int64_t n = 0; n < loop_size; ++n) { - float in_scale = scale[n % axis_size]; - const int* din_c = din + n * inner_size; - int8_t* dout_c = dout + n * inner_size; - float32x4_t vscale = vdupq_n_f32(in_scale); - float32x4_t vzero = vdupq_n_f32(0.f); - float32x4_t vpoff = vdupq_n_f32(0.5f); - float32x4_t vnoff = vdupq_n_f32(-0.5f); - if (cnt > 0) { - int loop = cnt; - const int* din_ptr = din_c; - int8_t* dout_ptr = dout_c; -#ifdef __aarch64__ - asm volatile( - "0: \n" - "ld1 {v0.4s, v1.4s}, [%[in]], #32 \n" - "ld1 {v2.4s, v3.4s}, [%[in]], #32 \n" - - "scvtf v4.4s, v0.4s \n" - "scvtf v5.4s, v1.4s \n" - "scvtf v6.4s, v2.4s \n" - "scvtf v7.4s, v3.4s \n" - - "fmul v0.4s, v4.4s, %[scale].4s \n" - "fmul v1.4s, v5.4s, %[scale].4s \n" - "fmul v2.4s, v6.4s, %[scale].4s \n" - "fmul v3.4s, v7.4s, %[scale].4s \n" - - "fcvtas v4.4s, v0.4s \n" - "fcvtas v5.4s, v1.4s \n" - "fcvtas v6.4s, v2.4s \n" - "fcvtas v7.4s, v3.4s \n" - - "sqxtn v0.4h, v4.4s \n" - "sqxtn2 v0.8h, v5.4s \n" - "sqxtn v1.4h, v6.4s \n" - "sqxtn2 v1.8h, v7.4s \n" - - "sqxtn v2.8b, v0.8h \n" - "sqxtn2 v2.16b, v1.8h \n" - - "st1 {v2.16b}, [%[out]], #16 \n" - "subs %[loop], %[loop], #1 \n" - "bne 0b \n" - : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) - : [scale] "w"(vscale) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); -#else - asm volatile( - "vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n" - "vld1.32 {d4-d7}, [%[din]]! @ load in8~in16\n" - "0: @ main loop\n" - "vcvt.f32.s32 q4, q0 @ cvt to float\n" - "vcvt.f32.s32 q5, q1 @ cvt to float\n" - "vcvt.f32.s32 q6, q2 @ cvt to float\n" - "vcvt.f32.s32 q7, q3 @ cvt to float\n" - "vand.i32 q0, %q[vpoff], %q[vpoff] @ set offset, 0.5\n" - "vand.i32 q1, q0, q0 @ set offset, 0.5\n" - "vand.i32 q2, q0, q0 @ set offset, 0.5\n" - "vand.i32 q3, q0, q0 @ set offset, 0.5\n" - "vcgt.f32 q8, q4, %q[vzero] @ get mask > 0, in0\n" - "vcgt.f32 q9, q5, %q[vzero] @ get mask > 0, in1\n" - "vcgt.f32 q10, q6, %q[vzero] @ get mask > 0, in2\n" - "vcgt.f32 q11, q7, %q[vzero] @ get mask > 0, in3\n" - "vbif.f32 q0, %q[vnoff], q8 @ get right offset\n" - "vbif.f32 q1, %q[vnoff], q9 @ get right offset\n" - "vbif.f32 q2, %q[vnoff], q10 @ get right offset\n" - "vbif.f32 q3, %q[vnoff], q11 @ get right offset\n" - "vmla.f32 q0, q4, %q[vscale] @ mul scale\n" - "vmla.f32 q1, q5, %q[vscale] @ mul scale\n" - "vmla.f32 q2, q6, %q[vscale] @ mul scale\n" - "vmla.f32 q3, q7, %q[vscale] @ mul scale\n" - "vcvt.s32.f32 q4, q0 @ cvt to int32\n" - "vcvt.s32.f32 q5, q1 @ cvt to int32\n" - "vcvt.s32.f32 q6, q2 @ cvt to int32\n" - "vcvt.s32.f32 q7, q3 @ cvt to int32\n" - "vqmovn.s32 d16, q4 @ cnt to int16\n" - "vqmovn.s32 d17, q5 @ cnt to int16\n" - "vqmovn.s32 d18, q6 @ cnt to int16\n" - "vqmovn.s32 d19, q7 @ cnt to int16\n" - "vld1.32 {d0-d3}, [%[din]]! 
@ load in0~in7\n" - "vqmovn.s16 d8, q8 @ cnt to int8\n" - "vqmovn.s16 d9, q9 @ cnt to int8\n" - "vld1.32 {d4-d7}, [%[din]]! @ load in8~in16\n" - "vst1.32 {d8-d9}, [%[dout]]! @ write to output\n" - "subs %[loop], #1 @ loop count -1\n" - "bne 0b @ to main loop\n" - : [loop] "+r"(loop), [din] "+r"(din_ptr), [dout] "+r"(dout_ptr) - : [vscale] "w"(vscale), - [vzero] "w"(vzero), - [vnoff] "w"(vnoff), - [vpoff] "w"(vpoff) - : "q0", - "q1", - "q2", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11"); -#endif // __aarch64__ - } - const int* din_r = din_c + 16 * cnt; - int8_t* dout_r = dout_c + 16 * cnt; - for (int i = 0; i < remain; ++i) { - dout_r[i] = saturate_cast(roundf(in_scale * din_r[i])); - } - } -} - -/******************************************/ -/******** kernel implement *********/ -/******************************************/ -float compute_max_kernel(const float* din, int64_t size) { - float max_value = 0.f; - int cnt = size / 16; - int remain = size & 15; - float32x4_t vmax_val = vdupq_n_f32(0.f); - const float* ptr_in = din; - if (cnt > 0) { - int loop_cnt = cnt; -#ifdef __aarch64__ - asm volatile( - "ld1 {v0.4s, v1.4s}, [%[in]], #32 \n" - "ld1 {v2.4s, v3.4s}, [%[in]], #32 \n" - "0: \n" - "fabs v4.4s, v0.4s \n" - "fabs v5.4s, v1.4s \n" - "fabs v6.4s, v2.4s \n" - "fabs v7.4s, v3.4s \n" - "ld1 {v0.4s, v1.4s}, [%[in]], #32 \n" - "fmax v2.4s, v4.4s, v5.4s \n" - "fmax v3.4s, v6.4s, v7.4s \n" - "fmax v4.4s, v2.4s, v3.4s \n" - "ld1 {v2.4s, v3.4s}, [%[in]], #32 \n" - "fmax %[max_val].4s, v4.4s, %[max_val].4s \n" - "subs %[cnt], %[cnt], #1 \n" - "bne 0b \n" - : [in] "+r"(ptr_in), [cnt] "+r"(loop_cnt), [max_val] "+w"(vmax_val) - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); -#else - asm volatile( - "vld1.32 {d0-d3}, [%[in]]! @ load 8 float\n" - "vld1.32 {d4-d7}, [%[in]]! @ load 8 float\n" - "0: @ main loop\n" - "vabs.f32 q4, q0 @ abs \n" - "vabs.f32 q5, q1 @ abs \n" - "vabs.f32 q6, q2 @ abs \n" - "vabs.f32 q7, q3 @ abs \n" - "vld1.32 {d0-d3}, [%[in]]! @ load 8 float\n" - "vmax.f32 q2, q4, q5 @ max \n" - "vmax.f32 q3, q6, q7 @ max \n" - "vmax.f32 q4, q2, q3 @ max \n" - "vld1.32 {d4-d7}, [%[in]]! @ load 8 float\n" - "vmax.f32 %q[max_val], q4, %q[max_val] @ max \n" - "subs %[cnt], #1 @ loop count -1\n" - "bne 0b @ jump to main loop\n" - - : [in] "+r"(ptr_in), [cnt] "+r"(loop_cnt), [max_val] "+w"(vmax_val) - : - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); -#endif - float32x2_t vmax_p = - vpmax_f32(vget_high_f32(vmax_val), vget_low_f32(vmax_val)); - float max0 = vget_lane_f32(vmax_p, 0); - float max1 = vget_lane_f32(vmax_p, 1); - float max2 = max0 > max1 ? max0 : max1; - max_value = max_value > max2 ? 
max_value : max2; - } - ptr_in = din + 16 * cnt; - for (int i = 0; i < remain; ++i) { - float data = fabsf(*(ptr_in++)); - max_value = fmaxf(max_value, data); - } - return max_value; -} - -std::vector get_tensor_scale_n(const float* in_data, - int axis_size, - int64_t inner_size, - float scale_factor) { - std::vector scale_out(axis_size); -#pragma omp parallel for - for (int c = 0; c < axis_size; ++c) { // num - const float* ptr_in = in_data + c * inner_size; // channel*width*height - scale_out[c] = compute_max_kernel(ptr_in, inner_size) / scale_factor; - } - return scale_out; -} - -std::vector get_tensor_scale_chw(const float* in_data, - int axis_size, - int64_t outer_size, - int64_t inner_size, - float scale_factor) { - std::vector scale_out(axis_size); - int64_t inner_size_with_axis = axis_size * inner_size; -#pragma omp parallel for - for (int c = 0; c < axis_size; ++c) { - const float* din = in_data + c * inner_size; - float max_val = 0.f; - for (int j = 0; j < outer_size; ++j) { - const float* ptr_in = din + j * inner_size_with_axis; - max_val = fmaxf(compute_max_kernel(ptr_in, inner_size), max_val); - } - scale_out[c] = max_val / scale_factor; - } - return scale_out; -} - -void int32_to_int32(const int* din, - int* dout, - const float* scale, - int axis_size, - int64_t outer_size, - int64_t inner_size) { - int size_all = outer_size * axis_size * inner_size; - memmove(dout, din, size_all * sizeof(int)); -} - -template <> -void int32_to_dtype(const int* din, - float* dout, - const float* scale, - int axis_size, - int64_t outer_size, - int64_t inner_size) { - return int32_to_fp32(din, dout, scale, axis_size, outer_size, inner_size); -} - -template <> -void int32_to_dtype(const int* din, - signed char* dout, - const float* scale, - int axis_size, - int64_t outer_size, - int64_t inner_size) { - return int32_to_int8(din, dout, scale, axis_size, outer_size, inner_size); -} - -template <> -void int32_to_dtype(const int* din, - int* dout, - const float* scale, - int axis_size, - int64_t outer_size, - int64_t inner_size) { - return int32_to_int32(din, dout, scale, axis_size, outer_size, inner_size); -} - -bool trans_tensor_int32_to_int8(Tensor* tin, - Tensor* tout, - float input_scale, - float output_scale, - std::vector weights_scale, - int axis) { - tout->Resize(tin->dims()); - - // compute scale - std::vector scale(weights_scale.size()); - for (int i = 0; i < weights_scale.size(); ++i) { - scale[i] = input_scale * weights_scale[i] / output_scale; - } - - auto i_dims = tin->dims(); - int outer_size = i_dims.count(0, axis); - int axis_size = i_dims[axis]; - int inner_size = i_dims.count(axis + 1, i_dims.size()); - - const int* i_data = tin->data(); - int8_t* o_data = tout->mutable_data(); - int32_to_int8( - i_data, o_data, scale.data(), axis_size, outer_size, inner_size); - - return true; -} - -template <> -bool get_tensor_scale(const Tensor& tin, - std::vector* scale_out, - int axis, - float scale_factor) { - int axis_size = 1; - if (axis >= 0 && axis < tin.dims().size()) { - axis_size = tin.dims()[axis]; - } - int outer_size = 1; - if (axis >= 0) { - outer_size = tin.dims().count(0, axis); - } - int64_t inner_size = tin.dims().count(axis + 1, tin.dims().size()); - - const float* in_data = static_cast(tin.data()); - if (axis <= 0) { - *scale_out = - get_tensor_scale_n(in_data, axis_size, inner_size, scale_factor); - } else { - *scale_out = get_tensor_scale_chw( - in_data, axis_size, outer_size, inner_size, scale_factor); - } - return true; -} - -bool trans_tensor_int32_to_fp32(Tensor* tin, 
-                                Tensor* tout,
-                                float input_scale,
-                                std::vector<float> weights_scale,
-                                int axis) {
-  tout->Resize(tin->dims());
-
-  // compute scale
-  std::vector<float> scale(weights_scale.size());
-  for (int i = 0; i < weights_scale.size(); ++i) {
-    scale[i] = input_scale * weights_scale[i];
-  }
-
-  auto i_dims = tin->dims();
-  int outer_size = i_dims.count(0, axis);
-  int axis_size = i_dims[axis];
-  int inner_size = i_dims.count(axis + 1, i_dims.size());
-
-  const auto* i_data = tin->data<int>();
-  float* o_data = tout->mutable_data<float>();
-  //! convert to fp32
-  int32_to_fp32(
-      i_data, o_data, scale.data(), axis_size, outer_size, inner_size);
-  return true;
-}
-
-bool trans_tensor_fp32_to_int8(Tensor* tin, Tensor* tout, float input_scale) {
-  tout->Resize(tin->dims());
-
-  // compute scale
-  std::vector<float> scale({input_scale});
-  int inner_size = tin->dims().production();
-
-  const auto* i_data = tin->data<float>();
-  int8_t* o_data = tout->mutable_data<int8_t>();
-  fp32_to_int8(i_data, o_data, scale.data(), 1, 1, inner_size);
-  return true;
-}
-
-bool trans_fp32_bias_to_int32_basic(Tensor* tin,
-                                    Tensor* tout,
-                                    float in_scale,
-                                    std::vector<float> vector_weight_scale) {
-  tout->Resize(tin->dims());
-
-  const float* i_data = tin->data<float>();
-  int* o_data = tout->mutable_data<int>();
-  for (int i = 0; i < tin->dims().production(); ++i) {
-    o_data[i] =
-        static_cast<int>(roundf(i_data[i] / in_scale / vector_weight_scale[i]));
-  }
-  return true;
-}
-
-template <>
-bool trans_tensor_dtype<PRECISION(kInt32), PRECISION(kInt8)>(
-    Tensor* tin,
-    Tensor* tout,
-    float input_scale,
-    float output_scale,
-    std::vector<float> weights_scale) {
-  return trans_tensor_int32_to_int8(
-      tin, tout, input_scale, output_scale, weights_scale, 1);
-}
-
-template <>
-bool trans_tensor_dtype<PRECISION(kInt32), PRECISION(kFloat)>(
-    Tensor* tin,
-    Tensor* tout,
-    float input_scale,
-    float output_scale,
-    std::vector<float> weights_scale) {
-  return trans_tensor_int32_to_fp32(tin, tout, input_scale, weights_scale, 1);
-}
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
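All of the trans_tensor_* helpers above collapse to one effective per-channel factor: an int32 accumulator represents (input_scale * weight_scale[c]) units, so dequantizing to fp32 multiplies by that product, and requantizing to int8 additionally divides by output_scale. A small worked example with made-up scales:

#include <cstdio>
#include <vector>

int main() {
  float input_scale = 0.05f, output_scale = 0.1f;    // illustrative values
  std::vector<float> weight_scale = {0.02f, 0.04f};  // per output channel
  for (size_t c = 0; c < weight_scale.size(); ++c) {
    float to_fp32 = input_scale * weight_scale[c];   // int32 -> fp32 factor
    float to_int8 = to_fp32 / output_scale;          // int32 -> int8 factor
    std::printf("channel %zu: fp32 scale %.4f, int8 rescale %.4f\n", c,
                to_fp32, to_int8);
  }
}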
diff --git a/lite/backends/arm/math/type_trans.h b/lite/backends/arm/math/type_trans.h
deleted file mode 100644
index e07d798b10..0000000000
--- a/lite/backends/arm/math/type_trans.h
+++ /dev/null
@@ -1,117 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-#include <vector>
-#include "lite/core/target_wrapper.h"
-#include "lite/core/tensor.h"
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-template <PrecisionType PTypeIn, PrecisionType PTypeOut>
-bool trans_tensor_dtype(Tensor* tin,
-                        Tensor* tout,
-                        float input_scale,
-                        float output_scale,
-                        std::vector<float> weights_scale) {
-  LOG(FATAL) << "trans_tensor_dtype has no impl";
-  return false;
-}
-
-template <>
-bool trans_tensor_dtype<PRECISION(kInt32), PRECISION(kInt8)>(
-    Tensor* tin,
-    Tensor* tout,
-    float input_scale,
-    float output_scale,
-    std::vector<float> weights_scale);
-
-template <>
-bool trans_tensor_dtype<PRECISION(kInt32), PRECISION(kFloat)>(
-    Tensor* tin,
-    Tensor* tout,
-    float input_scale,
-    float output_scale,
-    std::vector<float> weights_scale);
-
-template <PrecisionType Ptype>
-bool get_tensor_scale(const Tensor& tin,
-                      std::vector<float>* scale_out,
-                      int axis,
-                      float scale_factor) {
-  return false;
-}
-
-std::vector<float> get_tensor_scale_n(const float* in_data,
-                                      int axis_size,
-                                      int64_t inner_size,
-                                      float scale_factor);
-
-bool trans_fp32_bias_to_int32_basic(Tensor* tin,
-                                    Tensor* tout,
-                                    float in_scale,
-                                    std::vector<float> vector_weight_scale);
-
-bool trans_tensor_int32_to_int8(Tensor* tin,
-                                Tensor* tout,
-                                float input_scale,
-                                float output_scale,
-                                std::vector<float> weights_scale,
-                                int axis = 1);
-
-bool trans_tensor_int32_to_fp32(Tensor* tin,
-                                Tensor* tout,
-                                float input_scale,
-                                std::vector<float> weights_scale,
-                                int axis = 1);
-
-bool trans_tensor_fp32_to_int8(Tensor* tin, Tensor* tout, float input_scale);
-
-template <>
-bool get_tensor_scale<PRECISION(kFloat)>(const Tensor& tin,
-                                         std::vector<float>* scale_out,
-                                         int axis,
-                                         float scale_factor);
-
-template <typename dtype>
-void int32_to_dtype(const int* din,
-                    dtype* dout,
-                    const float* scale,
-                    int axis_size,
-                    int64_t outer_size,
-                    int64_t inner_size);
-
-void fp32_to_int8(const float* din,
-                  int8_t* dout,
-                  const float* scale,
-                  int axis_size,
-                  int64_t outer_size,
-                  int64_t inner_size);
-
-void int8_to_fp32(const int8_t* in,
-                  float* out,
-                  const float* scale,
-                  int axis_size,
-                  int64_t outer_size,
-                  int64_t inner_size);
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/arm/math/yolo_box.cc b/lite/backends/arm/math/yolo_box.cc
deleted file mode 100644
index 72e67cf693..0000000000
--- a/lite/backends/arm/math/yolo_box.cc
+++ /dev/null
@@ -1,168 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
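The file below decodes YOLO detection heads on the CPU. The box math is compact enough to check by hand; this standalone sketch mirrors the get_yolo_box formula with made-up inputs (every constant here is illustrative):

#include <cmath>
#include <cstdio>

int main() {
  float tx = 0.2f, ty = -0.1f, tw = 0.3f, th = 0.5f;  // raw network outputs
  int i = 3, j = 5;                 // grid cell column / row
  int grid = 13;                    // feature-map width/height
  int input = 416;                  // network input size
  float anchor_w = 116, anchor_h = 90;
  int img_w = 1280, img_h = 720;    // original image size
  auto sigmoid = [](float x) { return 1.f / (1.f + std::exp(-x)); };
  // Cell offset + sigmoid for the center; anchor * exp for width/height;
  // everything rescaled from the grid to the original image.
  float cx = (i + sigmoid(tx)) * img_w / grid;
  float cy = (j + sigmoid(ty)) * img_h / grid;
  float w = std::exp(tw) * anchor_w * img_w / input;
  float h = std::exp(th) * anchor_h * img_h / input;
  std::printf("center (%.1f, %.1f), size %.1f x %.1f\n", cx, cy, w, h);
}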
- -#include "lite/backends/arm/math/yolo_box.h" -#include "lite/backends/arm/math/funcs.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -namespace { -inline float sigmoid(float x) { return 1.f / (1.f + expf(-x)); } - -inline void get_yolo_box(float* box, - const float* x, - const int* anchors, - int i, - int j, - int an_idx, - int grid_size, - int input_size, - int index, - int stride, - int img_height, - int img_width) { - box[0] = (i + sigmoid(x[index])) * img_width / grid_size; - box[1] = (j + sigmoid(x[index + stride])) * img_height / grid_size; - box[2] = std::exp(x[index + 2 * stride]) * anchors[2 * an_idx] * img_width / - input_size; - box[3] = std::exp(x[index + 3 * stride]) * anchors[2 * an_idx + 1] * - img_height / input_size; -} - -inline int get_entry_index(int batch, - int an_idx, - int hw_idx, - int an_num, - int an_stride, - int stride, - int entry) { - return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx; -} - -inline void calc_detection_box(float* boxes, - float* box, - const int box_idx, - const int img_height, - const int img_width) { - boxes[box_idx] = box[0] - box[2] / 2; - boxes[box_idx + 1] = box[1] - box[3] / 2; - boxes[box_idx + 2] = box[0] + box[2] / 2; - boxes[box_idx + 3] = box[1] + box[3] / 2; - - boxes[box_idx] = boxes[box_idx] > 0 ? boxes[box_idx] : static_cast(0); - boxes[box_idx + 1] = - boxes[box_idx + 1] > 0 ? boxes[box_idx + 1] : static_cast(0); - boxes[box_idx + 2] = boxes[box_idx + 2] < img_width - 1 - ? boxes[box_idx + 2] - : static_cast(img_width - 1); - boxes[box_idx + 3] = boxes[box_idx + 3] < img_height - 1 - ? boxes[box_idx + 3] - : static_cast(img_height - 1); -} - -inline void calc_label_score(float* scores, - const float* input, - const int label_idx, - const int score_idx, - const int class_num, - const float conf, - const int stride) { - for (int i = 0; i < class_num; i++) { - scores[score_idx + i] = conf * sigmoid(input[label_idx + i * stride]); - } -} -} // namespace - -void yolobox(lite::Tensor* X, - lite::Tensor* ImgSize, - lite::Tensor* Boxes, - lite::Tensor* Scores, - std::vector anchors, - int class_num, - float conf_thresh, - int downsample_ratio) { - const int n = X->dims()[0]; - const int h = X->dims()[2]; - const int w = X->dims()[3]; - const int b_num = Boxes->dims()[1]; - const int an_num = anchors.size() / 2; - int X_size = downsample_ratio * h; - - const int stride = h * w; - const int an_stride = (class_num + 5) * stride; - - auto anchors_data = anchors.data(); - - const float* X_data = X->data(); - float* ImgSize_data = ImgSize->mutable_data(); - - float* Boxes_data = Boxes->mutable_data(); - - float* Scores_data = Scores->mutable_data(); - - float box[4]; - for (int i = 0; i < n; i++) { - int img_height = static_cast(ImgSize_data[2 * i]); - int img_width = static_cast(ImgSize_data[2 * i + 1]); - - for (int j = 0; j < an_num; j++) { - for (int k = 0; k < h; k++) { - for (int l = 0; l < w; l++) { - int obj_idx = - get_entry_index(i, j, k * w + l, an_num, an_stride, stride, 4); - float conf = sigmoid(X_data[obj_idx]); - if (conf < conf_thresh) { - continue; - } - - int box_idx = - get_entry_index(i, j, k * w + l, an_num, an_stride, stride, 0); - get_yolo_box(box, - X_data, - anchors_data, - l, - k, - j, - h, - X_size, - box_idx, - stride, - img_height, - img_width); - box_idx = (i * b_num + j * stride + k * w + l) * 4; - calc_detection_box(Boxes_data, box, box_idx, img_height, img_width); - - int label_idx = - get_entry_index(i, j, k * w + l, an_num, an_stride, stride, 5); - 
int score_idx = (i * b_num + j * stride + k * w + l) * class_num; - calc_label_score(Scores_data, - X_data, - label_idx, - score_idx, - class_num, - conf, - stride); - } - } - } - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/yolo_box.h b/lite/backends/arm/math/yolo_box.h deleted file mode 100644 index e454308700..0000000000 --- a/lite/backends/arm/math/yolo_box.h +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include "lite/core/tensor.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void yolobox(lite::Tensor* X, - lite::Tensor* ImgSize, - lite::Tensor* Boxes, - lite::Tensor* Scores, - std::vector anchors, - int class_num, - float conf_thresh, - int downsample_ratio); - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/cuda/CMakeLists.txt b/lite/backends/cuda/CMakeLists.txt deleted file mode 100644 index c0418f6b6a..0000000000 --- a/lite/backends/cuda/CMakeLists.txt +++ /dev/null @@ -1,8 +0,0 @@ -if(NOT LITE_WITH_CUDA) - return() -endif() - -nv_library(target_wrapper_cuda SRCS target_wrapper.cc) -nv_library(cuda_blas SRCS blas.cc) - -add_subdirectory(math) diff --git a/lite/backends/cuda/blas.cc b/lite/backends/cuda/blas.cc deleted file mode 100644 index c9d2d46cfe..0000000000 --- a/lite/backends/cuda/blas.cc +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/cuda/blas.h" - -namespace paddle { -namespace lite { -namespace cuda { - -template <> -class Blas : public BlasBase { - using T = float; - - void sgemm(cublasOperation_t transa, - cublasOperation_t transb, // - int m, - int n, - int k, // - const T* alpha, // - const T* A, - int lda, // - const T* B, - int ldb, // - const T* beta, // - T* C, - int ldc) const { - CUBLAS_CALL(cublasSgemm(handle(), - transa, - transb, - m, - n, - k, - alpha, - A, - lda, - B, - ldb, - beta, - C, - ldc)); - } -}; - -} // namespace cuda -} // namespace lite -} // namespace paddle diff --git a/lite/backends/cuda/blas.h b/lite/backends/cuda/blas.h deleted file mode 100644 index f73bb576b8..0000000000 --- a/lite/backends/cuda/blas.h +++ /dev/null @@ -1,99 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include -#include -#include "lite/backends/cuda/cuda_utils.h" -#include "lite/utils/all.h" - -namespace paddle { -namespace lite { -namespace cuda { - -#define CUBLAS_CHECK(xxx) CHECK_EQ((xxx), CUBLAS_STATUS_SUCCESS); - -/* - * Some basic methods. - */ -struct BlasBase { - /* - BlasBase() { CUBLAS_CHECK(cublasCreate(&handle_)); } - ~BlasBase() { CUBLAS_CHECK(cublasDestroy(handle_)); } - */ - - void SetStream(cudaStream_t stream) { - CUBLAS_CHECK(cublasSetStream(handle_, stream)); - } - - cudaStream_t GetStream() const { - cudaStream_t stream; - CUBLAS_CHECK(cublasGetStream_v2(handle_, &stream)); - return stream; - } - - int GetVersion() const { - int version{}; - CUBLAS_CHECK(cublasGetVersion_v2(handle_, &version)); - return version; - } - - cublasHandle_t& handle() const { return handle_; } - - protected: - // Not thread-safe, should created for each thread. - // According to cublas doc. - mutable cublasHandle_t handle_; -}; - -// T: Scalar type. -template -class Blas : public lite::cuda::BlasBase { - public: - void sgemm(cublasOperation_t transa, - cublasOperation_t transb, // - int m, - int n, - int k, // - const T* alpha, // - const T* A, - int lda, // - const T* B, - int ldb, // - const T* beta, // - T* C, - int ldc) const { - CHECK_EQ(CUBLAS_STATUS_SUCCESS, - cublasSgemm(handle_, // - CUBLAS_OP_N, - CUBLAS_OP_N, // - m, - n, - k, - alpha, - A, - lda, - B, - ldb, - beta, - C, - ldc)); - } -}; - -} // namespace cuda -} // namespace lite -} // namespace paddle diff --git a/lite/backends/cuda/cuda_utils.h b/lite/backends/cuda/cuda_utils.h deleted file mode 100644 index 13bf8190ef..0000000000 --- a/lite/backends/cuda/cuda_utils.h +++ /dev/null @@ -1,124 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include -#include -#include -#include "lite/utils/cp_logging.h" - -/* - * This file contains some CUDA specific utils. - */ - -// For quickly implementing the prototype, some of the following code snippets -// are borrowed from project MXNet, great thanks for the original developers. 
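The macros that follow wrap every CUDA/cuBLAS/cuDNN call so a failure surfaces immediately with a readable status string. Here is a minimal self-contained version of the same pattern; DEMO_CUDA_CALL is a stand-in name, and the real macros below report through CHECK/CHECK_EQ rather than fprintf:

#include <cstdio>
#include <cuda_runtime.h>

#define DEMO_CUDA_CALL(expr)                                       \
  do {                                                             \
    cudaError_t e = (expr);                                        \
    if (e != cudaSuccess) {                                        \
      std::fprintf(stderr, "CUDA error %s at %s:%d\n",             \
                   cudaGetErrorString(e), __FILE__, __LINE__);     \
    }                                                              \
  } while (0)

int main() {
  void* p = nullptr;
  DEMO_CUDA_CALL(cudaMalloc(&p, 1 << 20));  // any runtime call can be wrapped
  DEMO_CUDA_CALL(cudaFree(p));
}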
- -#define CHECK_CUDA_ERROR(msg) \ - { \ - auto e = cudaGetLastError(); \ - CHECK_EQ(e, cudaSuccess) << (msg) << " CUDA: " << cudaGetErrorString(e); \ - } - -#define CUDA_CALL(func) \ - { \ - auto e = (func); \ - CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \ - << "CUDA: " << cudaGetErrorString(e); \ - } - -#define CUBLAS_CALL(func) \ - { \ - auto e = (func); \ - CHECK_EQ(e, CUBLAS_STATUS_SUCCESS) \ - << "cuBlas: " << paddle::lite::cuda::CublasErrorInfo(e); \ - } - -#define CUDNN_VERSION_MIN(major, minor, patch) \ - (CUDNN_VERSION >= (major * 1000 + minor * 100 + patch)) - -#define CUDNN_CHECK(condition) \ - { \ - cudnnStatus_t status = condition; \ - CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << CudnnGetErrorInfo(status); \ - } - -namespace paddle { -namespace lite { -namespace cuda { - -static const char* CublasErrorInfo(int error) { - switch (error) { -#define LITE_CUBLAS_ERROR_INFO(xx) \ - case xx: \ - return #xx; \ - break; - LITE_CUBLAS_ERROR_INFO(CUBLAS_STATUS_NOT_INITIALIZED); - LITE_CUBLAS_ERROR_INFO(CUBLAS_STATUS_ALLOC_FAILED); - LITE_CUBLAS_ERROR_INFO(CUBLAS_STATUS_INVALID_VALUE); - LITE_CUBLAS_ERROR_INFO(CUBLAS_STATUS_ARCH_MISMATCH); - LITE_CUBLAS_ERROR_INFO(CUBLAS_STATUS_MAPPING_ERROR); - LITE_CUBLAS_ERROR_INFO(CUBLAS_STATUS_EXECUTION_FAILED); - LITE_CUBLAS_ERROR_INFO(CUBLAS_STATUS_INTERNAL_ERROR); - LITE_CUBLAS_ERROR_INFO(CUBLAS_STATUS_NOT_SUPPORTED); - LITE_CUBLAS_ERROR_INFO(CUBLAS_STATUS_LICENSE_ERROR); -#undef LITE_CUBLAS_ERROR_INFO - default: - return "unknown error"; - } -} - -static const char* CudnnGetErrorInfo(cudnnStatus_t status) { - switch (status) { - case CUDNN_STATUS_SUCCESS: - return "CUDNN_STATUS_SUCCESS"; - case CUDNN_STATUS_NOT_INITIALIZED: - return "CUDNN_STATUS_NOT_INITIALIZED"; - case CUDNN_STATUS_ALLOC_FAILED: - return "CUDNN_STATUS_ALLOC_FAILED"; - case CUDNN_STATUS_BAD_PARAM: - return "CUDNN_STATUS_BAD_PARAM"; - case CUDNN_STATUS_INTERNAL_ERROR: - return "CUDNN_STATUS_INTERNAL_ERROR"; - case CUDNN_STATUS_INVALID_VALUE: - return "CUDNN_STATUS_INVALID_VALUE"; - case CUDNN_STATUS_ARCH_MISMATCH: - return "CUDNN_STATUS_ARCH_MISMATCH"; - case CUDNN_STATUS_MAPPING_ERROR: - return "CUDNN_STATUS_MAPPING_ERROR"; - case CUDNN_STATUS_EXECUTION_FAILED: - return "CUDNN_STATUS_EXECUTION_FAILED"; - case CUDNN_STATUS_NOT_SUPPORTED: - return "CUDNN_STATUS_NOT_SUPPORTED"; - case CUDNN_STATUS_LICENSE_ERROR: - return "CUDNN_STATUS_LICENSE_ERROR"; -#if CUDNN_VERSION_MIN(6, 0, 0) - case CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING: - return "CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING"; -#endif -#if CUDNN_VERSION_MIN(7, 0, 0) - case CUDNN_STATUS_RUNTIME_IN_PROGRESS: - return "CUDNN_STATUS_RUNTIME_IN_PROGRESS"; - case CUDNN_STATUS_RUNTIME_FP_OVERFLOW: - return "CUDNN_STATUS_RUNTIME_FP_OVERFLOW"; -#endif - } - return "Unknown cudnn status"; -} - -} // namespace cuda -} // namespace lite -} // namespace paddle diff --git a/lite/backends/cuda/math/CMakeLists.txt b/lite/backends/cuda/math/CMakeLists.txt deleted file mode 100644 index c49713fbfe..0000000000 --- a/lite/backends/cuda/math/CMakeLists.txt +++ /dev/null @@ -1,21 +0,0 @@ -if(NOT LITE_WITH_CUDA) - return() -endif() - -nv_library(cuda_activation SRCS activation.cu) -nv_library(cuda_scale SRCS scale.cu) -nv_library(cuda_type_trans SRCS type_trans.cu) -nv_library(cuda_transpose SRCS transpose.cu) -nv_library(cudnn_conv SRCS cudnn_conv.cc DEPS cuda_activation cuda_scale -cuda_type_trans) - -set ( - math_cuda - cudnn_conv - cuda_activation - cuda_scale - cuda_type_trans - cuda_transpose -) - -set(math_cuda "${math_cuda}" 
CACHE GLOBAL "math cuda") diff --git a/lite/backends/cuda/math/activation.cu b/lite/backends/cuda/math/activation.cu deleted file mode 100644 index 0f50df8e60..0000000000 --- a/lite/backends/cuda/math/activation.cu +++ /dev/null @@ -1,285 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include "lite/backends/cuda/math/activation.h" -#include "lite/backends/cuda/math/utils.h" - -namespace paddle { -namespace lite { -namespace cuda { -namespace math { - -template -__global__ void relu_kernel(const int num, - const T alpha, - const T* input, - T* output) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - if (index < num) { -#if __CUDA_ARCH__ >= 350 - output[index] = __ldg(input + index) >= 0 ? __ldg(input + index) - : __ldg(input + index) * alpha; -#else - output[index] = input[index] >= 0 ? input[index] : input[index] * alpha; -#endif - } -} - -__global__ void bias_relu_int8_nhwc4_kernel(int num, - const float4* in, - const float4* bias, - float4* out, - int N, - int K, - int H, - int W, - const float4* scale, - float alpha) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid < num) { - int bias_idx = tid % K; - const float4 bias_ptr = bias[bias_idx]; - const float4 scale_ptr = scale[bias_idx]; - const float4 in_ptr = in[tid]; - - float4 packed_val; - packed_val.x = in_ptr.x * scale_ptr.x + bias_ptr.x; - packed_val.x = fmaxf(packed_val.x * alpha, packed_val.x); - packed_val.y = in_ptr.y * scale_ptr.y + bias_ptr.y; - packed_val.y = fmaxf(packed_val.y * alpha, packed_val.y); - packed_val.z = in_ptr.z * scale_ptr.z + bias_ptr.z; - packed_val.z = fmaxf(packed_val.z * alpha, packed_val.z); - packed_val.w = in_ptr.w * scale_ptr.w + bias_ptr.w; - packed_val.w = fmaxf(packed_val.w * alpha, packed_val.w); - out[tid] = packed_val; - } -} - -__global__ void bias_relu_int8_nhwc4_kernel(int num, - const float4* in, - const float4* bias, - char4* out, - int N, - int K, - int H, - int W, - const float4* scale, - float alpha) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid < num) { - int bias_idx = tid % K; - const float4 bias_ptr = bias[bias_idx]; - const float4 scale_ptr = scale[bias_idx]; - const float4 in_ptr = in[tid]; - - float4 packed_val; - char4 result_val; - packed_val.x = in_ptr.x * scale_ptr.x + bias_ptr.x; - result_val.x = - from_float(fmaxf(packed_val.x * alpha, packed_val.x)); - packed_val.y = in_ptr.y * scale_ptr.y + bias_ptr.y; - result_val.y = - from_float(fmaxf(packed_val.y * alpha, packed_val.y)); - packed_val.z = in_ptr.z * scale_ptr.z + bias_ptr.z; - result_val.z = - from_float(fmaxf(packed_val.z * alpha, packed_val.z)); - packed_val.w = in_ptr.w * scale_ptr.w + bias_ptr.w; - result_val.w = - from_float(fmaxf(packed_val.w * alpha, packed_val.w)); - - out[tid] = result_val; - } -} - -__global__ void relu_int8_nhwc4_kernel(int num, - const float4* in, - float4* out, - int N, - int K, - int H, - int W, - const float4* scale, - float alpha) { - int tid = 
blockIdx.x * blockDim.x + threadIdx.x; - if (tid < num) { - int scale_idx = tid % K; - const float4 scale_ptr = scale[scale_idx]; - const float4 in_ptr = in[tid]; - - float4 packed_val; - packed_val.x = in_ptr.x * scale_ptr.x; - packed_val.x = fmaxf(packed_val.x * alpha, packed_val.x); - packed_val.y = in_ptr.y * scale_ptr.y; - packed_val.y = fmaxf(packed_val.y * alpha, packed_val.y); - packed_val.z = in_ptr.z * scale_ptr.z; - packed_val.z = fmaxf(packed_val.z * alpha, packed_val.z); - packed_val.w = in_ptr.w * scale_ptr.w; - packed_val.w = fmaxf(packed_val.w * alpha, packed_val.w); - out[tid] = packed_val; - } -} - -__global__ void relu_int8_nhwc4_kernel(int num, - const float4* in, - char4* out, - int N, - int K, - int H, - int W, - const float4* scale, - float alpha) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid < num) { - int scale_idx = tid % K; - const float4 scale_ptr = scale[scale_idx]; - const float4 in_ptr = in[tid]; - - float4 packed_val; - char4 result_val; - packed_val.x = in_ptr.x * scale_ptr.x; - result_val.x = - from_float(fmaxf(packed_val.x * alpha, packed_val.x)); - packed_val.y = in_ptr.y * scale_ptr.y; - result_val.y = - from_float(fmaxf(packed_val.y * alpha, packed_val.y)); - packed_val.z = in_ptr.z * scale_ptr.z; - result_val.z = - from_float(fmaxf(packed_val.z * alpha, packed_val.z)); - packed_val.w = in_ptr.w * scale_ptr.w; - result_val.w = - from_float(fmaxf(packed_val.w * alpha, packed_val.w)); - - out[tid] = result_val; - } -} - -template <> -void bias_relu_int8_nhwc4(int num, - const void* in, - const void* bias, - void* out, - int N, - int K, - int H, - int W, - const void* scale, - float alpha, - cudaStream_t stream) { - int thread = 256; - int block = (num + thread - 1) / thread; - bias_relu_int8_nhwc4_kernel<<>>( - num, - static_cast(in), - static_cast(bias), - static_cast(out), - N, - K, - H, - W, - static_cast(scale), - alpha); -} - -template <> -void bias_relu_int8_nhwc4(int num, - const void* in, - const void* bias, - void* out, - int N, - int K, - int H, - int W, - const void* scale, - float alpha, - cudaStream_t stream) { - int thread = 256; - int block = (num + thread - 1) / thread; - bias_relu_int8_nhwc4_kernel<<>>( - num, - static_cast(in), - static_cast(bias), - static_cast(out), - N, - K, - H, - W, - static_cast(scale), - alpha); -} - -template <> -void relu_int8_nhwc4(int num, - const void* in, - void* out, - int N, - int K, - int H, - int W, - const void* scale, - float alpha, - cudaStream_t stream) { - int thread = 256; - int block = (num + thread - 1) / thread; - relu_int8_nhwc4_kernel<<>>( - num, - static_cast(in), - static_cast(out), - N, - K, - H, - W, - static_cast(scale), - alpha); -} - -template <> -void relu_int8_nhwc4(int num, - const void* in, - void* out, - int N, - int K, - int H, - int W, - const void* scale, - float alpha, - cudaStream_t stream) { - int thread = 256; - int block = (num + thread - 1) / thread; - relu_int8_nhwc4_kernel<<>>( - num, - static_cast(in), - static_cast(out), - N, - K, - H, - W, - static_cast(scale), - alpha); -} - -template -void relu(int num, const T* din, T* dout, float alpha, cudaStream_t stream) { - int thread = 256; - int block = (num + thread - 1) / thread; - relu_kernel<<>>(num, alpha, din, dout); - cudaError_t error = cudaGetLastError(); - if (error != cudaSuccess) std::cout << cudaGetErrorString(error); -} -template void relu(int, const float*, float*, float, cudaStream_t); - -} // namespace math -} // namespace cuda -} // namespace lite -} // namespace paddle diff --git 
a/lite/backends/cuda/math/activation.h b/lite/backends/cuda/math/activation.h deleted file mode 100644 index 7bcb1efdba..0000000000 --- a/lite/backends/cuda/math/activation.h +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include - -namespace paddle { -namespace lite { -namespace cuda { -namespace math { - -// fp32 -template -void relu(int num, const T* din, T* dout, float alpha, cudaStream_t stream); - -// For int8 -template -void bias_relu_int8_nhwc4(int num, - const void* in, - const void* bias, - void* out, - int N, - int K, - int H, - int W, - const void* scale, - float alpha, - cudaStream_t stream); - -template -void relu_int8_nhwc4(int num, - const void* in, - void* out, - int N, - int K, - int H, - int W, - const void* scale, - float alpha, - cudaStream_t stream); - -} // namespace math -} // namespace cuda -} // namespace lite -} // namespace paddle diff --git a/lite/backends/cuda/math/cudnn_conv.cc b/lite/backends/cuda/math/cudnn_conv.cc deleted file mode 100644 index ec7fac3187..0000000000 --- a/lite/backends/cuda/math/cudnn_conv.cc +++ /dev/null @@ -1,481 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
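The implementation below caches a single workspace allocation across create() calls and grows it only when the selected cuDNN algorithm needs more bytes. A stripped-down sketch of that pattern; WorkspaceCache is an illustrative name, not a type from this file:

#include <cstddef>
#include <cuda_runtime.h>

struct WorkspaceCache {
  void* data = nullptr;
  size_t bytes = 0;
  // Return a buffer of at least `needed` bytes, reallocating only on growth,
  // so repeated descriptor setups reuse one device allocation.
  void* require(size_t needed) {
    if (needed > bytes) {
      if (data != nullptr) cudaFree(data);
      cudaMalloc(&data, needed);
      bytes = needed;
    }
    return data;
  }
  ~WorkspaceCache() {
    if (data != nullptr) cudaFree(data);
  }
};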
- -#include "lite/backends/cuda/math/cudnn_conv.h" -#include "lite/backends/cuda/math/activation.h" -#include "lite/backends/cuda/math/scale.h" -#include "lite/backends/cuda/math/type_trans.h" - -namespace paddle { -namespace lite { -namespace cuda { -namespace math { - -template <> -bool CudnnConv2D::create(const operators::ConvParam& param, - Context* ctx) { - auto x_dims = param.x->dims(); - auto w_dims = param.filter->dims(); - auto o_dims = param.output->dims(); - int batch = x_dims[0]; - - int iw = x_dims[3]; // nchw - int ih = x_dims[2]; - int ic = x_dims[1]; - int ow = o_dims[3]; - int oh = o_dims[2]; - int oc = o_dims[1]; - int kw = w_dims[3]; - int kh = w_dims[2]; - int sw = param.strides[1]; - int sh = param.strides[0]; - int pw = param.paddings[1]; - int ph = param.paddings[0]; - int dw = param.dilations[1]; - int dh = param.dilations[0]; - - CHECK(ic % param.groups == 0) - << "The conv input channel shoud be divide group number."; - - CUDNN_CHECK(cudnnSetTensor4dDescriptor(this->input_desc_, - CUDNN_TENSOR_NCHW, - CUDNN_DATA_FLOAT, - batch, - ic, - ih, - iw)); - CUDNN_CHECK(cudnnSetFilter4dDescriptor(this->filter_desc_, - CUDNN_DATA_FLOAT, - CUDNN_TENSOR_NCHW, - oc, - ic / param.groups, - kh, - kw)); - CUDNN_CHECK(cudnnSetConvolution2dDescriptor(this->conv_desc_, - ph, - pw, - sh, - sw, - dh, - dw, - CUDNN_CROSS_CORRELATION, - CUDNN_DATA_FLOAT)); - CUDNN_CHECK(cudnnSetConvolutionGroupCount(this->conv_desc_, param.groups)); - CUDNN_CHECK(cudnnSetTensor4dDescriptor(this->output_desc_, - CUDNN_TENSOR_NCHW, - CUDNN_DATA_FLOAT, - batch, - oc, - oh, - ow)); - - if (param.activation_param.has_active && with_relu_act_) { - CUDNN_CHECK(cudnnSetActivationDescriptor( - this->act_desc_, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, 0.0)); - } - - if (ic == param.groups && ic == oc && ic != 1) { - this->fwd_algo_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; - } else { - CUDNN_CHECK( - cudnnGetConvolutionForwardAlgorithm(this->handle_, - this->input_desc_, - this->filter_desc_, - this->conv_desc_, - this->output_desc_, - this->preference_, - this->workspace_limit_bytes_, - &this->fwd_algo_)); - } - CUDNN_CHECK( - cudnnGetConvolutionForwardWorkspaceSize(this->handle_, - this->input_desc_, - this->filter_desc_, - this->conv_desc_, - this->output_desc_, - this->fwd_algo_, - &this->workspace_fwd_sizes_)); - if (this->workspace_fwd_sizes_ > this->workspace_size_inbytes_) { - this->workspace_size_inbytes_ = this->workspace_fwd_sizes_; - if (this->workspace_data_ != NULL) { - cudaFree(this->workspace_data_); - } - cudaMalloc(&this->workspace_data_, this->workspace_size_inbytes_); - this->workspace_ = reinterpret_cast(this->workspace_data_); - } - if (param.bias) { - int dim_bias[] = {1, oc, 1, 1}; - int stride_bias[] = {oc, 1, 1, 1}; - cudnnSetTensorNdDescriptor( - this->bias_desc_, CUDNN_DATA_FLOAT, 4, dim_bias, stride_bias); - } - return true; -} - -template <> -bool CudnnConv2D::init(const operators::ConvParam& param, - Context* ctx) { - this->workspace_size_inbytes_ = 0; - this->workspace_data_ = NULL; - this->workspace_fwd_sizes_ = 0; - - this->stream_ = ctx->exec_stream(); - CUDNN_CHECK(cudnnCreate(&this->handle_)); - CUDNN_CHECK(cudnnSetStream(this->handle_, this->stream_)); - - this->workspace_ = NULL; - - cudnnCreateTensorDescriptor(&this->input_desc_); - cudnnCreateTensorDescriptor(&this->output_desc_); - cudnnCreateFilterDescriptor(&this->filter_desc_); - cudnnCreateConvolutionDescriptor(&this->conv_desc_); - cudnnCreateTensorDescriptor(&this->bias_desc_); - - if 
(param.activation_param.has_active) { - if (param.activation_param.active_type == lite_api::ActivationType::kRelu) { - cudnnCreateActivationDescriptor(&this->act_desc_); - } else { - this->with_relu_act_ = false; - } - } - return create(param, ctx); -} - -template <> -bool CudnnConv2D::run(const operators::ConvParam& param) { - const auto* i_data = param.x->data(); - const auto* w_data = param.filter->data(); - const auto* b_data = param.bias ? param.bias->data() : nullptr; - auto* o_data = param.output->mutable_data(TARGET(kCUDA)); - - if (param.activation_param.has_active && with_relu_act_) { - if (b_data) { - float alpha = 1.0f; - float beta = 0.0f; - CUDNN_CHECK(cudnnConvolutionBiasActivationForward(handle_, - &alpha, - input_desc_, - i_data, - filter_desc_, - w_data, - conv_desc_, - fwd_algo_, - workspace_, - workspace_fwd_sizes_, - &beta, - output_desc_, - o_data, - bias_desc_, - b_data, - act_desc_, - output_desc_, - o_data)); - } else { - float alpha = 1.0f; - float beta = 0.0f; - CUDNN_CHECK(cudnnConvolutionForward(handle_, - &alpha, - input_desc_, - i_data, - filter_desc_, - w_data, - conv_desc_, - fwd_algo_, - workspace_, - workspace_fwd_sizes_, - &beta, - output_desc_, - o_data)); - - CUDNN_CHECK(cudnnActivationForward(handle_, - act_desc_, - &alpha, - output_desc_, - o_data, - &beta, - output_desc_, - o_data)); - } - } else { - float alpha = 1.0f; - float beta = 0.0f; - CUDNN_CHECK(cudnnConvolutionForward(handle_, - &alpha, - input_desc_, - i_data, - filter_desc_, - w_data, - conv_desc_, - fwd_algo_, - workspace_, - workspace_fwd_sizes_, - &beta, - output_desc_, - o_data)); - if (b_data) { - CUDNN_CHECK(cudnnAddTensor( - handle_, &alpha, bias_desc_, b_data, &alpha, output_desc_, o_data)); - } - } - - if (!with_relu_act_) { - CHECK(param.activation_param.active_type == - lite_api::ActivationType::kLeakyRelu) - << "Only support leaky relu now."; - auto out_dims = param.output->dims(); - int n = out_dims[0], c = out_dims[1], h = out_dims[2], w = out_dims[3]; - int num = n * h * w * c; - float alpha = param.activation_param.Leaky_relu_alpha; - - relu(num, o_data, o_data, alpha, this->stream_); - } - return true; -} - -template -bool CudnnConv2DInt8::create(const operators::ConvParam& param, - Context* ctx) { - auto x_dims = param.x->dims(); - auto w_dims = param.filter->dims(); - auto o_dims = param.output->dims(); - - int batch = x_dims[0]; - - int iw = x_dims[2]; // nchw - int ih = x_dims[1]; - int ic = x_dims[3]; - int ow = o_dims[2]; - int oh = o_dims[1]; - int oc = o_dims[3]; - - int kw = w_dims[2]; - int kh = w_dims[1]; - - int sw = param.strides[1]; - int sh = param.strides[0]; - int pw = param.paddings[1]; - int ph = param.paddings[0]; - int dw = param.dilations[1]; - int dh = param.dilations[0]; - - std::vector weight_scale = param.weight_scale; - float input_scale = param.input_scale; - float output_scale = param.output_scale; - CHECK(weight_scale.size() == oc) - << "the num of the weight_scale should be equals to the output channel."; - if (Ptype_out == PRECISION(kInt8)) { - this->temp_tensor_.Resize(o_dims); - this->temp_tensor_.template mutable_data(TARGET(kCUDA)); - for (int i = 0; i < weight_scale.size(); i++) { - weight_scale[i] = (weight_scale[i] * input_scale) / output_scale; - } - } else { - for (int i = 0; i < weight_scale.size(); i++) { - weight_scale[i] = (weight_scale[i] * input_scale); - } - } - this->scale_.Resize({oc}); - auto* scale_data = this->scale_.template mutable_data(TARGET(kCUDA)); - this->scale_.template Assign( - weight_scale.data(), 
-      this->scale_.dims());
-
-  CHECK(ic % param.groups == 0)
-      << "The conv input channel count should be divisible by the group "
-         "number.";
-  CUDNN_CHECK(cudnnSetTensor4dDescriptor(this->input_desc_,
-                                         CUDNN_TENSOR_NHWC,
-                                         CUDNN_DATA_INT8,
-                                         batch,
-                                         ic,
-                                         ih,
-                                         iw));
-  CUDNN_CHECK(cudnnSetFilter4dDescriptor(this->filter_desc_,
-                                         CUDNN_DATA_INT8,
-                                         CUDNN_TENSOR_NHWC,
-                                         oc,
-                                         ic / param.groups,
-                                         kh,
-                                         kw));
-  CUDNN_CHECK(cudnnSetConvolution2dDescriptor(this->conv_desc_,
-                                              ph,
-                                              pw,
-                                              sh,
-                                              sw,
-                                              dh,
-                                              dw,
-                                              CUDNN_CROSS_CORRELATION,
-                                              CUDNN_DATA_INT32));
-
-  CUDNN_CHECK(cudnnSetTensor4dDescriptor(this->output_desc_,
-                                         CUDNN_TENSOR_NHWC,
-                                         CUDNN_DATA_FLOAT,
-                                         batch,
-                                         oc,
-                                         oh,
-                                         ow));
-
-  this->fwd_algo_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
-  CUDNN_CHECK(
-      cudnnGetConvolutionForwardWorkspaceSize(this->handle_,
-                                              this->input_desc_,
-                                              this->filter_desc_,
-                                              this->conv_desc_,
-                                              this->output_desc_,
-                                              this->fwd_algo_,
-                                              &(this->workspace_fwd_sizes_)));
-
-  if (this->workspace_fwd_sizes_ > this->workspace_size_inbytes_) {
-    this->workspace_size_inbytes_ = this->workspace_fwd_sizes_;
-    if (this->workspace_data_ != NULL) {
-      cudaFree(this->workspace_data_);
-    }
-    cudaMalloc(&this->workspace_data_, this->workspace_size_inbytes_);
-    this->workspace_ = reinterpret_cast<char*>(this->workspace_data_);
-  }
-
-  return true;
-}
-
-template <PrecisionType Ptype_out>
-bool CudnnConv2DInt8<Ptype_out>::init(const operators::ConvParam& param,
-                                      Context<TARGET(kCUDA)>* ctx) {
-  this->workspace_size_inbytes_ = 0;
-  this->workspace_data_ = NULL;
-  this->workspace_fwd_sizes_ = 0;
-
-  this->stream_ = ctx->exec_stream();
-  CUDNN_CHECK(cudnnCreate(&this->handle_));
-  CUDNN_CHECK(cudnnSetStream(this->handle_, this->stream_));
-
-  this->workspace_ = NULL;
-
-  cudnnCreateTensorDescriptor(&this->input_desc_);
-  cudnnCreateTensorDescriptor(&this->output_desc_);
-  cudnnCreateFilterDescriptor(&this->filter_desc_);
-  cudnnCreateConvolutionDescriptor(&this->conv_desc_);
-  cudnnCreateTensorDescriptor(&this->bias_desc_);
-
-  if (param.activation_param.has_active) {
-    if (!(param.activation_param.active_type ==
-          lite_api::ActivationType::kRelu)) {
-      this->with_relu_act_ = false;
-    }
-  }
-  return create(param, ctx);
-}
-
-template <PrecisionType Ptype_out>
-bool CudnnConv2DInt8<Ptype_out>::run(const operators::ConvParam& param) {
-  const auto* i_data = param.x->data<int8_t>();
-  const auto* w_data = param.filter->data<int8_t>();
-  const auto* b_data = param.bias ? param.bias->data<float>() : nullptr;
-  float* temp_out;
-  float* scale = this->scale_.template mutable_data<float>(TARGET(kCUDA));
-  if (Ptype_out == PRECISION(kInt8)) {
-    temp_out = this->temp_tensor_.template mutable_data<float>(TARGET(kCUDA));
-  } else {
-    temp_out = param.output->mutable_data<float>(TARGET(kCUDA));
-  }
-
-  float alpha = 1.0f;
-  float beta = 0.0f;
-  CUDNN_CHECK(cudnnConvolutionForward(this->handle_,
-                                      &alpha,
-                                      this->input_desc_,
-                                      i_data,
-                                      this->filter_desc_,
-                                      w_data,
-                                      this->conv_desc_,
-                                      this->fwd_algo_,
-                                      this->workspace_,
-                                      this->workspace_fwd_sizes_,
-                                      &beta,
-                                      this->output_desc_,
-                                      temp_out));
-
-  auto out_dims = param.output->dims();
-  int n = out_dims[0], h = out_dims[1], w = out_dims[2], c = out_dims[3];
-  int num = n * h * w * c / 4;
-
-  if (!param.activation_param.has_active && !b_data) {
-    if (Ptype_out == PRECISION(kInt8)) {
-      auto* out = param.output->mutable_data<int8_t>(TARGET(kCUDA));
-      fp32_to_int8_nhwc4(num,
-                         static_cast<const void*>(temp_out),
-                         static_cast<void*>(out),
-                         static_cast<const void*>(scale),
-                         n,
-                         c / 4,
-                         h,
-                         w,
-                         this->stream_);
-    } else {
-      fp32_scale_nhwc4(num,
-                       static_cast<const void*>(temp_out),
-                       static_cast<void*>(temp_out),
-                       static_cast<const void*>(scale),
-                       n,
-                       c / 4,
-                       h,
-                       w,
-                       this->stream_);
-    }
-    return true;
-  }
-
-  if (b_data) {
-    if (param.activation_param.has_active) {
-      float alpha = 0.0;
-      if (!this->with_relu_act_)
-        alpha = param.activation_param.Leaky_relu_alpha;
-      if (Ptype_out == PRECISION(kInt8)) {
-        auto* out = param.output->mutable_data<int8_t>(TARGET(kCUDA));
-        bias_relu_int8_nhwc4(num,
-                             static_cast<const void*>(temp_out),
-                             static_cast<const void*>(b_data),
-                             static_cast<void*>(out),
-                             n,
-                             c / 4,
-                             h,
-                             w,
-                             static_cast<const void*>(scale),
-                             alpha,
-                             this->stream_);
-      } else {
-        bias_relu_int8_nhwc4(num,
-                             static_cast<const void*>(temp_out),
-                             static_cast<const void*>(b_data),
-                             static_cast<void*>(temp_out),
-                             n,
-                             c / 4,
-                             h,
-                             w,
-                             static_cast<const void*>(scale),
-                             alpha,
-                             this->stream_);
-      }
-      return true;
-    }
-  }
-
-  CHECK(false) << "Conv int8 only supports conv, conv + bias + relu, and "
-                  "conv + bias + leaky_relu.";
-}
-
-template class CudnnConv2DInt8<PRECISION(kInt8)>;
-template class CudnnConv2DInt8<PRECISION(kFloat)>;
-
-}  // namespace math
-}  // namespace cuda
-}  // namespace lite
-}  // namespace paddle
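Note how the int8 path folds quantization into a single per-channel multiplier before launch: for an int8 output the effective scale is weight_scale * input_scale / output_scale, while for a float output the division by output_scale is dropped. A host-side sketch of that folding, assuming nothing beyond what create() above shows (FoldConvScales is an illustrative name, not part of this patch):

#include <vector>

// Mirror of the scale preparation in CudnnConv2DInt8::create().
std::vector<float> FoldConvScales(const std::vector<float>& weight_scale,
                                  float input_scale,
                                  float output_scale,
                                  bool int8_output) {
  std::vector<float> folded(weight_scale.size());
  for (size_t i = 0; i < weight_scale.size(); ++i) {
    const float s = weight_scale[i] * input_scale;
    folded[i] = int8_output ? s / output_scale : s;
  }
  return folded;
}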
diff --git a/lite/backends/cuda/math/cudnn_conv.h b/lite/backends/cuda/math/cudnn_conv.h
deleted file mode 100644
index 03612a5e5a..0000000000
--- a/lite/backends/cuda/math/cudnn_conv.h
+++ /dev/null
@@ -1,132 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include
-#include
-#include
-#include "lite/api/paddle_place.h"
-#include "lite/backends/cuda/cuda_utils.h"
-#include "lite/core/context.h"
-#include "lite/core/target_wrapper.h"
-#include "lite/operators/op_params.h"
-
-namespace paddle {
-namespace lite {
-namespace cuda {
-namespace math {
-
-template <PrecisionType Ptype_out>
-class CudnnConv2DBase {
- public:
-  CudnnConv2DBase()
-      : handle_(NULL),
-        workspace_data_(NULL),
-        workspace_(NULL),
-        conv_desc_(NULL),
-        input_desc_(NULL),
-        output_desc_(NULL),
-        filter_desc_(NULL),
-        act_desc_(NULL),
-        bias_desc_(NULL),
-        workspace_fwd_sizes_(0),
-        workspace_size_inbytes_(0),
-        fwd_algo_((cudnnConvolutionFwdAlgo_t)0) {}
-
-  ~CudnnConv2DBase() {
-    if (conv_desc_) {
-      CUDNN_CHECK(cudnnDestroyConvolutionDescriptor(conv_desc_));
-    }
-    if (input_desc_) {
-      CUDNN_CHECK(cudnnDestroyTensorDescriptor(input_desc_));
-    }
-    if (output_desc_) {
-      CUDNN_CHECK(cudnnDestroyTensorDescriptor(output_desc_));
-    }
-    if (act_desc_) {
-      CUDNN_CHECK(cudnnDestroyActivationDescriptor(act_desc_));
-    }
-    if (bias_desc_) {
-      CUDNN_CHECK(cudnnDestroyTensorDescriptor(bias_desc_));
-    }
-    if (filter_desc_) {
-      CUDNN_CHECK(cudnnDestroyFilterDescriptor(filter_desc_));
-    }
-    if (handle_ != NULL) {
-      CUDNN_CHECK(cudnnDestroy(handle_));
-    }
-    if (workspace_data_ != NULL) {
-      cudaFree(workspace_data_);
-    }
-  }
-
- protected:
-  cudaStream_t stream_;
-  cudnnHandle_t handle_;
-  cudnnConvolutionFwdAlgo_t fwd_algo_;
-  cudnnTensorDescriptor_t input_desc_;
-  cudnnTensorDescriptor_t output_desc_;
-  cudnnTensorDescriptor_t bias_desc_;
-  cudnnFilterDescriptor_t filter_desc_;
-  cudnnConvolutionDescriptor_t conv_desc_;
-
-  // activation descriptor
-  cudnnActivationDescriptor_t act_desc_;
-  bool with_relu_act_{true};
-
-  size_t workspace_fwd_sizes_;
-  size_t workspace_size_inbytes_;  // size of underlying storage
-  void* workspace_data_;           // underlying storage
-  void* workspace_;                // aliases into workspace_data_
-
-  const bool use_tensor_core_ = true;
-  const size_t workspace_limit_bytes_ = 4 * 1024 * 1024;
-  const cudnnConvolutionFwdPreference_t preference_ =
-      CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
-
-  // For int8
-  Tensor temp_tensor_;
-  Tensor scale_;
-};
-
-template <PrecisionType Ptype_out>
-class CudnnConv2D : public CudnnConv2DBase<Ptype_out> {
- public:
-  CudnnConv2D() : CudnnConv2DBase<Ptype_out>() {}
-  virtual bool init(const operators::ConvParam& param,
-                    Context<TARGET(kCUDA)>* ctx);
-
-  virtual bool create(const operators::ConvParam& param,
-                      Context<TARGET(kCUDA)>* ctx);
-
-  virtual bool run(const operators::ConvParam& param);
-};
-
-template <PrecisionType Ptype_out>
-class CudnnConv2DInt8 : CudnnConv2DBase<Ptype_out> {
- public:
-  CudnnConv2DInt8() : CudnnConv2DBase<Ptype_out>() {}
-  virtual bool init(const operators::ConvParam& param,
-                    Context<TARGET(kCUDA)>* ctx);
-
-  virtual bool create(const operators::ConvParam& param,
-                      Context<TARGET(kCUDA)>* ctx);
-
-  virtual bool run(const operators::ConvParam& param);
-};
-
-}  // namespace math
-}  // namespace cuda
-}  // namespace lite
-}  // namespace paddle
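The base class above keeps a grow-only workspace: create() reallocates only when cuDNN requests more forward workspace than is currently held, and the destructor frees it once. A standalone sketch of that pattern (the Workspace name is illustrative, not part of this patch):

#include <cstddef>
#include <cuda_runtime.h>

// Grow-only device workspace, as managed by CudnnConv2DBase.
struct Workspace {
  void* data = nullptr;
  size_t capacity = 0;
  void* Require(size_t bytes) {
    if (bytes > capacity) {
      if (data != nullptr) cudaFree(data);
      cudaMalloc(&data, bytes);
      capacity = bytes;
    }
    return data;
  }
  ~Workspace() {
    if (data != nullptr) cudaFree(data);
  }
};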
diff --git a/lite/backends/cuda/math/cudnn_helper.h b/lite/backends/cuda/math/cudnn_helper.h
deleted file mode 100644
index b7f9b2cf69..0000000000
--- a/lite/backends/cuda/math/cudnn_helper.h
+++ /dev/null
@@ -1,24 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <cudnn.h>
-
-namespace paddle {
-namespace lite {
-namespace cuda {
-namespace math {}  // namespace math
-}  // namespace cuda
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/cuda/math/scale.cu b/lite/backends/cuda/math/scale.cu
deleted file mode 100644
index cc49d0403d..0000000000
--- a/lite/backends/cuda/math/scale.cu
+++ /dev/null
@@ -1,74 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <iostream>
-#include "lite/backends/cuda/math/scale.h"
-#include "lite/backends/cuda/math/utils.h"
-
-namespace paddle {
-namespace lite {
-namespace cuda {
-namespace math {
-
-__global__ void fp32_scale_nhwc4_kernel(int num,
-                                        const float4* in,
-                                        float4* out,
-                                        const float4* scale,
-                                        int N,
-                                        int K,
-                                        int H,
-                                        int W) {
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  if (tid < num) {
-    int scale_idx = tid % K;
-    const float4 scale_ptr = scale[scale_idx];
-    const float4 in_ptr = in[tid];
-    float4 packed_val;
-
-    packed_val.x = in_ptr.x * scale_ptr.x;
-    packed_val.y = in_ptr.y * scale_ptr.y;
-    packed_val.z = in_ptr.z * scale_ptr.z;
-    packed_val.w = in_ptr.w * scale_ptr.w;
-    out[tid] = packed_val;
-  }
-}
-
-void fp32_scale_nhwc4(int num,
-                      const void* in,
-                      void* out,
-                      const void* scale,
-                      int N,
-                      int K,
-                      int H,
-                      int W,
-                      cudaStream_t stream) {
-  int thread = 256;
-  int block = (num + thread - 1) / thread;
-  fp32_scale_nhwc4_kernel<<<block, thread, 0, stream>>>(
-      num,
-      static_cast<const float4*>(in),
-      static_cast<float4*>(out),
-      static_cast<const float4*>(scale),
-      N,
-      K,
-      H,
-      W);
-  cudaError_t error = cudaGetLastError();
-  if (error != cudaSuccess) std::cout << cudaGetErrorString(error);
-}
-
-}  // namespace math
-}  // namespace cuda
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/cuda/math/scale.h b/lite/backends/cuda/math/scale.h
deleted file mode 100644
index e96b864c92..0000000000
--- a/lite/backends/cuda/math/scale.h
+++ /dev/null
@@ -1,37 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
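fp32_scale_nhwc4 above treats the NHWC buffer as packed float4 vectors: num is the element count divided by 4 and K is C/4, so each thread rescales one group of four adjacent channels. A hedged usage sketch (ScaleNhwcInPlace and the pointer names are illustrative):

#include <cuda_runtime.h>
#include "lite/backends/cuda/math/scale.h"

// Rescale an NHWC fp32 tensor of shape {n, h, w, c} in place; requires
// c % 4 == 0. d_data and d_scale are device pointers; d_scale holds c floats.
void ScaleNhwcInPlace(float* d_data, const float* d_scale,
                      int n, int c, int h, int w, cudaStream_t stream) {
  const int num = n * h * w * c / 4;  // one float4 per thread
  paddle::lite::cuda::math::fp32_scale_nhwc4(
      num, d_data, d_data, d_scale, n, c / 4, h, w, stream);
}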
- -#pragma once -#include -#include - -namespace paddle { -namespace lite { -namespace cuda { -namespace math { - -void fp32_scale_nhwc4(int num, - const void* din, - void* dout, - const void* scale, - int N, - int K, - int H, - int W, - cudaStream_t stream); - -} // namespace math -} // namespace cuda -} // namespace lite -} // namespace paddle diff --git a/lite/backends/cuda/math/transpose.cu b/lite/backends/cuda/math/transpose.cu deleted file mode 100644 index 6467f00307..0000000000 --- a/lite/backends/cuda/math/transpose.cu +++ /dev/null @@ -1,191 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/cuda/math/transpose.h" -#include "lite/backends/cuda/math/utils.h" - -namespace paddle { -namespace lite { -namespace cuda { -namespace math { - -constexpr int kTileDim = 32; -constexpr int kBlockRows = 8; -constexpr int CUDA_NUM_THREADS = 128; - -// Splits the original matrix into submatrices with size 32 * 32. -// Reference https://devblogs.nvidia.com/efficient-matrix-transpose-cuda-cc/ -template -__global__ void BatchTranspose2DCUDAKernel(const int N, - const int H, - const int W, - const int dh, - const int dw, - const T* input, - T* out) { - __shared__ T tile[kTileDim][kTileDim + 1]; // plus 1 to prevent bank confict. 
- const int n = blockIdx.x / (dh * dw); - const int k = blockIdx.x % (dh * dw); - const int r = k / dw; - const int c = k % dw; - const int offset = n * H * W; - int x = c * kTileDim + threadIdx.x; - int y = r * kTileDim + threadIdx.y; - if (x < W) { - for (int i = 0; threadIdx.y + i < kTileDim && y + i < H; i += kBlockRows) { -#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__) - tile[threadIdx.y + i][threadIdx.x] = - __ldg(input + offset + (y + i) * W + x); -#else - tile[threadIdx.y + i][threadIdx.x] = input[offset + (y + i) * W + x]; -#endif - } - } - __syncthreads(); - x = r * kTileDim + threadIdx.x; - y = c * kTileDim + threadIdx.y; - if (x < H) { - for (int i = 0; threadIdx.y + i < kTileDim && y + i < W; i += kBlockRows) { - out[offset + (y + i) * H + x] = tile[threadIdx.x][threadIdx.y + i]; - } - } -} - -template -void BatchTranspose2DCUDAImpl(const int N, - const int H, - const int W, - const T* input, - T* out, - CUDAContext* ctx) { - const int dh = (H + kTileDim - 1) / kTileDim; - const int dw = (W + kTileDim - 1) / kTileDim; - BatchTranspose2DCUDAKernel< - T><<exec_stream()>>>( - N, H, W, dh, dw, input, out); - cudaError_t error = cudaGetLastError(); - if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); -} - -#define TYPE_SPECIALIZED_CUDA_NCHW2NHWC(T) \ - template <> \ - void NCHW2NHWC(const int N, \ - const int C, \ - const int HxW, \ - const T* X, \ - T* Y, \ - CUDAContext* ctx) { \ - BatchTranspose2DCUDAImpl(N, C, HxW, X, Y, ctx); \ - } -TYPE_SPECIALIZED_CUDA_NCHW2NHWC(float) -#undef TYPE_SPECIALIZED_CUDA_NCHW2NHWC - -#define TYPE_SPECIALIZED_CUDA_NHWC2NCHW(T) \ - template <> \ - void NHWC2NCHW(const int N, \ - const int C, \ - const int HxW, \ - const T* X, \ - T* Y, \ - CUDAContext* ctx) { \ - BatchTranspose2DCUDAImpl(N, HxW, C, X, Y, ctx); \ - } -TYPE_SPECIALIZED_CUDA_NHWC2NCHW(float) -#undef TYPE_SPECIALIZED_CUDA_NHWC2NCHW - -template -__global__ void TransposeCUDAKernel(const int size, - const int ndim, - const int* X_strides, - const int* Y_dims, - const T* X, - T* Y) { - const int Y_index = blockIdx.x * CUDA_NUM_THREADS + threadIdx.x; - if (Y_index < size) { - int X_index = 0; - int v = Y_index; -#pragma unroll - for (int i = ndim - 1; i >= 0; --i) { - X_index += v % Y_dims[i] * X_strides[i]; - v /= Y_dims[i]; - } -#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__) - Y[Y_index] = __ldg(X + X_index); -#else - Y[Y_index] = X[X_index]; -#endif - } -} - -template -void TransposeCUDAImpl(const std::vector& X_dims, - const std::vector& axes, - const T* X, - T* Y, - CUDAContext* ctx) { - CHECK_EQ(X_dims.size(), axes.size()) << "dimension size should be equal"; - int ndim = X_dims.size(); - std::vector strides(ndim, 0); - std::vector Y_dims(ndim, 0); - std::vector buf(ndim, 0); - int cur_stride = 1; - for (int i = ndim - 1; i >= 0; --i) { - buf[i] = cur_stride; - cur_stride *= X_dims[i]; - } - for (int i = 0; i < ndim; ++i) { - strides[i] = buf[axes[i]]; - } - int size = 1; - for (int i = 0; i < ndim; ++i) { - Y_dims[i] = static_cast(X_dims[axes[i]]); - size *= X_dims[i]; - } - - lite::Tensor Y_dims_, strides_; - Y_dims_.Resize(std::vector({ndim})); - int* d_y_dims = Y_dims_.mutable_data(TARGET(kCUDA)); - CopySync( - d_y_dims, Y_dims.data(), sizeof(int) * Y_dims.size(), IoDirection::HtoD); - - strides_.Resize(std::vector({ndim})); - int* d_strides = strides_.mutable_data(TARGET(kCUDA)); - CopySync(d_strides, - strides.data(), - sizeof(int) * strides.size(), - IoDirection::HtoD); - - const int M = (size + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; - 
TransposeCUDAKernel<<exec_stream()>>>( - size, ndim, d_strides, d_y_dims, X, Y); - // cudaError_t error = cudaGetLastError(); - // if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); -} - -#define TYPE_SPECIALIZED_CUDA_TRANSPOSE(T) \ - template <> \ - void Transpose(const std::vector& X_dims, \ - const std::vector& axes, \ - const T* X, \ - T* Y, \ - CUDAContext* ctx) { \ - TransposeCUDAImpl(X_dims, axes, X, Y, ctx); \ - } -TYPE_SPECIALIZED_CUDA_TRANSPOSE(float) -#undef TYPE_SPECIALIZED_CUDA_TRANSPOSEF - -} // namespace math -} // namespace cuda -} // namespace lite -} // namespace paddle diff --git a/lite/backends/cuda/math/transpose.h b/lite/backends/cuda/math/transpose.h deleted file mode 100644 index ba2464547b..0000000000 --- a/lite/backends/cuda/math/transpose.h +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include -#include -#include "lite/core/context.h" -#include "lite/core/tensor.h" - -namespace paddle { -namespace lite { -namespace cuda { -namespace math { - -template -void NCHW2NHWC(int N, int C, int HxW, const T* X, T* Y, CUDAContext* context); - -template -void NHWC2NCHW(int N, int C, int HxW, const T* X, T* Y, CUDAContext* context); - -template -void Transpose(const std::vector& X_dims, - const std::vector& axes, - const T* X, - T* Y, - CUDAContext* ctx); - -} // namespace math -} // namespace cuda -} // namespace lite -} // namespace paddle diff --git a/lite/backends/cuda/math/type_trans.cu b/lite/backends/cuda/math/type_trans.cu deleted file mode 100644 index 6636f98840..0000000000 --- a/lite/backends/cuda/math/type_trans.cu +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
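The generic transpose above precomputes, for every output axis, the stride of the corresponding input axis; TransposeCUDAKernel then decomposes each output linear index digit by digit and accumulates the matching input offset. A CPU reference of the same index math, assuming only what the kernel shows (TransposeReference is an illustrative name):

#include <vector>

template <typename T>
void TransposeReference(const std::vector<int>& x_dims,
                        const std::vector<int>& axes,
                        const T* x, T* y) {
  const int ndim = static_cast<int>(x_dims.size());
  std::vector<int> x_strides(ndim), y_dims(ndim), buf(ndim);
  int stride = 1, size = 1;
  for (int i = ndim - 1; i >= 0; --i) {
    buf[i] = stride;  // stride of input axis i
    stride *= x_dims[i];
  }
  for (int i = 0; i < ndim; ++i) {
    x_strides[i] = buf[axes[i]];  // stride of the source axis of output axis i
    y_dims[i] = x_dims[axes[i]];
    size *= x_dims[i];
  }
  for (int y_index = 0; y_index < size; ++y_index) {
    int x_index = 0, v = y_index;
    for (int i = ndim - 1; i >= 0; --i) {
      x_index += (v % y_dims[i]) * x_strides[i];
      v /= y_dims[i];
    }
    y[y_index] = x[x_index];
  }
}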
- -#include "lite/backends/cuda/math/type_trans.h" -#include "lite/backends/cuda/math/utils.h" - -namespace paddle { -namespace lite { -namespace cuda { -namespace math { - -__global__ void fp32_scale_nhwc4_kernel(int num, - const float4* in, - char4* out, - const float4* scale, - int N, - int K, - int H, - int W) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid < num) { - int scale_idx = tid % K; - const float4 scale_ptr = scale[scale_idx]; - const float4 in_ptr = in[tid]; - char4 result_val; - - result_val.x = from_float(in_ptr.x * scale_ptr.x); - result_val.y = from_float(in_ptr.y * scale_ptr.y); - result_val.z = from_float(in_ptr.z * scale_ptr.z); - result_val.w = from_float(in_ptr.w * scale_ptr.w); - out[tid] = result_val; - } -} - -void fp32_to_int8_nhwc4(int num, - const void* in, - void* out, - const void* scale, - int N, - int K, - int H, - int W, - cudaStream_t stream) { - int thread = 256; - int block = (num + thread - 1) / thread; - fp32_scale_nhwc4_kernel<<>>( - num, - static_cast(in), - static_cast(out), - static_cast(scale), - N, - K, - H, - W); -} - -} // namespace math -} // namespace cuda -} // namespace lite -} // namespace paddle diff --git a/lite/backends/cuda/math/type_trans.h b/lite/backends/cuda/math/type_trans.h deleted file mode 100644 index b83830f10a..0000000000 --- a/lite/backends/cuda/math/type_trans.h +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include - -namespace paddle { -namespace lite { -namespace cuda { -namespace math { - -void fp32_to_int8_nhwc4(int num, - const void* din, - void* dout, - const void* scale, - int N, - int K, - int H, - int W, - cudaStream_t stream); - -} // namespace math -} // namespace cuda -} // namespace lite -} // namespace paddle diff --git a/lite/backends/cuda/math/utils.h b/lite/backends/cuda/math/utils.h deleted file mode 100644 index b4cd82fd8d..0000000000 --- a/lite/backends/cuda/math/utils.h +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once -#include -#include -#include -#include -#include -#include - -namespace paddle { -namespace lite { -namespace cuda { -namespace math { - -template -__device__ T from_float(float x); - -template <> -__device__ __forceinline__ float from_float(float x) { - return x; -} - -template <> -__device__ __forceinline__ half from_float(float x) { - return __float2half(x); -} - -template <> -__device__ __forceinline__ int8_t from_float(float x) { - x = fmaxf(x, std::numeric_limits::min()); - x = fminf(x, std::numeric_limits::max()); - return __float2int_rn(x); -} - -} // namespace math -} // namespace cuda -} // namespace lite -} // namespace paddle diff --git a/lite/backends/cuda/target_wrapper.cc b/lite/backends/cuda/target_wrapper.cc deleted file mode 100644 index b1aaadf027..0000000000 --- a/lite/backends/cuda/target_wrapper.cc +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/cuda/target_wrapper.h" - -namespace paddle { -namespace lite { - -size_t TargetWrapperCuda::num_devices() { - int count = 0; - cudaGetDeviceCount(&count); - return count; -} - -void* TargetWrapperCuda::Malloc(size_t size) { - void* ptr{}; - CHECK_EQ(cudaSuccess, cudaMalloc(&ptr, size)); - return ptr; -} - -void TargetWrapperCuda::Free(void* ptr) { - CHECK_EQ(cudaSuccess, cudaFree(ptr)); -} - -void TargetWrapperCuda::MemcpySync(void* dst, - const void* src, - size_t size, - IoDirection dir) { - switch (dir) { - case IoDirection::DtoD: - CHECK(cudaSuccess == - cudaMemcpy(dst, src, size, cudaMemcpyDeviceToDevice)); - break; - case IoDirection::HtoD: - CHECK(cudaSuccess == cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice)); - break; - case IoDirection::DtoH: - CHECK(cudaSuccess == cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost)); - break; - default: - LOG(FATAL) << "Unsupported IoDirection " << static_cast(dir); - } -} - -void TargetWrapperCuda::MemcpyAsync(void* dst, - const void* src, - size_t size, - IoDirection dir, - const stream_t& stream) { - switch (dir) { - case IoDirection::DtoD: - CHECK(cudaSuccess == - cudaMemcpyAsync(dst, src, size, cudaMemcpyDeviceToDevice, stream)); - break; - case IoDirection::HtoD: - CHECK(cudaSuccess == - cudaMemcpyAsync(dst, src, size, cudaMemcpyHostToDevice, stream)); - break; - case IoDirection::DtoH: - CHECK(cudaSuccess == - cudaMemcpyAsync(dst, src, size, cudaMemcpyDeviceToHost, stream)); - break; - default: - LOG(FATAL) << "Unsupported IoDirection " << static_cast(dir); - } -} - -} // namespace lite -} // namespace paddle diff --git a/lite/backends/cuda/target_wrapper.h b/lite/backends/cuda/target_wrapper.h deleted file mode 100644 index 50063007ce..0000000000 --- a/lite/backends/cuda/target_wrapper.h +++ /dev/null @@ -1,64 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include "lite/core/target_wrapper.h" - -namespace paddle { -namespace lite { - -using TargetWrapperCuda = TargetWrapper; - -template <> -class TargetWrapper { - public: - using stream_t = cudaStream_t; - using event_t = cudaEvent_t; - - static size_t num_devices(); - static size_t maximum_stream() { return 0; } - - static size_t GetCurDevice() { - int dev_id; - cudaGetDevice(&dev_id); - return dev_id; - } - static void CreateStream(stream_t* stream) {} - static void DestroyStream(const stream_t& stream) {} - - static void CreateEvent(event_t* event) {} - static void DestroyEvent(const event_t& event) {} - - static void RecordEvent(const event_t& event) {} - static void SyncEvent(const event_t& event) {} - - static void StreamSync(const stream_t& stream) {} - - static void* Malloc(size_t size); - static void Free(void* ptr); - - static void MemcpySync(void* dst, - const void* src, - size_t size, - IoDirection dir); - static void MemcpyAsync(void* dst, - const void* src, - size_t size, - IoDirection dir, - const stream_t& stream); -}; -} // namespace lite -} // namespace paddle diff --git a/lite/backends/fpga/CMakeLists.txt b/lite/backends/fpga/CMakeLists.txt deleted file mode 100644 index b12fd85caf..0000000000 --- a/lite/backends/fpga/CMakeLists.txt +++ /dev/null @@ -1,15 +0,0 @@ -if (NOT LITE_WITH_FPGA) - return() -endif() - -set(LITE_FPGA_KD_PATH "${PADDLE_SOURCE_DIR}/lite/backends/fpga/KD") -set(LITE_FPGA_PATH "${PADDLE_SOURCE_DIR}/lite/backends/fpga") - -message("fpga_kd_path ${LITE_FPGA_KD_PATH}") -message("fpga_path ${LITE_FPGA_PATH}") -file(GLOB_RECURSE KD_CPP *.cpp *.cc) -file(GLOB FPGA_CPP "${LITE_FPGA_PATH}/*.cc") - -cc_library(kernel_fpga SRCS ${KD_CPP} ${FPGA_CPP}) -cc_library(lite_tensor_fpga SRCS lite_tensor.cc DEPS memory) -cc_library(fpga_target_wrapper SRCS ${LITE_FPGA_PATH}/target_wrapper.cc DEPS kernel_fpga) diff --git a/lite/backends/fpga/KD/alignment.h b/lite/backends/fpga/KD/alignment.h deleted file mode 100644 index 5cca79885c..0000000000 --- a/lite/backends/fpga/KD/alignment.h +++ /dev/null @@ -1,26 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include - -#include "lite/backends/fpga/KD/llapi/zynqmp_api.h" - -namespace paddle { -namespace zynqmp { - -inline int align_image(int wc) { return align_to_x(wc, IMAGE_ALIGNMENT); } - -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/context.hpp b/lite/backends/fpga/KD/context.hpp deleted file mode 100644 index 86109a4d1e..0000000000 --- a/lite/backends/fpga/KD/context.hpp +++ /dev/null @@ -1,50 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "lite/backends/fpga/KD/pe.hpp" -#include "lite/backends/fpga/KD/pes/conv_pe.hpp" -#include "lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp" -#include "lite/backends/fpga/KD/pes/fully_connected_pe.hpp" -#include "lite/backends/fpga/KD/pes/input_pe.hpp" -#include "lite/backends/fpga/KD/pes/output_pe.hpp" -#include "lite/backends/fpga/KD/pes/pooling_pe.hpp" -#include "lite/backends/fpga/KD/pes/softmax_pe.hpp" - -namespace paddle { -namespace zynqmp { - -class Context { - public: - template - Ptype& pe() { - if (pe_ == nullptr) { - pe_ = new Ptype(); - } - return static_cast(*pe_); - } - - ~Context() { - if (pe_ != nullptr) { - delete pe_; - } - } - - private: - PE* pe_ = nullptr; -}; -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/dl_engine.cpp b/lite/backends/fpga/KD/dl_engine.cpp deleted file mode 100644 index 9849e4275b..0000000000 --- a/lite/backends/fpga/KD/dl_engine.cpp +++ /dev/null @@ -1,27 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "lite/backends/fpga/KD/dl_engine.hpp" -namespace paddle { -namespace zynqmp { - -DLEngine::DLEngine() { - open_device(); - struct DeviceInfo info; - int ret = get_device_info(info); - filter::set_filter_capacity(info.filter_cap); -} - -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/dl_engine.hpp b/lite/backends/fpga/KD/dl_engine.hpp deleted file mode 100644 index 829f41dfeb..0000000000 --- a/lite/backends/fpga/KD/dl_engine.hpp +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include - -#include "lite/backends/fpga/KD/llapi/filter.h" -#include "lite/backends/fpga/KD/llapi/zynqmp_api.h" - -namespace paddle { -namespace zynqmp { - -class DLEngine { - public: - static DLEngine& get_instance() { - static DLEngine s_instance; - return s_instance; - } - - private: - DLEngine(); -}; -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/float16.hpp b/lite/backends/fpga/KD/float16.hpp deleted file mode 100755 index 9f12317196..0000000000 --- a/lite/backends/fpga/KD/float16.hpp +++ /dev/null @@ -1,508 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include - -namespace paddle { -namespace zynqmp { - -typedef uint16_t float16; - -static const uint32_t mantissatable[2048] = { - 0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34a00000, - 0x34c00000, 0x34e00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000, - 0x35400000, 0x35500000, 0x35600000, 0x35700000, 0x35800000, 0x35880000, - 0x35900000, 0x35980000, 0x35a00000, 0x35a80000, 0x35b00000, 0x35b80000, - 0x35c00000, 0x35c80000, 0x35d00000, 0x35d80000, 0x35e00000, 0x35e80000, - 0x35f00000, 0x35f80000, 0x36000000, 0x36040000, 0x36080000, 0x360c0000, - 0x36100000, 0x36140000, 0x36180000, 0x361c0000, 0x36200000, 0x36240000, - 0x36280000, 0x362c0000, 0x36300000, 0x36340000, 0x36380000, 0x363c0000, - 0x36400000, 0x36440000, 0x36480000, 0x364c0000, 0x36500000, 0x36540000, - 0x36580000, 0x365c0000, 0x36600000, 0x36640000, 0x36680000, 0x366c0000, - 0x36700000, 0x36740000, 0x36780000, 0x367c0000, 0x36800000, 0x36820000, - 0x36840000, 0x36860000, 0x36880000, 0x368a0000, 0x368c0000, 0x368e0000, - 0x36900000, 0x36920000, 0x36940000, 0x36960000, 0x36980000, 0x369a0000, - 0x369c0000, 0x369e0000, 0x36a00000, 0x36a20000, 0x36a40000, 0x36a60000, - 0x36a80000, 0x36aa0000, 0x36ac0000, 0x36ae0000, 0x36b00000, 0x36b20000, - 0x36b40000, 0x36b60000, 0x36b80000, 0x36ba0000, 0x36bc0000, 0x36be0000, - 0x36c00000, 0x36c20000, 0x36c40000, 0x36c60000, 0x36c80000, 0x36ca0000, - 0x36cc0000, 0x36ce0000, 0x36d00000, 0x36d20000, 0x36d40000, 0x36d60000, - 0x36d80000, 0x36da0000, 0x36dc0000, 0x36de0000, 0x36e00000, 0x36e20000, - 0x36e40000, 0x36e60000, 0x36e80000, 0x36ea0000, 0x36ec0000, 0x36ee0000, - 0x36f00000, 0x36f20000, 0x36f40000, 0x36f60000, 0x36f80000, 0x36fa0000, - 0x36fc0000, 0x36fe0000, 0x37000000, 0x37010000, 0x37020000, 0x37030000, - 0x37040000, 0x37050000, 0x37060000, 0x37070000, 0x37080000, 0x37090000, - 0x370a0000, 0x370b0000, 0x370c0000, 0x370d0000, 0x370e0000, 0x370f0000, - 0x37100000, 0x37110000, 0x37120000, 
0x37130000, 0x37140000, 0x37150000, - 0x37160000, 0x37170000, 0x37180000, 0x37190000, 0x371a0000, 0x371b0000, - 0x371c0000, 0x371d0000, 0x371e0000, 0x371f0000, 0x37200000, 0x37210000, - 0x37220000, 0x37230000, 0x37240000, 0x37250000, 0x37260000, 0x37270000, - 0x37280000, 0x37290000, 0x372a0000, 0x372b0000, 0x372c0000, 0x372d0000, - 0x372e0000, 0x372f0000, 0x37300000, 0x37310000, 0x37320000, 0x37330000, - 0x37340000, 0x37350000, 0x37360000, 0x37370000, 0x37380000, 0x37390000, - 0x373a0000, 0x373b0000, 0x373c0000, 0x373d0000, 0x373e0000, 0x373f0000, - 0x37400000, 0x37410000, 0x37420000, 0x37430000, 0x37440000, 0x37450000, - 0x37460000, 0x37470000, 0x37480000, 0x37490000, 0x374a0000, 0x374b0000, - 0x374c0000, 0x374d0000, 0x374e0000, 0x374f0000, 0x37500000, 0x37510000, - 0x37520000, 0x37530000, 0x37540000, 0x37550000, 0x37560000, 0x37570000, - 0x37580000, 0x37590000, 0x375a0000, 0x375b0000, 0x375c0000, 0x375d0000, - 0x375e0000, 0x375f0000, 0x37600000, 0x37610000, 0x37620000, 0x37630000, - 0x37640000, 0x37650000, 0x37660000, 0x37670000, 0x37680000, 0x37690000, - 0x376a0000, 0x376b0000, 0x376c0000, 0x376d0000, 0x376e0000, 0x376f0000, - 0x37700000, 0x37710000, 0x37720000, 0x37730000, 0x37740000, 0x37750000, - 0x37760000, 0x37770000, 0x37780000, 0x37790000, 0x377a0000, 0x377b0000, - 0x377c0000, 0x377d0000, 0x377e0000, 0x377f0000, 0x37800000, 0x37808000, - 0x37810000, 0x37818000, 0x37820000, 0x37828000, 0x37830000, 0x37838000, - 0x37840000, 0x37848000, 0x37850000, 0x37858000, 0x37860000, 0x37868000, - 0x37870000, 0x37878000, 0x37880000, 0x37888000, 0x37890000, 0x37898000, - 0x378a0000, 0x378a8000, 0x378b0000, 0x378b8000, 0x378c0000, 0x378c8000, - 0x378d0000, 0x378d8000, 0x378e0000, 0x378e8000, 0x378f0000, 0x378f8000, - 0x37900000, 0x37908000, 0x37910000, 0x37918000, 0x37920000, 0x37928000, - 0x37930000, 0x37938000, 0x37940000, 0x37948000, 0x37950000, 0x37958000, - 0x37960000, 0x37968000, 0x37970000, 0x37978000, 0x37980000, 0x37988000, - 0x37990000, 0x37998000, 0x379a0000, 0x379a8000, 0x379b0000, 0x379b8000, - 0x379c0000, 0x379c8000, 0x379d0000, 0x379d8000, 0x379e0000, 0x379e8000, - 0x379f0000, 0x379f8000, 0x37a00000, 0x37a08000, 0x37a10000, 0x37a18000, - 0x37a20000, 0x37a28000, 0x37a30000, 0x37a38000, 0x37a40000, 0x37a48000, - 0x37a50000, 0x37a58000, 0x37a60000, 0x37a68000, 0x37a70000, 0x37a78000, - 0x37a80000, 0x37a88000, 0x37a90000, 0x37a98000, 0x37aa0000, 0x37aa8000, - 0x37ab0000, 0x37ab8000, 0x37ac0000, 0x37ac8000, 0x37ad0000, 0x37ad8000, - 0x37ae0000, 0x37ae8000, 0x37af0000, 0x37af8000, 0x37b00000, 0x37b08000, - 0x37b10000, 0x37b18000, 0x37b20000, 0x37b28000, 0x37b30000, 0x37b38000, - 0x37b40000, 0x37b48000, 0x37b50000, 0x37b58000, 0x37b60000, 0x37b68000, - 0x37b70000, 0x37b78000, 0x37b80000, 0x37b88000, 0x37b90000, 0x37b98000, - 0x37ba0000, 0x37ba8000, 0x37bb0000, 0x37bb8000, 0x37bc0000, 0x37bc8000, - 0x37bd0000, 0x37bd8000, 0x37be0000, 0x37be8000, 0x37bf0000, 0x37bf8000, - 0x37c00000, 0x37c08000, 0x37c10000, 0x37c18000, 0x37c20000, 0x37c28000, - 0x37c30000, 0x37c38000, 0x37c40000, 0x37c48000, 0x37c50000, 0x37c58000, - 0x37c60000, 0x37c68000, 0x37c70000, 0x37c78000, 0x37c80000, 0x37c88000, - 0x37c90000, 0x37c98000, 0x37ca0000, 0x37ca8000, 0x37cb0000, 0x37cb8000, - 0x37cc0000, 0x37cc8000, 0x37cd0000, 0x37cd8000, 0x37ce0000, 0x37ce8000, - 0x37cf0000, 0x37cf8000, 0x37d00000, 0x37d08000, 0x37d10000, 0x37d18000, - 0x37d20000, 0x37d28000, 0x37d30000, 0x37d38000, 0x37d40000, 0x37d48000, - 0x37d50000, 0x37d58000, 0x37d60000, 0x37d68000, 0x37d70000, 0x37d78000, - 0x37d80000, 0x37d88000, 0x37d90000, 
0x37d98000, 0x37da0000, 0x37da8000, - 0x37db0000, 0x37db8000, 0x37dc0000, 0x37dc8000, 0x37dd0000, 0x37dd8000, - 0x37de0000, 0x37de8000, 0x37df0000, 0x37df8000, 0x37e00000, 0x37e08000, - 0x37e10000, 0x37e18000, 0x37e20000, 0x37e28000, 0x37e30000, 0x37e38000, - 0x37e40000, 0x37e48000, 0x37e50000, 0x37e58000, 0x37e60000, 0x37e68000, - 0x37e70000, 0x37e78000, 0x37e80000, 0x37e88000, 0x37e90000, 0x37e98000, - 0x37ea0000, 0x37ea8000, 0x37eb0000, 0x37eb8000, 0x37ec0000, 0x37ec8000, - 0x37ed0000, 0x37ed8000, 0x37ee0000, 0x37ee8000, 0x37ef0000, 0x37ef8000, - 0x37f00000, 0x37f08000, 0x37f10000, 0x37f18000, 0x37f20000, 0x37f28000, - 0x37f30000, 0x37f38000, 0x37f40000, 0x37f48000, 0x37f50000, 0x37f58000, - 0x37f60000, 0x37f68000, 0x37f70000, 0x37f78000, 0x37f80000, 0x37f88000, - 0x37f90000, 0x37f98000, 0x37fa0000, 0x37fa8000, 0x37fb0000, 0x37fb8000, - 0x37fc0000, 0x37fc8000, 0x37fd0000, 0x37fd8000, 0x37fe0000, 0x37fe8000, - 0x37ff0000, 0x37ff8000, 0x38000000, 0x38004000, 0x38008000, 0x3800c000, - 0x38010000, 0x38014000, 0x38018000, 0x3801c000, 0x38020000, 0x38024000, - 0x38028000, 0x3802c000, 0x38030000, 0x38034000, 0x38038000, 0x3803c000, - 0x38040000, 0x38044000, 0x38048000, 0x3804c000, 0x38050000, 0x38054000, - 0x38058000, 0x3805c000, 0x38060000, 0x38064000, 0x38068000, 0x3806c000, - 0x38070000, 0x38074000, 0x38078000, 0x3807c000, 0x38080000, 0x38084000, - 0x38088000, 0x3808c000, 0x38090000, 0x38094000, 0x38098000, 0x3809c000, - 0x380a0000, 0x380a4000, 0x380a8000, 0x380ac000, 0x380b0000, 0x380b4000, - 0x380b8000, 0x380bc000, 0x380c0000, 0x380c4000, 0x380c8000, 0x380cc000, - 0x380d0000, 0x380d4000, 0x380d8000, 0x380dc000, 0x380e0000, 0x380e4000, - 0x380e8000, 0x380ec000, 0x380f0000, 0x380f4000, 0x380f8000, 0x380fc000, - 0x38100000, 0x38104000, 0x38108000, 0x3810c000, 0x38110000, 0x38114000, - 0x38118000, 0x3811c000, 0x38120000, 0x38124000, 0x38128000, 0x3812c000, - 0x38130000, 0x38134000, 0x38138000, 0x3813c000, 0x38140000, 0x38144000, - 0x38148000, 0x3814c000, 0x38150000, 0x38154000, 0x38158000, 0x3815c000, - 0x38160000, 0x38164000, 0x38168000, 0x3816c000, 0x38170000, 0x38174000, - 0x38178000, 0x3817c000, 0x38180000, 0x38184000, 0x38188000, 0x3818c000, - 0x38190000, 0x38194000, 0x38198000, 0x3819c000, 0x381a0000, 0x381a4000, - 0x381a8000, 0x381ac000, 0x381b0000, 0x381b4000, 0x381b8000, 0x381bc000, - 0x381c0000, 0x381c4000, 0x381c8000, 0x381cc000, 0x381d0000, 0x381d4000, - 0x381d8000, 0x381dc000, 0x381e0000, 0x381e4000, 0x381e8000, 0x381ec000, - 0x381f0000, 0x381f4000, 0x381f8000, 0x381fc000, 0x38200000, 0x38204000, - 0x38208000, 0x3820c000, 0x38210000, 0x38214000, 0x38218000, 0x3821c000, - 0x38220000, 0x38224000, 0x38228000, 0x3822c000, 0x38230000, 0x38234000, - 0x38238000, 0x3823c000, 0x38240000, 0x38244000, 0x38248000, 0x3824c000, - 0x38250000, 0x38254000, 0x38258000, 0x3825c000, 0x38260000, 0x38264000, - 0x38268000, 0x3826c000, 0x38270000, 0x38274000, 0x38278000, 0x3827c000, - 0x38280000, 0x38284000, 0x38288000, 0x3828c000, 0x38290000, 0x38294000, - 0x38298000, 0x3829c000, 0x382a0000, 0x382a4000, 0x382a8000, 0x382ac000, - 0x382b0000, 0x382b4000, 0x382b8000, 0x382bc000, 0x382c0000, 0x382c4000, - 0x382c8000, 0x382cc000, 0x382d0000, 0x382d4000, 0x382d8000, 0x382dc000, - 0x382e0000, 0x382e4000, 0x382e8000, 0x382ec000, 0x382f0000, 0x382f4000, - 0x382f8000, 0x382fc000, 0x38300000, 0x38304000, 0x38308000, 0x3830c000, - 0x38310000, 0x38314000, 0x38318000, 0x3831c000, 0x38320000, 0x38324000, - 0x38328000, 0x3832c000, 0x38330000, 0x38334000, 0x38338000, 0x3833c000, - 0x38340000, 0x38344000, 0x38348000, 
0x3834c000, 0x38350000, 0x38354000, - 0x38358000, 0x3835c000, 0x38360000, 0x38364000, 0x38368000, 0x3836c000, - 0x38370000, 0x38374000, 0x38378000, 0x3837c000, 0x38380000, 0x38384000, - 0x38388000, 0x3838c000, 0x38390000, 0x38394000, 0x38398000, 0x3839c000, - 0x383a0000, 0x383a4000, 0x383a8000, 0x383ac000, 0x383b0000, 0x383b4000, - 0x383b8000, 0x383bc000, 0x383c0000, 0x383c4000, 0x383c8000, 0x383cc000, - 0x383d0000, 0x383d4000, 0x383d8000, 0x383dc000, 0x383e0000, 0x383e4000, - 0x383e8000, 0x383ec000, 0x383f0000, 0x383f4000, 0x383f8000, 0x383fc000, - 0x38400000, 0x38404000, 0x38408000, 0x3840c000, 0x38410000, 0x38414000, - 0x38418000, 0x3841c000, 0x38420000, 0x38424000, 0x38428000, 0x3842c000, - 0x38430000, 0x38434000, 0x38438000, 0x3843c000, 0x38440000, 0x38444000, - 0x38448000, 0x3844c000, 0x38450000, 0x38454000, 0x38458000, 0x3845c000, - 0x38460000, 0x38464000, 0x38468000, 0x3846c000, 0x38470000, 0x38474000, - 0x38478000, 0x3847c000, 0x38480000, 0x38484000, 0x38488000, 0x3848c000, - 0x38490000, 0x38494000, 0x38498000, 0x3849c000, 0x384a0000, 0x384a4000, - 0x384a8000, 0x384ac000, 0x384b0000, 0x384b4000, 0x384b8000, 0x384bc000, - 0x384c0000, 0x384c4000, 0x384c8000, 0x384cc000, 0x384d0000, 0x384d4000, - 0x384d8000, 0x384dc000, 0x384e0000, 0x384e4000, 0x384e8000, 0x384ec000, - 0x384f0000, 0x384f4000, 0x384f8000, 0x384fc000, 0x38500000, 0x38504000, - 0x38508000, 0x3850c000, 0x38510000, 0x38514000, 0x38518000, 0x3851c000, - 0x38520000, 0x38524000, 0x38528000, 0x3852c000, 0x38530000, 0x38534000, - 0x38538000, 0x3853c000, 0x38540000, 0x38544000, 0x38548000, 0x3854c000, - 0x38550000, 0x38554000, 0x38558000, 0x3855c000, 0x38560000, 0x38564000, - 0x38568000, 0x3856c000, 0x38570000, 0x38574000, 0x38578000, 0x3857c000, - 0x38580000, 0x38584000, 0x38588000, 0x3858c000, 0x38590000, 0x38594000, - 0x38598000, 0x3859c000, 0x385a0000, 0x385a4000, 0x385a8000, 0x385ac000, - 0x385b0000, 0x385b4000, 0x385b8000, 0x385bc000, 0x385c0000, 0x385c4000, - 0x385c8000, 0x385cc000, 0x385d0000, 0x385d4000, 0x385d8000, 0x385dc000, - 0x385e0000, 0x385e4000, 0x385e8000, 0x385ec000, 0x385f0000, 0x385f4000, - 0x385f8000, 0x385fc000, 0x38600000, 0x38604000, 0x38608000, 0x3860c000, - 0x38610000, 0x38614000, 0x38618000, 0x3861c000, 0x38620000, 0x38624000, - 0x38628000, 0x3862c000, 0x38630000, 0x38634000, 0x38638000, 0x3863c000, - 0x38640000, 0x38644000, 0x38648000, 0x3864c000, 0x38650000, 0x38654000, - 0x38658000, 0x3865c000, 0x38660000, 0x38664000, 0x38668000, 0x3866c000, - 0x38670000, 0x38674000, 0x38678000, 0x3867c000, 0x38680000, 0x38684000, - 0x38688000, 0x3868c000, 0x38690000, 0x38694000, 0x38698000, 0x3869c000, - 0x386a0000, 0x386a4000, 0x386a8000, 0x386ac000, 0x386b0000, 0x386b4000, - 0x386b8000, 0x386bc000, 0x386c0000, 0x386c4000, 0x386c8000, 0x386cc000, - 0x386d0000, 0x386d4000, 0x386d8000, 0x386dc000, 0x386e0000, 0x386e4000, - 0x386e8000, 0x386ec000, 0x386f0000, 0x386f4000, 0x386f8000, 0x386fc000, - 0x38700000, 0x38704000, 0x38708000, 0x3870c000, 0x38710000, 0x38714000, - 0x38718000, 0x3871c000, 0x38720000, 0x38724000, 0x38728000, 0x3872c000, - 0x38730000, 0x38734000, 0x38738000, 0x3873c000, 0x38740000, 0x38744000, - 0x38748000, 0x3874c000, 0x38750000, 0x38754000, 0x38758000, 0x3875c000, - 0x38760000, 0x38764000, 0x38768000, 0x3876c000, 0x38770000, 0x38774000, - 0x38778000, 0x3877c000, 0x38780000, 0x38784000, 0x38788000, 0x3878c000, - 0x38790000, 0x38794000, 0x38798000, 0x3879c000, 0x387a0000, 0x387a4000, - 0x387a8000, 0x387ac000, 0x387b0000, 0x387b4000, 0x387b8000, 0x387bc000, - 0x387c0000, 0x387c4000, 0x387c8000, 
0x387cc000, 0x387d0000, 0x387d4000, - 0x387d8000, 0x387dc000, 0x387e0000, 0x387e4000, 0x387e8000, 0x387ec000, - 0x387f0000, 0x387f4000, 0x387f8000, 0x387fc000, 0x38000000, 0x38002000, - 0x38004000, 0x38006000, 0x38008000, 0x3800a000, 0x3800c000, 0x3800e000, - 0x38010000, 0x38012000, 0x38014000, 0x38016000, 0x38018000, 0x3801a000, - 0x3801c000, 0x3801e000, 0x38020000, 0x38022000, 0x38024000, 0x38026000, - 0x38028000, 0x3802a000, 0x3802c000, 0x3802e000, 0x38030000, 0x38032000, - 0x38034000, 0x38036000, 0x38038000, 0x3803a000, 0x3803c000, 0x3803e000, - 0x38040000, 0x38042000, 0x38044000, 0x38046000, 0x38048000, 0x3804a000, - 0x3804c000, 0x3804e000, 0x38050000, 0x38052000, 0x38054000, 0x38056000, - 0x38058000, 0x3805a000, 0x3805c000, 0x3805e000, 0x38060000, 0x38062000, - 0x38064000, 0x38066000, 0x38068000, 0x3806a000, 0x3806c000, 0x3806e000, - 0x38070000, 0x38072000, 0x38074000, 0x38076000, 0x38078000, 0x3807a000, - 0x3807c000, 0x3807e000, 0x38080000, 0x38082000, 0x38084000, 0x38086000, - 0x38088000, 0x3808a000, 0x3808c000, 0x3808e000, 0x38090000, 0x38092000, - 0x38094000, 0x38096000, 0x38098000, 0x3809a000, 0x3809c000, 0x3809e000, - 0x380a0000, 0x380a2000, 0x380a4000, 0x380a6000, 0x380a8000, 0x380aa000, - 0x380ac000, 0x380ae000, 0x380b0000, 0x380b2000, 0x380b4000, 0x380b6000, - 0x380b8000, 0x380ba000, 0x380bc000, 0x380be000, 0x380c0000, 0x380c2000, - 0x380c4000, 0x380c6000, 0x380c8000, 0x380ca000, 0x380cc000, 0x380ce000, - 0x380d0000, 0x380d2000, 0x380d4000, 0x380d6000, 0x380d8000, 0x380da000, - 0x380dc000, 0x380de000, 0x380e0000, 0x380e2000, 0x380e4000, 0x380e6000, - 0x380e8000, 0x380ea000, 0x380ec000, 0x380ee000, 0x380f0000, 0x380f2000, - 0x380f4000, 0x380f6000, 0x380f8000, 0x380fa000, 0x380fc000, 0x380fe000, - 0x38100000, 0x38102000, 0x38104000, 0x38106000, 0x38108000, 0x3810a000, - 0x3810c000, 0x3810e000, 0x38110000, 0x38112000, 0x38114000, 0x38116000, - 0x38118000, 0x3811a000, 0x3811c000, 0x3811e000, 0x38120000, 0x38122000, - 0x38124000, 0x38126000, 0x38128000, 0x3812a000, 0x3812c000, 0x3812e000, - 0x38130000, 0x38132000, 0x38134000, 0x38136000, 0x38138000, 0x3813a000, - 0x3813c000, 0x3813e000, 0x38140000, 0x38142000, 0x38144000, 0x38146000, - 0x38148000, 0x3814a000, 0x3814c000, 0x3814e000, 0x38150000, 0x38152000, - 0x38154000, 0x38156000, 0x38158000, 0x3815a000, 0x3815c000, 0x3815e000, - 0x38160000, 0x38162000, 0x38164000, 0x38166000, 0x38168000, 0x3816a000, - 0x3816c000, 0x3816e000, 0x38170000, 0x38172000, 0x38174000, 0x38176000, - 0x38178000, 0x3817a000, 0x3817c000, 0x3817e000, 0x38180000, 0x38182000, - 0x38184000, 0x38186000, 0x38188000, 0x3818a000, 0x3818c000, 0x3818e000, - 0x38190000, 0x38192000, 0x38194000, 0x38196000, 0x38198000, 0x3819a000, - 0x3819c000, 0x3819e000, 0x381a0000, 0x381a2000, 0x381a4000, 0x381a6000, - 0x381a8000, 0x381aa000, 0x381ac000, 0x381ae000, 0x381b0000, 0x381b2000, - 0x381b4000, 0x381b6000, 0x381b8000, 0x381ba000, 0x381bc000, 0x381be000, - 0x381c0000, 0x381c2000, 0x381c4000, 0x381c6000, 0x381c8000, 0x381ca000, - 0x381cc000, 0x381ce000, 0x381d0000, 0x381d2000, 0x381d4000, 0x381d6000, - 0x381d8000, 0x381da000, 0x381dc000, 0x381de000, 0x381e0000, 0x381e2000, - 0x381e4000, 0x381e6000, 0x381e8000, 0x381ea000, 0x381ec000, 0x381ee000, - 0x381f0000, 0x381f2000, 0x381f4000, 0x381f6000, 0x381f8000, 0x381fa000, - 0x381fc000, 0x381fe000, 0x38200000, 0x38202000, 0x38204000, 0x38206000, - 0x38208000, 0x3820a000, 0x3820c000, 0x3820e000, 0x38210000, 0x38212000, - 0x38214000, 0x38216000, 0x38218000, 0x3821a000, 0x3821c000, 0x3821e000, - 0x38220000, 0x38222000, 0x38224000, 
0x38226000, 0x38228000, 0x3822a000, - 0x3822c000, 0x3822e000, 0x38230000, 0x38232000, 0x38234000, 0x38236000, - 0x38238000, 0x3823a000, 0x3823c000, 0x3823e000, 0x38240000, 0x38242000, - 0x38244000, 0x38246000, 0x38248000, 0x3824a000, 0x3824c000, 0x3824e000, - 0x38250000, 0x38252000, 0x38254000, 0x38256000, 0x38258000, 0x3825a000, - 0x3825c000, 0x3825e000, 0x38260000, 0x38262000, 0x38264000, 0x38266000, - 0x38268000, 0x3826a000, 0x3826c000, 0x3826e000, 0x38270000, 0x38272000, - 0x38274000, 0x38276000, 0x38278000, 0x3827a000, 0x3827c000, 0x3827e000, - 0x38280000, 0x38282000, 0x38284000, 0x38286000, 0x38288000, 0x3828a000, - 0x3828c000, 0x3828e000, 0x38290000, 0x38292000, 0x38294000, 0x38296000, - 0x38298000, 0x3829a000, 0x3829c000, 0x3829e000, 0x382a0000, 0x382a2000, - 0x382a4000, 0x382a6000, 0x382a8000, 0x382aa000, 0x382ac000, 0x382ae000, - 0x382b0000, 0x382b2000, 0x382b4000, 0x382b6000, 0x382b8000, 0x382ba000, - 0x382bc000, 0x382be000, 0x382c0000, 0x382c2000, 0x382c4000, 0x382c6000, - 0x382c8000, 0x382ca000, 0x382cc000, 0x382ce000, 0x382d0000, 0x382d2000, - 0x382d4000, 0x382d6000, 0x382d8000, 0x382da000, 0x382dc000, 0x382de000, - 0x382e0000, 0x382e2000, 0x382e4000, 0x382e6000, 0x382e8000, 0x382ea000, - 0x382ec000, 0x382ee000, 0x382f0000, 0x382f2000, 0x382f4000, 0x382f6000, - 0x382f8000, 0x382fa000, 0x382fc000, 0x382fe000, 0x38300000, 0x38302000, - 0x38304000, 0x38306000, 0x38308000, 0x3830a000, 0x3830c000, 0x3830e000, - 0x38310000, 0x38312000, 0x38314000, 0x38316000, 0x38318000, 0x3831a000, - 0x3831c000, 0x3831e000, 0x38320000, 0x38322000, 0x38324000, 0x38326000, - 0x38328000, 0x3832a000, 0x3832c000, 0x3832e000, 0x38330000, 0x38332000, - 0x38334000, 0x38336000, 0x38338000, 0x3833a000, 0x3833c000, 0x3833e000, - 0x38340000, 0x38342000, 0x38344000, 0x38346000, 0x38348000, 0x3834a000, - 0x3834c000, 0x3834e000, 0x38350000, 0x38352000, 0x38354000, 0x38356000, - 0x38358000, 0x3835a000, 0x3835c000, 0x3835e000, 0x38360000, 0x38362000, - 0x38364000, 0x38366000, 0x38368000, 0x3836a000, 0x3836c000, 0x3836e000, - 0x38370000, 0x38372000, 0x38374000, 0x38376000, 0x38378000, 0x3837a000, - 0x3837c000, 0x3837e000, 0x38380000, 0x38382000, 0x38384000, 0x38386000, - 0x38388000, 0x3838a000, 0x3838c000, 0x3838e000, 0x38390000, 0x38392000, - 0x38394000, 0x38396000, 0x38398000, 0x3839a000, 0x3839c000, 0x3839e000, - 0x383a0000, 0x383a2000, 0x383a4000, 0x383a6000, 0x383a8000, 0x383aa000, - 0x383ac000, 0x383ae000, 0x383b0000, 0x383b2000, 0x383b4000, 0x383b6000, - 0x383b8000, 0x383ba000, 0x383bc000, 0x383be000, 0x383c0000, 0x383c2000, - 0x383c4000, 0x383c6000, 0x383c8000, 0x383ca000, 0x383cc000, 0x383ce000, - 0x383d0000, 0x383d2000, 0x383d4000, 0x383d6000, 0x383d8000, 0x383da000, - 0x383dc000, 0x383de000, 0x383e0000, 0x383e2000, 0x383e4000, 0x383e6000, - 0x383e8000, 0x383ea000, 0x383ec000, 0x383ee000, 0x383f0000, 0x383f2000, - 0x383f4000, 0x383f6000, 0x383f8000, 0x383fa000, 0x383fc000, 0x383fe000, - 0x38400000, 0x38402000, 0x38404000, 0x38406000, 0x38408000, 0x3840a000, - 0x3840c000, 0x3840e000, 0x38410000, 0x38412000, 0x38414000, 0x38416000, - 0x38418000, 0x3841a000, 0x3841c000, 0x3841e000, 0x38420000, 0x38422000, - 0x38424000, 0x38426000, 0x38428000, 0x3842a000, 0x3842c000, 0x3842e000, - 0x38430000, 0x38432000, 0x38434000, 0x38436000, 0x38438000, 0x3843a000, - 0x3843c000, 0x3843e000, 0x38440000, 0x38442000, 0x38444000, 0x38446000, - 0x38448000, 0x3844a000, 0x3844c000, 0x3844e000, 0x38450000, 0x38452000, - 0x38454000, 0x38456000, 0x38458000, 0x3845a000, 0x3845c000, 0x3845e000, - 0x38460000, 0x38462000, 0x38464000, 
0x38466000, 0x38468000, 0x3846a000, - 0x3846c000, 0x3846e000, 0x38470000, 0x38472000, 0x38474000, 0x38476000, - 0x38478000, 0x3847a000, 0x3847c000, 0x3847e000, 0x38480000, 0x38482000, - 0x38484000, 0x38486000, 0x38488000, 0x3848a000, 0x3848c000, 0x3848e000, - 0x38490000, 0x38492000, 0x38494000, 0x38496000, 0x38498000, 0x3849a000, - 0x3849c000, 0x3849e000, 0x384a0000, 0x384a2000, 0x384a4000, 0x384a6000, - 0x384a8000, 0x384aa000, 0x384ac000, 0x384ae000, 0x384b0000, 0x384b2000, - 0x384b4000, 0x384b6000, 0x384b8000, 0x384ba000, 0x384bc000, 0x384be000, - 0x384c0000, 0x384c2000, 0x384c4000, 0x384c6000, 0x384c8000, 0x384ca000, - 0x384cc000, 0x384ce000, 0x384d0000, 0x384d2000, 0x384d4000, 0x384d6000, - 0x384d8000, 0x384da000, 0x384dc000, 0x384de000, 0x384e0000, 0x384e2000, - 0x384e4000, 0x384e6000, 0x384e8000, 0x384ea000, 0x384ec000, 0x384ee000, - 0x384f0000, 0x384f2000, 0x384f4000, 0x384f6000, 0x384f8000, 0x384fa000, - 0x384fc000, 0x384fe000, 0x38500000, 0x38502000, 0x38504000, 0x38506000, - 0x38508000, 0x3850a000, 0x3850c000, 0x3850e000, 0x38510000, 0x38512000, - 0x38514000, 0x38516000, 0x38518000, 0x3851a000, 0x3851c000, 0x3851e000, - 0x38520000, 0x38522000, 0x38524000, 0x38526000, 0x38528000, 0x3852a000, - 0x3852c000, 0x3852e000, 0x38530000, 0x38532000, 0x38534000, 0x38536000, - 0x38538000, 0x3853a000, 0x3853c000, 0x3853e000, 0x38540000, 0x38542000, - 0x38544000, 0x38546000, 0x38548000, 0x3854a000, 0x3854c000, 0x3854e000, - 0x38550000, 0x38552000, 0x38554000, 0x38556000, 0x38558000, 0x3855a000, - 0x3855c000, 0x3855e000, 0x38560000, 0x38562000, 0x38564000, 0x38566000, - 0x38568000, 0x3856a000, 0x3856c000, 0x3856e000, 0x38570000, 0x38572000, - 0x38574000, 0x38576000, 0x38578000, 0x3857a000, 0x3857c000, 0x3857e000, - 0x38580000, 0x38582000, 0x38584000, 0x38586000, 0x38588000, 0x3858a000, - 0x3858c000, 0x3858e000, 0x38590000, 0x38592000, 0x38594000, 0x38596000, - 0x38598000, 0x3859a000, 0x3859c000, 0x3859e000, 0x385a0000, 0x385a2000, - 0x385a4000, 0x385a6000, 0x385a8000, 0x385aa000, 0x385ac000, 0x385ae000, - 0x385b0000, 0x385b2000, 0x385b4000, 0x385b6000, 0x385b8000, 0x385ba000, - 0x385bc000, 0x385be000, 0x385c0000, 0x385c2000, 0x385c4000, 0x385c6000, - 0x385c8000, 0x385ca000, 0x385cc000, 0x385ce000, 0x385d0000, 0x385d2000, - 0x385d4000, 0x385d6000, 0x385d8000, 0x385da000, 0x385dc000, 0x385de000, - 0x385e0000, 0x385e2000, 0x385e4000, 0x385e6000, 0x385e8000, 0x385ea000, - 0x385ec000, 0x385ee000, 0x385f0000, 0x385f2000, 0x385f4000, 0x385f6000, - 0x385f8000, 0x385fa000, 0x385fc000, 0x385fe000, 0x38600000, 0x38602000, - 0x38604000, 0x38606000, 0x38608000, 0x3860a000, 0x3860c000, 0x3860e000, - 0x38610000, 0x38612000, 0x38614000, 0x38616000, 0x38618000, 0x3861a000, - 0x3861c000, 0x3861e000, 0x38620000, 0x38622000, 0x38624000, 0x38626000, - 0x38628000, 0x3862a000, 0x3862c000, 0x3862e000, 0x38630000, 0x38632000, - 0x38634000, 0x38636000, 0x38638000, 0x3863a000, 0x3863c000, 0x3863e000, - 0x38640000, 0x38642000, 0x38644000, 0x38646000, 0x38648000, 0x3864a000, - 0x3864c000, 0x3864e000, 0x38650000, 0x38652000, 0x38654000, 0x38656000, - 0x38658000, 0x3865a000, 0x3865c000, 0x3865e000, 0x38660000, 0x38662000, - 0x38664000, 0x38666000, 0x38668000, 0x3866a000, 0x3866c000, 0x3866e000, - 0x38670000, 0x38672000, 0x38674000, 0x38676000, 0x38678000, 0x3867a000, - 0x3867c000, 0x3867e000, 0x38680000, 0x38682000, 0x38684000, 0x38686000, - 0x38688000, 0x3868a000, 0x3868c000, 0x3868e000, 0x38690000, 0x38692000, - 0x38694000, 0x38696000, 0x38698000, 0x3869a000, 0x3869c000, 0x3869e000, - 0x386a0000, 0x386a2000, 0x386a4000, 
0x386a6000, 0x386a8000, 0x386aa000, - 0x386ac000, 0x386ae000, 0x386b0000, 0x386b2000, 0x386b4000, 0x386b6000, - 0x386b8000, 0x386ba000, 0x386bc000, 0x386be000, 0x386c0000, 0x386c2000, - 0x386c4000, 0x386c6000, 0x386c8000, 0x386ca000, 0x386cc000, 0x386ce000, - 0x386d0000, 0x386d2000, 0x386d4000, 0x386d6000, 0x386d8000, 0x386da000, - 0x386dc000, 0x386de000, 0x386e0000, 0x386e2000, 0x386e4000, 0x386e6000, - 0x386e8000, 0x386ea000, 0x386ec000, 0x386ee000, 0x386f0000, 0x386f2000, - 0x386f4000, 0x386f6000, 0x386f8000, 0x386fa000, 0x386fc000, 0x386fe000, - 0x38700000, 0x38702000, 0x38704000, 0x38706000, 0x38708000, 0x3870a000, - 0x3870c000, 0x3870e000, 0x38710000, 0x38712000, 0x38714000, 0x38716000, - 0x38718000, 0x3871a000, 0x3871c000, 0x3871e000, 0x38720000, 0x38722000, - 0x38724000, 0x38726000, 0x38728000, 0x3872a000, 0x3872c000, 0x3872e000, - 0x38730000, 0x38732000, 0x38734000, 0x38736000, 0x38738000, 0x3873a000, - 0x3873c000, 0x3873e000, 0x38740000, 0x38742000, 0x38744000, 0x38746000, - 0x38748000, 0x3874a000, 0x3874c000, 0x3874e000, 0x38750000, 0x38752000, - 0x38754000, 0x38756000, 0x38758000, 0x3875a000, 0x3875c000, 0x3875e000, - 0x38760000, 0x38762000, 0x38764000, 0x38766000, 0x38768000, 0x3876a000, - 0x3876c000, 0x3876e000, 0x38770000, 0x38772000, 0x38774000, 0x38776000, - 0x38778000, 0x3877a000, 0x3877c000, 0x3877e000, 0x38780000, 0x38782000, - 0x38784000, 0x38786000, 0x38788000, 0x3878a000, 0x3878c000, 0x3878e000, - 0x38790000, 0x38792000, 0x38794000, 0x38796000, 0x38798000, 0x3879a000, - 0x3879c000, 0x3879e000, 0x387a0000, 0x387a2000, 0x387a4000, 0x387a6000, - 0x387a8000, 0x387aa000, 0x387ac000, 0x387ae000, 0x387b0000, 0x387b2000, - 0x387b4000, 0x387b6000, 0x387b8000, 0x387ba000, 0x387bc000, 0x387be000, - 0x387c0000, 0x387c2000, 0x387c4000, 0x387c6000, 0x387c8000, 0x387ca000, - 0x387cc000, 0x387ce000, 0x387d0000, 0x387d2000, 0x387d4000, 0x387d6000, - 0x387d8000, 0x387da000, 0x387dc000, 0x387de000, 0x387e0000, 0x387e2000, - 0x387e4000, 0x387e6000, 0x387e8000, 0x387ea000, 0x387ec000, 0x387ee000, - 0x387f0000, 0x387f2000, 0x387f4000, 0x387f6000, 0x387f8000, 0x387fa000, - 0x387fc000, 0x387fe000}; - -static const uint16_t offsettable[64] = { - 0x0000, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, - 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, - 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, - 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, - 0x0000, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, - 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, - 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, - 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400}; - -static const uint32_t exponenttable[64] = { - 0x00000000, 0x00800000, 0x01000000, 0x01800000, 0x02000000, 0x02800000, - 0x03000000, 0x03800000, 0x04000000, 0x04800000, 0x05000000, 0x05800000, - 0x06000000, 0x06800000, 0x07000000, 0x07800000, 0x08000000, 0x08800000, - 0x09000000, 0x09800000, 0x0a000000, 0x0a800000, 0x0b000000, 0x0b800000, - 0x0c000000, 0x0c800000, 0x0d000000, 0x0d800000, 0x0e000000, 0x0e800000, - 0x0f000000, 0x47800000, 0x80000000, 0x80800000, 0x81000000, 0x81800000, - 0x82000000, 0x82800000, 0x83000000, 0x83800000, 0x84000000, 0x84800000, - 0x85000000, 0x85800000, 0x86000000, 0x86800000, 0x87000000, 0x87800000, - 0x88000000, 0x88800000, 0x89000000, 0x89800000, 0x8a000000, 0x8a800000, - 0x8b000000, 0x8b800000, 0x8c000000, 0x8c800000, 0x8d000000, 0x8d800000, - 0x8e000000, 0x8e800000, 0x8f000000, 0xc7800000}; - 
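The `mantissatable`/`offsettable`/`exponenttable` trio above (together with the `basetable`/`shifttable` pair below) implements the classic table-driven fp16/fp32 conversion: for a half-precision value `h`, the decoded float bit pattern is `mantissatable[offsettable[h >> 10] + (h & 0x3ff)] + exponenttable[h >> 10]`, which is exactly what the `half_to_float` helper at the end of this header computes. As a cross-check only (not part of this patch), here is a self-contained bit-by-bit reference decoder; it assumes an IEEE-754 `float` and treats the half as a plain `uint16_t`:

```cpp
#include <cstdint>
#include <cstring>

// Reference half -> float decode, handling zero, subnormal, normal and
// Inf/NaN explicitly; the lookup tables above fold all of these cases
// into three table loads and one add.
float half_to_float_reference(uint16_t h) {
  uint32_t sign = static_cast<uint32_t>(h & 0x8000u) << 16;
  uint32_t exp = (h >> 10) & 0x1fu;
  uint32_t man = h & 0x3ffu;
  uint32_t bits;
  if (exp == 0) {
    if (man == 0) {
      bits = sign;  // signed zero
    } else {
      // Subnormal half: renormalize the mantissa into 1.xxx form.
      uint32_t e = 127 - 15 + 1;
      while ((man & 0x400u) == 0) {
        man <<= 1;
        --e;
      }
      bits = sign | (e << 23) | ((man & 0x3ffu) << 13);
    }
  } else if (exp == 0x1fu) {
    bits = sign | 0x7f800000u | (man << 13);  // Inf or NaN
  } else {
    bits = sign | ((exp - 15 + 127) << 23) | (man << 13);  // normal number
  }
  float f;
  std::memcpy(&f, &bits, sizeof(f));  // well-defined type pun
  return f;
}
```

Looping `h` over all 65,536 values and comparing this decoder bit-for-bit against the table-driven `half_to_float` below is a cheap unit test for the tables.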
-static const uint16_t basetable[512] = { - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, - 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, - 0x0020, 0x0040, 0x0080, 0x0100, 0x0200, 0x0400, 0x0800, 0x0c00, 0x1000, - 0x1400, 0x1800, 0x1c00, 0x2000, 0x2400, 0x2800, 0x2c00, 0x3000, 0x3400, - 0x3800, 0x3c00, 0x4000, 0x4400, 0x4800, 0x4c00, 0x5000, 0x5400, 0x5800, - 0x5c00, 0x6000, 0x6400, 0x6800, 0x6c00, 0x7000, 0x7400, 0x7800, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, - 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, - 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, - 0x8002, 0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100, 0x8200, - 0x8400, 0x8800, 0x8c00, 0x9000, 0x9400, 0x9800, 0x9c00, 0xa000, 0xa400, - 0xa800, 0xac00, 0xb000, 0xb400, 0xb800, 0xbc00, 0xc000, 0xc400, 0xc800, - 0xcc00, 0xd000, 0xd400, 0xd800, 0xdc00, 0xe000, 0xe400, 0xe800, 0xec00, - 0xf000, 0xf400, 0xf800, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 
0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, - 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00}; - -static const uint8_t shifttable[512] = { - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, - 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, - 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, - 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x0d, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x17, - 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0d, 0x0d, - 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, - 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, - 0x0d, 0x0d, 0x0d, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 
- 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, - 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x0d}; - -inline float16 float_to_half(float f) { - uint32_t v = *reinterpret_cast<uint32_t *>(&f); - return basetable[(v >> 23) & 0x1ff] + - ((v & 0x007fffff) >> shifttable[(v >> 23) & 0x1ff]); -} - -inline float half_to_float(float16 h) { - uint32_t v = mantissatable[offsettable[h >> 10] + (h & 0x3ff)] + - exponenttable[h >> 10]; - return *reinterpret_cast<float *>(&v); -} - -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/fpga_cv.cpp b/lite/backends/fpga/KD/fpga_cv.cpp deleted file mode 100644 index 15a20e368b..0000000000 --- a/lite/backends/fpga/KD/fpga_cv.cpp +++ /dev/null @@ -1,78 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "lite/backends/fpga/KD/fpga_cv.hpp" - -using paddle::zynqmp::float16; - -void fpga_resize(float* input, - int input_width, - int input_height, - int input_channel, - uint8_t* output, - int output_width, - int output_height) { - paddle::zynqmp::InplaceArgs inplace_args = {0, 0, 0}; - paddle::zynqmp::config_inplace(inplace_args); - - paddle::zynqmp::ImageInputArgs input_args = {nullptr}; - input_args.address = nullptr; - input_args.scale_address = nullptr; - - float16* input_image_address = - reinterpret_cast<float16*>(paddle::zynqmp::fpga_malloc( - input_width * input_height * input_channel * sizeof(float16))); - int index = 0; - - for (int i = 0; i < input_width * input_height * input_channel; i++) { - input_image_address[i] = float16(1.0 * input[i]); - } - - paddle::zynqmp::ResizeArgs resize_args = {0}; - - resize_args.input_width = input_width; - resize_args.input_height = input_height; - resize_args.image_channel = input_channel; - resize_args.output_width = output_width; - resize_args.output_height = output_height; - float height_ratio = static_cast<float>(input_height) / - static_cast<float>(resize_args.output_height); - float width_ratio = static_cast<float>(input_width) / - static_cast<float>(resize_args.output_width); - resize_args.height_ratio = *reinterpret_cast<uint32_t*>(&height_ratio); - resize_args.width_ratio = *reinterpret_cast<uint32_t*>(&width_ratio); - - int output_size = - resize_args.output_width * resize_args.output_height * input_channel; - float16* fpga_output = reinterpret_cast<float16*>( - paddle::zynqmp::fpga_malloc(output_size * sizeof(float16))); - resize_args.input_image_address = input_image_address; - resize_args.output_image_address = fpga_output; - - memset(fpga_output, 0, output_size * sizeof(float16)); - paddle::zynqmp::fpga_flush( - input_image_address, - input_width * input_height * input_channel * sizeof(float16)); - paddle::zynqmp::fpga_flush(resize_args.output_image_address, - output_size * sizeof(float16)); - int ret = paddle::zynqmp::compute_fpga_resize(resize_args); - if (ret
== 0) { - paddle::zynqmp::fpga_invalidate(resize_args.output_image_address, - output_size * sizeof(float16)); - } - - for (int i = 0; i < output_size; i++) { - output[i] = fpga_output[i]; - } -} diff --git a/lite/backends/fpga/KD/fpga_cv.hpp b/lite/backends/fpga/KD/fpga_cv.hpp deleted file mode 100644 index 6aa52edfbb..0000000000 --- a/lite/backends/fpga/KD/fpga_cv.hpp +++ /dev/null @@ -1,28 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include <stdint.h> -#include "lite/backends/fpga/KD/float16.hpp" -#include "lite/backends/fpga/KD/llapi/zynqmp_api.h" -#include "lite/backends/fpga/KD/pe.hpp" - -void fpga_resize(float* input, - int input_width, - int input_height, - int input_channel, - uint8_t* output, - int output_width, - int output_height); diff --git a/lite/backends/fpga/KD/layout.hpp b/lite/backends/fpga/KD/layout.hpp deleted file mode 100644 index 74819cd212..0000000000 --- a/lite/backends/fpga/KD/layout.hpp +++ /dev/null @@ -1,99 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License.
*/ - -#pragma once - -#include <vector> - -#include "lite/backends/fpga/KD/alignment.h" - -namespace paddle { -namespace zynqmp { - -enum LayoutType { - N, - NC, - NCHW, - NHWC, - NHW, -}; - -class Layout { - public: - virtual int numIndex() = 0; - virtual int channelIndex() { return -1; } - virtual int heightIndex() { return -1; } - virtual int widthIndex() { return -1; } - virtual int alignedElementCount(const std::vector<int>& dims) = 0; - virtual int elementCount(const std::vector<int>& dims) = 0; -}; - -struct NCHW : Layout { - int numIndex() { return 0; } - int channelIndex() { return 1; } - int heightIndex() { return 2; } - int widthIndex() { return 3; } - int alignedElementCount(const std::vector<int>& dims) { - return dims[0] * dims[2] * align_image(dims[1] * dims[3]); - } - virtual int elementCount(const std::vector<int>& dims) { - return dims[0] * dims[1] * dims[2] * dims[3]; - } -}; - -struct NHWC : Layout { - int numIndex() { return 0; } - int heightIndex() { return 1; } - int widthIndex() { return 2; } - int channelIndex() { return 3; } - int alignedElementCount(const std::vector<int>& dims) { - return dims[0] * dims[1] * align_image(dims[2] * dims[3]); - } - virtual int elementCount(const std::vector<int>& dims) { - return dims[0] * dims[1] * dims[2] * dims[3]; - } -}; - -struct NC : Layout { - int numIndex() { return 0; } - int channelIndex() { return 1; } - int alignedElementCount(const std::vector<int>& dims) { - return dims[0] * dims[1]; - } - virtual int elementCount(const std::vector<int>& dims) { - return dims[0] * dims[1]; - } -}; - -struct N : Layout { - int numIndex() { return 0; } - int alignedElementCount(const std::vector<int>& dims) { return dims[0]; } - virtual int elementCount(const std::vector<int>& dims) { return dims[0]; } -}; - -struct NHW : Layout { - int numIndex() { return 0; } - int heightIndex() { return 1; } - int widthIndex() { return 2; } - int alignedElementCount(const std::vector<int>& dims) { - // TODO(chonwhite) align it; - return dims[0] * dims[1] * dims[2]; - } - virtual int elementCount(const std::vector<int>& dims) { - return dims[0] * dims[1] * dims[2]; - } -}; - -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/llapi/bias_scale.cpp b/lite/backends/fpga/KD/llapi/bias_scale.cpp deleted file mode 100644 index cd60f27f98..0000000000 --- a/lite/backends/fpga/KD/llapi/bias_scale.cpp +++ /dev/null @@ -1,102 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License.
*/ - -#include <string.h> - -#include "lite/backends/fpga/KD/llapi/bias_scale.h" -#include "lite/backends/fpga/KD/llapi/zynqmp_api.h" - -namespace paddle { -namespace zynqmp { -namespace bias_scale { - -void align_element(float **data_in, int num_per_div_before_alignment, int num) { - int copynum = 0; - float *ptr_unaligned = *data_in; - int div_num = - (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; - int num_per_div_after_alignment = - align_to_x(num_per_div_before_alignment, BS_NUM_ALIGNMENT); - int num_element = - 2 * div_num * num_per_div_after_alignment; // including bias & scale - float *ptr_aligned = - (float *)fpga_malloc(num_element * sizeof(float)); // NOLINT - - memset(ptr_aligned, 0, num_element * sizeof(float)); - for (int i = 0; i < div_num; i++) { - if (i == div_num - 1) { - copynum = (num_per_div_after_alignment * div_num > num) - ? (num % num_per_div_after_alignment) - : (num_per_div_before_alignment); - } else { - copynum = num_per_div_before_alignment; - } - - memcpy(ptr_aligned + i * num_per_div_after_alignment, - ptr_unaligned + num_per_div_before_alignment * i, - copynum * sizeof(float)); - memcpy(ptr_aligned + (div_num + i) * num_per_div_after_alignment, - ptr_unaligned + num_per_div_before_alignment * i + num, - copynum * sizeof(float)); - } - fpga_free(ptr_unaligned); - *data_in = ptr_aligned; -} - -void interleave(float **data_in, int num_after_alignment) { - float *ptr_uninterleaved = *data_in; - float *ptr_interleaved = - (float *)fpga_malloc(2 * num_after_alignment * sizeof(float)); // NOLINT - int num = num_after_alignment / 4; - for (int i = 0; i < num; i++) { - memcpy( - ptr_interleaved + 8 * i, ptr_uninterleaved + 4 * i, 4 * sizeof(float)); - memcpy(ptr_interleaved + 8 * i + 4, - ptr_uninterleaved + num_after_alignment + 4 * i, - 4 * sizeof(float)); - } - - fpga_free(ptr_uninterleaved); - *data_in = ptr_interleaved; -} - -void format_bias_scale_array(float **bias_scale_array, - int element_num_per_division, - int num) { - align_element(bias_scale_array, element_num_per_division, num); - int div_num = (num + element_num_per_division - 1) / element_num_per_division; - int element_num_after_division = - align_to_x(element_num_per_division, BS_NUM_ALIGNMENT); - interleave(bias_scale_array, div_num * element_num_after_division); - fpga_flush(*bias_scale_array, 2 * element_num_after_division * sizeof(float)); -} -void format_bias_array(float **bias_array, int num) { - float *ptr_unaligned = *bias_array; - int num_before_align = num; - int num_after_align = align_to_x(num_before_align, BIAS_NUM_ALIGNMENT); - int16_t *ptr_aligned = - (int16_t *)fpga_malloc(num_after_align * sizeof(int16_t)); // NOLINT - - memset(ptr_aligned, 0, num_after_align * sizeof(int16_t)); - for (int i = 0; i < num_before_align; i++) { - ptr_aligned[i] = fp32_2_fp16(ptr_unaligned[i]); - } - *bias_array = (float *)ptr_aligned; // NOLINT - fpga_free(ptr_unaligned); -} - -} // namespace bias_scale -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/llapi/bias_scale.h b/lite/backends/fpga/KD/llapi/bias_scale.h deleted file mode 100644 index 83f30df18f..0000000000 --- a/lite/backends/fpga/KD/llapi/bias_scale.h +++ /dev/null @@ -1,30 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License.
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -namespace paddle { -namespace zynqmp { -namespace bias_scale { - -void align_element(float** data_in, int num_per_div_before_alignment, int num); -void interleave(float** data_in, int num_after_alignment); -void format_bias_scale_array(float** bias_scale_array, - int element_num_per_division, - int num); -void format_bias_array(float** bias_array, int num); - -} // namespace bias_scale -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/llapi/config.h b/lite/backends/fpga/KD/llapi/config.h deleted file mode 100755 index acf8c8adf4..0000000000 --- a/lite/backends/fpga/KD/llapi/config.h +++ /dev/null @@ -1,19 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#define PADDLE_LITE_ZU5 -#define FPGA_PRINT_MODE -#define PADDLE_LITE_PROFILE diff --git a/lite/backends/fpga/KD/llapi/filter.cpp b/lite/backends/fpga/KD/llapi/filter.cpp deleted file mode 100644 index 0e41a204a8..0000000000 --- a/lite/backends/fpga/KD/llapi/filter.cpp +++ /dev/null @@ -1,317 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "lite/backends/fpga/KD/llapi/filter.h" -#include <algorithm> -#include <string.h> -#include "lite/backends/fpga/KD/float16.hpp" -#include "lite/backends/fpga/KD/llapi/zynqmp_api.h" - -namespace paddle { -namespace zynqmp { -namespace filter { - -static int FILTER_SIZE = 2048; - -void set_filter_capacity(uint32_t cap) { FILTER_SIZE = cap; } - -int calc_division_capacity(int chw) { - int n = FILTER_SIZE / ((chw + 15) / 16) * 32; - return n < FILTER_SIZE ?
n : FILTER_SIZE; -} - -int calc_split_num(int num, int division_capacity) { - return (num + division_capacity - 1) / division_capacity; -} - -int calc_division_number(int num, int group_num, int division_capacity) { - int split_num = calc_split_num(num, division_capacity); - return group_num * split_num; -} - -int calc_num_per_div(int num, int group_num, int division_capacity) { - if (group_num == 1) { - if (num > division_capacity) { - return division_capacity; - } else { - return num; - } - } else { - return (num + group_num - 1) / group_num; - } -} - -void convert_to_hwc( - char **data_in, int num, int channel, int height, int width) { - char *tmp = *data_in; - int chw = channel * height * width; - char *data_tmp = (char *)fpga_malloc(chw * num * sizeof(char)); // NOLINT - for (int n = 0; n < num; n++) { - int64_t amount_per_row = width * channel; - for (int c = 0; c < channel; c++) { - for (int h = 0; h < height; h++) { - int64_t offset_height = h * amount_per_row; - for (int w = 0; w < width; w++) { - *(data_tmp + n * chw + offset_height + w * channel + c) = - *((*data_in)++); - } - } - } - } - *data_in = data_tmp; - fpga_free(tmp); -} - -float find_max(float *data_in, int data_size) { - float max = 0.0; - for (int i = 0; i < data_size; ++i) { - float value = data_in[i]; - float abs = value > 0 ? value : -value; - max = std::max(max, abs); - } - return max; -} - -signed char float_to_int8(float fdata) { - if (fdata < 0.0) { - fdata -= 0.5; - } else { - fdata += 0.5; - } - return (signed char)fdata; -} - -void quantize(float **data_in, int data_size, float max) { - float *tmp = *data_in; - float fix_range = 127; - float scale = fix_range / max; - - signed char *tmp_data = (signed char *)fpga_malloc(data_size * sizeof(char)); - for (int i = 0; i < data_size; i++) { - tmp_data[i] = float_to_int8( - (*data_in)[i] * scale); // (signed char)((*data_in)[i] * scale); - } - *data_in = (float *)tmp_data; // NOLINT - fpga_free(tmp); -} - -void align_element(char **data_in, int num, int chw) { - int j = 0; - int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); - if (align_chw != chw) { - char *tmp = *data_in; - char *data_tmp = - (char *)fpga_malloc(num * align_chw * sizeof(char)); // NOLINT - - memset(data_tmp, 0, num * align_chw); - for (j = 0; j < num; j++) { - memcpy(data_tmp + j * align_chw, (*data_in) + j * chw, chw); - } - *data_in = data_tmp; - fpga_free(tmp); - } -} - -void align_num(char **data_in, - int num_per_div_before_alignment, - int num, - int chw) { - int i = 0; - int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); - int num_per_div_after_alignment = - align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT); - - char *tmp = *data_in; - int div_num = - (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; - int num_element = div_num * num_per_div_after_alignment * align_chw; - char *data_tmp = (char *)fpga_malloc(num_element * sizeof(char)); // NOLINT - - memset(data_tmp, 0, num_element * sizeof(char)); - - for (i = 0; i < div_num - 1; i++) { - memcpy(data_tmp + num_per_div_after_alignment * align_chw * i, - *data_in + num_per_div_before_alignment * align_chw * i, - num_per_div_before_alignment * align_chw); - } - - memcpy(data_tmp + num_per_div_after_alignment * align_chw * i, - *data_in + num_per_div_before_alignment * align_chw * i, - (num - (div_num - 1) * num_per_div_before_alignment) * align_chw); - - *data_in = data_tmp; - fpga_free(tmp); -} - -void reorder(char **data_in, int num_after_alignment, int chw) { - int index = 0; - int 
new_index = 0; - - int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); - - char *data_tmp = - (char *)fpga_malloc(chw_align * num_after_alignment * // NOLINT - sizeof(char)); - char *tmp = *data_in; - for (index = 0; index < num_after_alignment; index++) { - new_index = index / 32 * 32 + (index % 16 / 4 * 8) + (index % 16 % 4) + - (index / 16 % 2 * 4); - memcpy(data_tmp + index * chw_align, - *data_in + new_index * chw_align, - chw_align); - } - *data_in = data_tmp; - fpga_free(tmp); -} - -size_t interleave(char **data_in, int num_after_alignment, int chw) { - int i = 0; - int j = 0; - int k = 0; - int interleave_per_num = 16; - - int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); - char *data_tmp = - (char *)fpga_malloc(chw_align * num_after_alignment * // NOLINT - sizeof(char)); - char *tmp = *data_in; - int interleave_num = chw_align * 2 / interleave_per_num; - for (i = 0; i < num_after_alignment; i += 2) { - for (j = 0, k = 0; j < interleave_num; j += 2, k++) { - memcpy(data_tmp + i * chw_align + interleave_per_num * j, - *data_in + i * chw_align + interleave_per_num * k, - interleave_per_num); - memcpy(data_tmp + i * chw_align + interleave_per_num * (j + 1), - *data_in + (i + 1) * chw_align + interleave_per_num * k, - interleave_per_num); - } - } - *data_in = data_tmp; - fpga_free(tmp); - return chw_align * num_after_alignment; -} - -size_t format_filter(float **data_in, - int num, - int channel, - int height, - int width, - int group_num, - float max) { - int data_size = channel * height * width * num; - int chw = channel * height * width; - - int division_capacity = calc_division_capacity(chw); - int num_per_div_before_alignment = - calc_num_per_div(num, group_num, division_capacity); - int num_per_div_after_alignment = - align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT); - int div_num = - (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; - int residual = num % num_per_div_before_alignment; - int num_after_alignment = num_per_div_after_alignment * - ((residual == 0) ? 
div_num : (div_num - 1)) + - align_to_x(residual, FILTER_NUM_ALIGNMENT); - quantize(data_in, data_size, max); - char **quantize_data = (char **)data_in; // NOLINT - convert_to_hwc(quantize_data, num, channel, height, width); - align_element(quantize_data, num, chw); - if (num_after_alignment != num) { - align_num(quantize_data, num_per_div_before_alignment, num, chw); - } - - reorder(quantize_data, num_after_alignment, chw); - size_t mem_size = interleave(quantize_data, num_after_alignment, chw); - fpga_flush(*quantize_data, - align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) * num_after_alignment * - sizeof(char)); - return mem_size; -} - -void convert_to_hwn(int16_t **data_in, int num, int height, int width) { - int16_t *tmp = *data_in; - int16_t *data_tmp = - (int16_t *)fpga_malloc(height * width * num * sizeof(int16_t)); // NOLINT - for (int n = 0; n < num; n++) { - for (int h = 0; h < height; h++) { - for (int w = 0; w < width; w++) { - *(data_tmp + h * width * num + w * num + n) = *((*data_in)++); - } - } - } - *data_in = data_tmp; - fpga_free(tmp); -} - -size_t align_element_n(int16_t **data_in, int num, int height, int width) { - int unalign_n = num; - int align_n = align_to_x(num, FILTER_ELEMENT_ALIGNMENT); - int num_element = height * width * align_n; - if (unalign_n != align_n) { - int16_t *tmp = *data_in; - - int16_t *data_tmp = - (int16_t *)fpga_malloc(num_element * sizeof(int16_t)); // NOLINT - - memset(data_tmp, 0, num_element * sizeof(int16_t)); - for (int h = 0; h < height; h++) { - for (int w = 0; w < width; w++) { - int offset_unalign = h * width * unalign_n + w * unalign_n; - int offset_align = h * width * align_n + w * align_n; - for (int n = 0; n < unalign_n; n++) { - data_tmp[offset_align + n] = *((*data_in) + offset_unalign + n); - } - } - } - *data_in = data_tmp; - fpga_free(tmp); // the buffer came from fpga_malloc, so it must be released through fpga_free - } - return num_element * sizeof(int16_t); -} - -void quantize_to_fp16( - float **data_in, int num, int height, int width, float *scale_ptr) { - float *tmp = *data_in; - int size = num * height * width; - - float16 *tmp_data = (float16 *)fpga_malloc(size * sizeof(float16)); // NOLINT - for (int n = 0; n < num; n++) { - float scale_val = scale_ptr[n]; - for (int h = 0; h < height; h++) { - for (int w = 0; w < width; w++) { - int index = n * height * width + h * width + w; - float value = tmp[index] * scale_val; - tmp_data[index] = float_to_half(value); - } - } - } - fpga_flush(tmp_data, size * sizeof(int16_t)); - *data_in = (float *)tmp_data; // NOLINT - fpga_free(tmp); -} -size_t format_dwconv_filter( - float **data_in, int num, int height, int width, float *scale_ptr) { - quantize_to_fp16(data_in, num, height, width, scale_ptr); - int16_t **quantize_data = (int16_t **)data_in; // NOLINT - convert_to_hwn(quantize_data, num, height, width); - size_t size = align_element_n(quantize_data, num, height, width); - fpga_flush(*quantize_data, - align_to_x(num, FILTER_ELEMENT_ALIGNMENT) * height * width * - sizeof(int16_t)); - return size; -} -} // namespace filter
} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/llapi/filter.h b/lite/backends/fpga/KD/llapi/filter.h deleted file mode 100644 index 7d9c6c2e01..0000000000 --- a/lite/backends/fpga/KD/llapi/filter.h +++ /dev/null @@ -1,58 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License.
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include <stddef.h> -#include <stdint.h> - -namespace paddle { -namespace zynqmp { -namespace filter { - -void set_filter_capacity(uint32_t cap); -int calc_division_capacity(int chw); -int calc_split_num(int num, int division_capacity); -int calc_division_number(int num, int group_num, int division_capacity); -int calc_num_per_div(int num, int group_num, int division_capacity); -void convert_to_hwc( - char** data_in, int num, int channel, int height, int width); -float find_max(float* data_in, int data_size); -void quantize(float** data_in, int data_size, float max); -void align_element(char** data_in, int num, int chw); -void align_num(char** data_in, - int num_per_div_before_alignment, - int num, - int chw); -void reorder(char** data_in, int num_after_alignment, int chw); -size_t interleave(char** data_in, int num_after_alignment, int chw); -size_t format_filter(float** data_in, - int num, - int channel, - int height, - int width, - int group_num, - float max); - -void convert_to_hwn(int16_t** data_in, int num, int height, int width); -size_t align_element_n(int16_t** data_in, int num, int height, int width); -void quantize_to_fp16( - float** data_in, int num, int height, int width, float* scale_ptr); -size_t format_dwconv_filter( - float** data_in, int num, int height, int width, float* scale_ptr); - -} // namespace filter -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/llapi/zynqmp_api.cpp b/lite/backends/fpga/KD/llapi/zynqmp_api.cpp deleted file mode 100644 index 1f1226ead3..0000000000 --- a/lite/backends/fpga/KD/llapi/zynqmp_api.cpp +++ /dev/null @@ -1,327 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License.
*/ - -#include <fcntl.h> -#include <sys/ioctl.h> -#include <sys/mman.h> -#include <unistd.h> -#include <algorithm> -#include <cstdlib> -#include <cstring> -#include <iostream> -#include <map> -#include <utility> - -#include "lite/backends/fpga/KD/llapi/config.h" -#include "lite/backends/fpga/KD/llapi/zynqmp_api.h" - -namespace paddle { -namespace zynqmp { - -#define PADDLE_LITE_OS_LINUX - -static int fd = -1; -static const char *device_path = "/dev/fpgadrv0"; -static std::map<void *, size_t> memory_map; - -static size_t memory_size_max = 0; -static size_t memory_size = 0; - -static inline int do_ioctl(uint64_t req, const void *arg) { - int ret = -1; -#ifdef PADDLE_LITE_OS_LINUX - ret = ioctl(fd, req, arg); - if (ret != 0) { - throw -1; - } - return ret; -#else - return ret; -#endif -} - -int open_device() { - if (fd == -1) { - fd = open(device_path, O_RDWR); - } - return fd; -} - -void close_device() { close(fd); } - -void reset_device() { - FpgaResetArgs args; - do_ioctl(IOCTL_FPGA_RESET, &args); -} - -// memory management; -void *fpga_malloc(size_t size) { -#ifdef PADDLE_LITE_OS_LINUX - void *ptr = reinterpret_cast<void *>( - mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0)); - if (ptr == MAP_FAILED) { // mmap64 reports failure with MAP_FAILED, not NULL - std::cout << "not enough memory !"; - exit(-1); - } - memory_map.insert(std::make_pair(ptr, size)); - memory_size += size; - if (memory_size > memory_size_max) { - memory_size_max = memory_size; - } - return ptr; -#else - return malloc(size); -#endif -} - -size_t fpga_get_memory_size(void *ptr) { return memory_map[ptr]; } - -size_t fpga_get_memory_size_max() { return memory_size_max; } - -size_t fpga_diagnose_memory(int detailed) { - size_t total = 0; - auto iter = memory_map.begin(); // std::map<void *, size_t>::iterator - while (iter != memory_map.end()) { - total += iter->second; - iter++; - } - return total; -} - -void fpga_free(void *ptr) { - size_t size = 0; - auto iter = memory_map.find(ptr); // std::map<void *, size_t>::iterator - if (iter != memory_map.end()) { - size = iter->second; - memory_map.erase(iter); - } - - memory_size -= size; - -#ifdef PADDLE_LITE_OS_LINUX - - munmap(ptr, size); -#else - free(ptr); -#endif -} - -void fpga_copy(void *dst, const void *src, int size) { memcpy(dst, src, size); } - -int fpga_flush(void *address, size_t size) { - struct MemoryCacheArgs args; - args.address = address; - args.size = size; - return do_ioctl(IOCTL_MEMCACHE_FLUSH, &args); -} - -int fpga_invalidate(void *address, size_t size) { - struct MemoryCacheArgs args; - args.address = address; - args.size = size; - return do_ioctl(IOCTL_MEMCACHE_INVAL, &args); -} - -int invalidate_cache(void *addr, int size) { - struct MemoryCacheArgs args; - args.address = addr; - args.size = size; - return do_ioctl(IOCTL_MEMCACHE_INVAL, &args); -} - -int flush_cache(void *addr, int size) { - struct MemoryCacheArgs args; - args.address = addr; - args.size = size; - return do_ioctl(IOCTL_MEMCACHE_FLUSH, &args); -} - -void fpga_copy(void *dest, const void *src, size_t num) { - memcpy(dest, src, num); -} - -int ioctl_conv(const struct ConvArgs &args) { - return do_ioctl(IOCTL_CONFIG_CONV, &args); -} - -int compute_fpga_conv_basic(const struct ConvArgs &args) { - return do_ioctl(IOCTL_CONFIG_CONV, &args); -} - -int compute_fpga_conv(const struct SplitConvArgs &args) { - int split_num = args.split_num; - int ret = -1; - for (int i = 0; i < split_num; i++) { - ret = compute_fpga_conv_basic(args.conv_arg[i]); - } - - if (split_num > 1) { - std::cout << "Split num > 1 !!!!!!!!!!!!!!!!!!"
<< std::endl; - exit(-1); - } - return ret; -} - -int compute_fpga_pool(const struct PoolingArgs &args) { - return do_ioctl(IOCTL_CONFIG_POOLING, &args); -} - -int compute_fpga_ewadd(const struct EWAddArgs &args) { - return do_ioctl(IOCTL_CONFIG_EW, &args); -} - -int get_device_info(const struct DeviceInfo &args) { - int ret = do_ioctl(IOCTL_DEVICE_INFO, &args); - return ret; -} - -int perform_bypass(const struct BypassArgs &args) { - int size = args.image.channels * args.image.width * args.image.height; - int max_size = 1 << 21; - - float times = 1.0 * size / max_size; - int count = static_cast<int>(times); - - void *input_address = args.image.address; - int type_size = - args.input_data_type == DATA_TYPE_FP32 ? sizeof(float) : sizeof(int16_t); - - void *output_address = args.output.address; - int out_type_size = - args.output_data_type == DATA_TYPE_FP32 ? sizeof(float) : sizeof(int16_t); - - float scales[2]; - struct BypassArgs bypassArgs = args; - bypassArgs.image.width = 1; - bypassArgs.image.height = 1; - bypassArgs.output.scale_address = scales; - - float scale = 0; - for (int i = 0; i < count; ++i) { - bypassArgs.image.channels = max_size; - bypassArgs.image.address = - reinterpret_cast<char *>(input_address) + i * max_size * type_size; - bypassArgs.output.address = - reinterpret_cast<char *>(output_address) + i * max_size * out_type_size; - int ret = do_ioctl(IOCTL_CONFIG_BYPASS, &bypassArgs); - scale = std::max(scale, scales[0]); - - if (ret != 0) { - return ret; - } - } - - int remainder = size - max_size * count; - bypassArgs.image.channels = remainder; - bypassArgs.image.address = - reinterpret_cast<char *>(input_address) + count * max_size * type_size; - bypassArgs.output.address = - reinterpret_cast<char *>(output_address) + count * max_size * out_type_size; - int ret = do_ioctl(IOCTL_CONFIG_BYPASS, &bypassArgs); - scale = std::max(scale, scales[0]); - args.output.scale_address[0] = scale; - args.output.scale_address[1] = 1.0f / scale; - return ret; -} - -int compute_fpga_concat(const struct ConcatArgs &args) { return -1; } - -int compute_fpga_scale(const struct ScaleArgs &args) { -#ifdef ENABLE_DEBUG - std::cout << "======Compute Scale======"; - std::cout << "scale_address:" << args.scale_address << std::endl; - std::cout << "bias_address:" << args.bias_address << std::endl; - - std::cout << "wc_alignment:" << args.wc_alignment << std::endl; - std::cout << "channel_alignment:" << args.channel_alignment << std::endl; - - std::cout << " image_address:" << args.image.address - << " image_scale_address:" << args.image.scale_address - << " image_channels:" << args.image.channels - << " image_height:" << args.image.height - << " image_width:" << args.image.width - << " pad_height:" << args.image.pad_height - << " pad_width:" << args.image.pad_width; - - std::cout << " out_address:" << args.output.address - << " out_scale_address:" << args.output.scale_address; - -#endif - return do_ioctl(IOCTL_CONFIG_SCALE, &args); -} - -int compute_fpga_dwconv(const struct DWconvArgs &args) { -#ifdef ENABLE_DEBUG - std::cout << "======Compute Basic Conv======"; - std::cout << " relu_enabled:" << args.relu_enabled - << " filter_address:" << args.filter_address; - std::cout << " image_address:" << args.image.address - << " image_scale_address:" << args.image.scale_address - << " image_channels:" << args.image.channels - << " image_height:" << args.image.height - << " image_width:" << args.image.width - << " pad_height:" << args.image.pad_height - << " pad_width:" << args.image.pad_width; - std::cout << " kernel_height:" <<
args.kernel.height - << " kernel_width:" << args.kernel.width - << " stride_h:" << args.kernel.stride_h - << " stride_w:" << args.kernel.stride_w; - std::cout << " out_address:" << args.output.address - << " out_scale_address:" << args.output.scale_address; - -#endif - return do_ioctl(IOCTL_CONFIG_DWCONV, &args); -} - -int config_inplace(const struct InplaceArgs &args) { - return do_ioctl(IOCTL_CONFIG_INPLACE, &args); -} - -int config_norm_param(const struct NormalizeParameterArgs &args) { - return do_ioctl(IOCTL_CONFIG_NORMALIZE_PARAMETER, &args); -} - -int compute_norm(const struct NormalizeArgs &args) { - return do_ioctl(IOCTL_CONFIG_NORMALIZE, &args); -} - -int compute_fpga_resize(const struct ResizeArgs &args) { - return do_ioctl(IOCTL_CONFIG_RESIZE, &args); -} - -int16_t fp32_2_fp16(float fp32_num) { - uint32_t tmp = *(uint32_t *)(&fp32_num); // NOLINT; a 32-bit read matches the width of float - auto t = (int16_t)(((tmp & 0x007fffff) >> 13) | ((tmp & 0x80000000) >> 16) | - (((tmp & 0x7f800000) >> 13) - (112 << 10))); - if (tmp & 0x1000) { - t++; // roundoff - } - return t; -} - -float fp16_2_fp32(int16_t fp16_num) { - if (0 == fp16_num) { - return 0; - } - int frac = (fp16_num & 0x3ff); - int exp = ((fp16_num & 0x7c00) >> 10) + 112; - int s = fp16_num & 0x8000; - int tmp = 0; - float fp32_num = 0; - tmp = s << 16 | exp << 23 | frac << 13; - fp32_num = *(float *)&tmp; // NOLINT - return fp32_num; -} - -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/llapi/zynqmp_api.h b/lite/backends/fpga/KD/llapi/zynqmp_api.h deleted file mode 100644 index 7d22de95a2..0000000000 --- a/lite/backends/fpga/KD/llapi/zynqmp_api.h +++ /dev/null @@ -1,347 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License.
*/ - -#pragma once - -#include <stddef.h> -#include <stdint.h> -#include <stdlib.h> -#include <sys/ioctl.h> - -namespace paddle { -namespace zynqmp { - -typedef int16_t half; - -#define IMAGE_ALIGNMENT 16 // Aligned to 16 -#define FILTER_NUM_ALIGNMENT 32 // Filter number aligned to 32 -#define FILTER_ELEMENT_ALIGNMENT 16 // Filter element number aligned to 16 -#define BS_NUM_ALIGNMENT 8 -#define BIAS_NUM_ALIGNMENT 16 - -enum DDataType { - DATA_TYPE_FP32 = 1, - DATA_TYPE_FP16 = 0, -}; - -enum DLayoutType { - LAYOUT_CHW = 1, - LAYOUT_HWC = 0, -}; - -struct VersionArgs { - void* buffer; -}; - -struct DeviceInfo { - uint32_t filter_cap; - uint32_t version; - uint16_t device_type; - uint32_t reserved0; - uint32_t reserved1; - uint32_t reserved2; - uint32_t reserved3; - uint32_t reserved4; - uint32_t reserved5; - uint32_t reserved6; -}; - -struct MemoryCopyArgs { - void* src; - void* dest; - size_t size; -}; - -struct MemoryCacheArgs { - void* address; - size_t size; -}; - -struct MemoryBarrierArgs {}; - -struct BNArgs { - bool enabled; - void* bias_address; - void* scale_address; -}; - -/** -Conv and Pooling kernel -*/ -struct KernelArgs { - uint32_t width; - uint32_t height; - uint32_t stride_w; - uint32_t stride_h; -}; - -struct ImageInputArgs { - void* address; // input featuremap virtual address - void* scale_address; // input scale address; - uint32_t channels; - uint32_t width; // featuremap width - uint32_t height; - uint32_t pad_width; // padding width; - uint32_t pad_height; -}; - -struct ImageOutputArgs { - void* address; // output result address; - float* scale_address; // output scale address; -}; - -struct ConvArgs { - bool relu_enabled; - void* sb_address; // scale and bias are interlaced; - void* filter_address; - void* filter_scale_address; - uint32_t filter_num; - uint32_t group_num; - - struct KernelArgs kernel; - struct ImageInputArgs image; // input image; - struct ImageOutputArgs output; -}; - -struct DWconvArgs { - bool relu_enabled; - void* bias_address; - void* filter_address; - struct KernelArgs kernel; - struct ImageInputArgs image; - struct ImageOutputArgs output; - uint16_t out_width; - uint16_t out_height; - uint16_t sub_conv_num; -}; - -struct PoolingArgs { - uint16_t mode; - uint16_t kernel_reciprocal; - struct KernelArgs kernel; - struct ImageInputArgs image; // input image; - struct ImageOutputArgs output; - uint16_t out_width; - uint16_t out_height; -}; - -// elementwise add arguments -struct EWAddArgs { - bool relu_enabled; - - uint32_t const0; // output0 = const0 x input0 + const1 x input1; - uint32_t const1; - struct ImageInputArgs image0; - struct ImageInputArgs image1; - struct ImageOutputArgs output; -}; - -struct BypassArgs { - enum DDataType input_data_type; - enum DDataType output_data_type; - enum DLayoutType input_layout_type; - enum DLayoutType output_layout_type; - struct ImageInputArgs image; - struct ImageOutputArgs output; -}; - -struct ScaleArgs { - void* scale_address; - void* bias_address; - uint32_t wc_alignment; - uint32_t channel_alignment; - - struct ImageInputArgs image; - struct ImageOutputArgs output; -}; - -struct NormalizeArgs { - void* input_image_address; - void* output_image_address; - uint32_t image_width; - uint32_t image_height; - uint32_t image_channel; - uint32_t* output_scale_address; -}; - -struct ResizeArgs { - void* input_image_address; - void* output_image_address; - uint32_t input_width; - uint32_t input_height; - uint32_t image_channel; - uint32_t output_width; - uint32_t output_height; - uint32_t height_ratio; - uint32_t width_ratio; - uint32_t*
output_scale_address; -}; - -struct PowerParameterArgs { - uint16_t shift; - uint16_t scale; - uint16_t power; -}; - -struct NormalizeParameterArgs { - uint32_t channel; - uint32_t hight_width; -}; - -struct InplaceArgs { - bool leaky_relu_enable; - bool relu_enable; - bool power_enable; - bool normalize_enable; -}; - -struct FpgaRegWriteArgs { - uint64_t address; // - uint64_t value; -}; - -struct FpgaRegReadArgs { - uint64_t address; - uint64_t value; -}; - -struct FpgaResetArgs {}; - -#define IOCTL_FPGA_MAGIC (('F' + 'P' + 'G' + 'A') / 4) - -#define IOCTL_VERSION _IOW(IOCTL_FPGA_MAGIC, 01, struct VersionArgs) -#define IOCTL_DEVICE_INFO _IOW(IOCTL_FPGA_MAGIC, 100, struct DeviceInfo) - -#define IOCTL_SEPARATOR_0 10 - -#define IOCTL_MEM_COPY _IOW(IOCTL_FPGA_MAGIC, 11, struct MemoryCopyArgs) -#define IOCTL_MEMCACHE_INVAL _IOW(IOCTL_FPGA_MAGIC, 12, struct MemoryCacheArgs) -#define IOCTL_MEMCACHE_FLUSH _IOW(IOCTL_FPGA_MAGIC, 13, struct MemoryCacheArgs) -#define IOCTL_MEMORY_BARRIER \ - _IOW(IOCTL_FPGA_MAGIC, 14, struct MemoryBarrierArgs) - -#define IOCTL_SEPARATOR_1 20 - -#define IOCTL_CONFIG_CONV _IOW(IOCTL_FPGA_MAGIC, 21, struct ConvArgs) -#define IOCTL_CONFIG_POOLING _IOW(IOCTL_FPGA_MAGIC, 22, struct PoolingArgs) -#define IOCTL_CONFIG_EW _IOW(IOCTL_FPGA_MAGIC, 23, struct EWAddArgs) -#define IOCTL_CONFIG_BYPASS _IOW(IOCTL_FPGA_MAGIC, 24, struct BypassArgs) -#define IOCTL_CONFIG_SCALE _IOW(IOCTL_FPGA_MAGIC, 25, struct ScaleArgs) -#define IOCTL_CONFIG_NORMALIZE _IOW(IOCTL_FPGA_MAGIC, 26, struct NormalizeArgs) -#define IOCTL_CONFIG_RESIZE _IOW(IOCTL_FPGA_MAGIC, 30, struct ResizeArgs) - -#define IOCTL_CONFIG_DWCONV _IOW(IOCTL_FPGA_MAGIC, 31, struct DWconvArgs) - -#define IOCTL_CONFIG_INPLACE _IOW(IOCTL_FPGA_MAGIC, 40, struct InplaceArgs) -#define IOCTL_CONFIG_POWER_PARAMETER \ - _IOW(IOCTL_FPGA_MAGIC, 41, struct PowerParameterArgs) -#define IOCTL_CONFIG_NORMALIZE_PARAMETER \ - _IOW(IOCTL_FPGA_MAGIC, 42, struct NormalizeParameterArgs) -#define IOCTL_FPGA_REG_READ _IOW(IOCTL_FPGA_MAGIC, 50, struct FpgaRegReadArgs) -#define IOCTL_FPGA_REG_WRITE _IOW(IOCTL_FPGA_MAGIC, 51, struct FpgaRegWriteArgs) -#define IOCTL_FPGA_RESET _IOW(IOCTL_FPGA_MAGIC, 52, struct FpgaResetArgs) - -//============================== API ============================= - -struct DeconvArgs { - uint32_t sub_conv_num; - uint32_t group_num; - uint32_t filter_num; - uint32_t omit_size; - uint32_t sub_output_width; - uint32_t sub_output_height; - struct ImageOutputArgs output; - struct SplitConvArgs* split_conv_args; -}; - -struct SplitArgs { - uint32_t image_num; - int16_t* image_in; - float* scale_in; - void** images_out; - float** scales_out; - uint32_t* out_channel_nums; - uint32_t height; - uint32_t width; -}; - -struct ConcatArgs { - uint32_t image_num; - half** images_in; - float** scales_in; - void* image_out; - float* scale_out; - uint32_t* channel_num; - uint32_t height; - uint32_t width; -}; - -struct SplitConvArgs { - uint32_t split_num; - uint32_t group_num; - uint32_t filter_num; - struct ImageOutputArgs output; - struct ConvArgs* conv_arg; - struct ConcatArgs concat_arg; -}; - -struct GroupConvArgs { - uint32_t group_num; - uint32_t filter_num; - struct ImageOutputArgs output; - struct SplitConvArgs* conv_args; - struct ConcatArgs concat_arg; -}; - -inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; } -int open_device(); -void close_device(); -void reset_device(); - -void* fpga_malloc(size_t size); -void fpga_free(void* ptr); -size_t fpga_get_memory_size(void* ptr); -size_t 
fpga_get_memory_size_max(); -size_t fpga_diagnose_memory(int detailed); - -void fpga_copy(void* dst, const void* src, int size); - -int fpga_flush(void* address, size_t size); -int fpga_invalidate(void* address, size_t size); - -int get_device_info(const struct DeviceInfo& args); - -int perform_bypass(const struct BypassArgs& args); -int compute_fpga_conv_basic(const struct ConvArgs& args); -int compute_fpga_conv(const struct SplitConvArgs& args); -int compute_fpga_pool(const struct PoolingArgs& args); -int compute_fpga_ewadd(const struct EWAddArgs& args); -int compute_fpga_scale(const struct ScaleArgs& args); -int compute_fpga_concat(const struct ConcatArgs& args); -int compute_fpga_resize(const struct ResizeArgs& args); - -int config_power(const struct PowerArgs& args); -int compute_fpga_dwconv(const struct DWconvArgs& args); -int config_norm_param(const struct NormalizeParameterArgs& args); -int compute_norm(const struct NormalizeArgs& args); - -int config_inplace(const struct InplaceArgs& args); - -int flush_cache(void* addr, int size); -int invalidate_cache(void* addr, int size); - -int16_t fp32_2_fp16(float fp32_num); -float fp16_2_fp32(int16_t fp16_num); -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/pe.hpp b/lite/backends/fpga/KD/pe.hpp deleted file mode 100644 index d1dc3c4caa..0000000000 --- a/lite/backends/fpga/KD/pe.hpp +++ /dev/null @@ -1,37 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include <string> -#include <vector> -#include "lite/backends/fpga/KD/pe_params.hpp" -#include "lite/backends/fpga/KD/tensor_util.hpp" - -namespace paddle { -namespace zynqmp { - -class PE { - public: - virtual bool init() { return false; } - - virtual void apply() {} - - virtual bool dispatch() { return false; } - - virtual ~PE() {} -}; - -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/pe_params.hpp b/lite/backends/fpga/KD/pe_params.hpp deleted file mode 100644 index 709f04d399..0000000000 --- a/lite/backends/fpga/KD/pe_params.hpp +++ /dev/null @@ -1,233 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License.
*/ - -#pragma once - -#include -#include - -#include "lite/backends/fpga/KD/llapi/zynqmp_api.h" -#include "lite/backends/fpga/KD/tensor.hpp" - -namespace paddle { -namespace zynqmp { - -struct ReLUParam { - public: - bool enabled = false; -}; - -struct PEParam { - ReLUParam relu; -}; - -struct InputParam : PEParam { - public: - Tensor* input = nullptr; - Tensor* output = nullptr; -}; - -struct OutputParam : PEParam { - public: - Tensor* input = nullptr; - Tensor* output = nullptr; -}; - -struct BatchnormParam : PEParam { - public: - Tensor* input = nullptr; - Tensor* output = nullptr; - - Tensor* bias = nullptr; - Tensor* scale = nullptr; - Tensor* mean = nullptr; - Tensor* variance = nullptr; - float epsilon = 0; -}; - -struct BasicConvParam { - Tensor input; - Tensor output; - Tensor filter; - Tensor scaleBias; - ConvArgs args; -}; - -struct ConvParam : PEParam { - public: - Tensor* input = nullptr; - Tensor* output = nullptr; - Tensor* filter = nullptr; - - int groups = 1; - std::vector strides; - std::vector paddings; - std::vector kernelSize; - std::vector dilations; - - Tensor* scale() { return scale_; } - - Tensor* bias() { return bias_; } - - std::vector& splitParams() { return splitParams_; } - - protected: - std::vector splitParams_; - Tensor* scale_ = new Tensor(); - Tensor* bias_ = new Tensor(); -}; - -struct DepthwiseConvParam : ConvParam { - public: - Tensor* quantizedFilter() { return quantizedFilter_; } - - DWconvArgs args; - - protected: - Tensor* quantizedFilter_ = new Tensor(); -}; - -enum PoolingType : int { - MAX = 0, - AVERAGE = 1, -}; - -struct PoolingParam : PEParam { - public: - Tensor* input = nullptr; - Tensor* output = nullptr; - - PoolingType type = PoolingType::MAX; - bool globalPooling = false; - std::vector kernelSize; - std::vector strides; - std::vector paddings; - - PoolingArgs poolingArgs = {0}; -}; - -struct ConcatParam : PEParam { - public: - std::vector inputs; - Tensor* output; - int axis = 0; -}; - -struct ElementwiseAddParam : PEParam { - public: - std::vector inputs; - Tensor* output = nullptr; - int axis = 0; - - EWAddArgs ewargs; -}; - -struct FullyConnectedParam : PEParam { - public: - Tensor* input = nullptr; - Tensor* filter = nullptr; - Tensor* bias = nullptr; - Tensor* output = nullptr; - - Tensor* quantizedFilter() { return quantizedFilter_; } - - Tensor* biasScale() { return biasScale_; } - - protected: - Tensor* quantizedFilter_ = new Tensor(); - Tensor* biasScale_ = new Tensor(); -}; - -struct SoftmaxParam : PEParam { - public: - Tensor* input = nullptr; - - Tensor* output = nullptr; - - private: - Tensor* floatInput = nullptr; -}; - -struct SplitParam : PEParam { - public: - Tensor* input = nullptr; - std::vector outputs; - int axis = 1; - int num = 1; -}; - -struct NormParam : PEParam { - public: - Tensor* input = nullptr; - - Tensor* output = nullptr; - float epsilon = 0; - - private: - Tensor* floatInput = nullptr; -}; - -struct PriorBoxParam : PEParam { - Tensor* input; - Tensor* image; - Tensor* outputBoxes; - Tensor* outputVariances; - - std::vector minSizes; - std::vector maxSizes; - std::vector aspectRatios; - std::vector variances; - - bool minMaxAspectRatiosOrder; - bool flip; - bool clip; - float stepW; - float stepH; - float offset; -}; - -struct ScaleParam : PEParam { - public: - Tensor* input = nullptr; - Tensor* output = nullptr; - Tensor* scale = nullptr; - Tensor* bias = nullptr; - - Tensor* alignedScale() { return alignedScale_; } - - Tensor* alignedBias() { return alignedBias_; } - - ScaleArgs args = {0}; - - 
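// NOTE: the *Param structs in this header share one convention: the public
// Tensor* fields are wired up by the caller, while the trailing-underscore
// members (alignedScale_, alignedBias_, quantizedFilter_, ...) hold the
// hardware-formatted copies that the owning PE fills in during apply().
// A hedged usage sketch, assuming the ScalePE defined in scale_pe.hpp
// (tensor setup is illustrative, not from this file):
//
//   ScalePE pe;
//   ScaleParam& p = pe.param();
//   p.input = &in;      // FP16 feature map
//   p.output = &out;
//   p.scale = &scale;   // FP32, one value per channel
//   p.bias = &bias;
//   pe.init();
//   pe.apply();         // formats alignedScale_/alignedBias_ for the device
//   pe.dispatch();      // submits the prepared ScaleArgs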
protected: - Tensor* alignedScale_ = new Tensor(); - Tensor* alignedBias_ = new Tensor(); -}; - -struct ResizeParam : PEParam { - public: - Tensor* input = nullptr; - Tensor* output = nullptr; -}; - -struct CropParam : PEParam { - public: - Tensor* input = nullptr; - Tensor* output = nullptr; - int axis = 2; - std::vector offsets; - std::vector shape; -}; -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/pes/batchnorm_pe.hpp b/lite/backends/fpga/KD/pes/batchnorm_pe.hpp deleted file mode 100644 index a207875105..0000000000 --- a/lite/backends/fpga/KD/pes/batchnorm_pe.hpp +++ /dev/null @@ -1,105 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include - -#include "lite/backends/fpga/KD/pe.hpp" -#include "lite/backends/fpga/KD/pe_params.hpp" -#include "lite/backends/fpga/KD/pes/scale_pe.hpp" - -namespace paddle { -namespace zynqmp { -class BatchnormPE : public PE { - public: - bool init() { - Tensor* output = param_.output; - output->setAligned(true); - output->setDataLocation(Device); - - ScaleParam& scale_param = scalePE_.param(); - scale_param.input = param_.input; - scale_param.output = param_.output; - Tensor* scale = new Tensor(); - Tensor* bias = new Tensor(); - Shape shape(N, {output->shape().channel()}); - - auto mean_data = param_.mean->data(); - auto variance_data = param_.variance->data(); - auto scale_data = param_.scale->data(); - auto bias_data = param_.bias->data(); - auto new_scale_ptr = scale->mutableData(FP32, shape); - auto new_bias_ptr = bias->mutableData(FP32, shape); - - float epsilon = param_.epsilon; - - Shape& in_shape = param_.input->shape(); - bool match = in_shape.channel() == 128 && in_shape.height() == 128 && - in_shape.width() == 128; - - for (int c = 0; c < output->shape().channel(); c++) { - float var = variance_data[c]; - float inv_scale = 1.0 / (std::sqrt(var + epsilon)); - float scale_value = inv_scale * scale_data[c]; - float bias_value = bias_data[c] - scale_value * mean_data[c]; - new_scale_ptr[c] = scale_value; - new_bias_ptr[c] = bias_value; - } - - scale->flush(); - bias->flush(); - - scale_param.scale = scale; - scale_param.bias = bias; - scale_param.relu = param_.relu; - - scalePE_.init(); - - inplace_.relu_enable = param_.relu.enabled; - inplace_.relu_enable = true; - inplace_.power_enable = false; - inplace_.normalize_enable = false; - - return true; - } - - void apply() { scalePE_.apply(); } - - bool dispatch() { - if (inplace_.relu_enable) { - config_inplace(inplace_); - } - bool ret = scalePE_.dispatch(); - - inplace_.relu_enable = false; - config_inplace(inplace_); - return ret; - } - - BatchnormParam& param() { return param_; } - - ~BatchnormPE() { - scalePE_.param().input = nullptr; - scalePE_.param().output = nullptr; - } - - private: - BatchnormParam param_; - ScalePE scalePE_; - InplaceArgs inplace_; -}; -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/pes/concat_pe.hpp 
b/lite/backends/fpga/KD/pes/concat_pe.hpp deleted file mode 100644 index 72b480ab88..0000000000 --- a/lite/backends/fpga/KD/pes/concat_pe.hpp +++ /dev/null @@ -1,135 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include - -#include "lite/backends/fpga/KD/pe.hpp" -#include "lite/backends/fpga/KD/pe_params.hpp" - -namespace paddle { -namespace zynqmp { - -class ConcatPE : public PE { - public: - bool init() { - Tensor* output = param_.output; - output->setAligned(false); - output->setDataLocation(CPU); - return true; - } - - void apply() {} - - void concat2D() { - int offset = 0; - float16* out_data = param_.output->data(); - for (unsigned int n = 0; n < param_.inputs.size(); n++) { - Tensor* input = param_.inputs[n]; - Shape& input_shape = input->shape(); - - float16* src = input->data(); - memcpy(out_data + offset, src, input_shape.numel() * sizeof(float16)); - offset += input_shape.numel(); - } - Tensor* output = param_.output; - output->flush(); - } - - void concat3D() { - auto input = param_.inputs; - Tensor* output = param_.output; - int axis = param_.axis; - int num = input.size(); - int rows = 1; - auto dim_0 = input[0]->shape().dims(); - for (int i = 0; i < axis; ++i) { - rows *= dim_0[i]; - } - int out_rows = rows, out_cols = 0; - - std::vector input_cols(input.size()); - for (int i = 0; i < num; ++i) { - int t_cols = input[i]->shape().numel() / rows; - out_cols += t_cols; - input_cols[i] = t_cols; - } - - // computation - for (int k = 0; k < out_rows; ++k) { - float16* dst_ptr = output->data() + k * out_cols; - int col_idx = 0; - for (int j = 0; j < num; ++j) { - int col_len = input_cols[j]; - const float16* src_prt = input[j]->data() + k * col_len; - memcpy(dst_ptr + col_idx, src_prt, sizeof(float16) * col_len); - col_idx += col_len; - } - } - output->flush(); - } - - bool dispatch() { - Tensor* output = param_.output; - Shape& output_shape = output->shape(); - - float scale = 0; - for (unsigned int n = 0; n < param_.inputs.size(); n++) { - Tensor* input = param_.inputs[n]; - input->syncToCPU(); - input->unalignImage(); - scale = std::max(scale, input->scale()[0]); - } - output->scale()[0] = scale; - output->scale()[1] = 1.0f / scale; - - if (output_shape.dimSize() == 3) { - concat3D(); - return true; - } - - if (output_shape.dimSize() == 2) { - concat2D(); - return true; - } - - float16* out_data = param_.output->data(); - int channel_sum = 0; - int out_channel = output_shape.channel(); - for (unsigned int n = 0; n < param_.inputs.size(); n++) { - Tensor* input = param_.inputs[n]; - Shape& input_shape = input->shape(); - int wh = output_shape.width() * output_shape.height(); - for (int j = 0; j < wh; j++) { - float16* src = input->data() + j * input_shape.channel(); - memcpy(out_data + j * out_channel + channel_sum, - src, - input_shape.channel() * sizeof(float16)); - } - channel_sum += input_shape.channel(); - } - output->flush(); - return true; - } - - ConcatParam& param() { return param_; } - 
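  // NOTE: the channel-axis path in dispatch() above interleaves inputs per
  // pixel: out[j * out_channel + channel_sum + c] = in_n[j * c_n + c]. A
  // minimal standalone sketch of that layout for two HWC inputs; this helper
  // is illustrative, not part of the original class, and assumes memcpy is
  // in scope as it is for dispatch():
  static void concat_channels_hwc(const float16* a, int ca,
                                  const float16* b, int cb,
                                  float16* out, int pixels) {
    for (int j = 0; j < pixels; j++) {
      // copy a's channels, then b's, back to back for each pixel
      memcpy(out + j * (ca + cb), a + j * ca, ca * sizeof(float16));
      memcpy(out + j * (ca + cb) + ca, b + j * cb, cb * sizeof(float16));
    }
  }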
- private: - ConcatParam param_; -}; - -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/pes/conv_pe.hpp b/lite/backends/fpga/KD/pes/conv_pe.hpp deleted file mode 100644 index e897f82280..0000000000 --- a/lite/backends/fpga/KD/pes/conv_pe.hpp +++ /dev/null @@ -1,138 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include - -#include "lite/backends/fpga/KD/pe.hpp" -#include "lite/backends/fpga/KD/pe_params.hpp" -#include "lite/backends/fpga/KD/pes/concat_pe.hpp" -#include "lite/backends/fpga/KD/pes/conv_pe.hpp" -#include "lite/backends/fpga/KD/pes/conv_process.hpp" -#include "lite/backends/fpga/KD/pes/elementwise_add_pe.hpp" -#include "lite/backends/fpga/KD/pes/scale_pe.hpp" - -namespace paddle { -namespace zynqmp { - -class ConvPE : public PE { - public: - bool init() { - Tensor* output = param_.output; - output->setAligned(true); - output->setDataLocation(Device); - return true; - } - - void apply() { - split_axis = fill_split_arg(param_); - - if (split_axis == 0 && param_.splitParams().size() > 1) { - ConcatParam& concat_param = concatPE_.param(); - for (auto conv_param : param_.splitParams()) { - concat_param.inputs.push_back(&conv_param->output); - } - concat_param.output = param_.output; - concatPE_.init(); - concatPE_.apply(); - } - } - void cpu_compute() { - Tensor* input = param_.input; - Tensor* output = param_.output; - input->syncToCPU(); - - Tensor float_input; - Tensor float_output; - float* image_addr = float_input.mutableData(FP32, input->shape()); - float_input.copyFrom(input); - float* out = float_output.mutableData(FP32, output->shape()); - - int out_channel = output->shape().channel(); - int in_channel = input->shape().channel(); - - float* filter_data = param_.filter->data(); - float* mi = new float[in_channel]; - - for (int i = 0; i < out_channel; i++) { - float* image = image_addr; - float* filter_ptr = filter_data + i * in_channel; - float* out_ptr = mi; -#pragma omp parallel for - for (int j = 0; j < in_channel; j++) { - float value = image_addr[j] * filter_ptr[j]; - mi[j] = value; - } - - float sum = 0; - for (int j = 0; j < in_channel; j++) { - sum += mi[j]; - } - out[i] = sum; - } - delete[] mi; - float_output.flush(); - output->copyFrom(&float_output); - } - - bool dispatch() { - inplace_.relu_enable = param_.relu.enabled; - inplace_.power_enable = false; - inplace_.normalize_enable = false; - - if (param_.relu.enabled) { - inplace_.relu_enable = param_.relu.enabled; - config_inplace(inplace_); - } - - std::vector& params = param_.splitParams(); - int ret = 0; - for (auto conv_param : params) { - ret |= compute_fpga_conv_basic(conv_param->args); - } - - if (param_.relu.enabled) { - inplace_.relu_enable = false; - config_inplace(inplace_); - } - - size_t size = params.size(); - if (split_axis == 0 && ret == 0 && size > 1) { - concatPE_.dispatch(); - } - if (split_axis == 1 && ret == 0 && size > 1) { - ElementwiseAddParam& add_param = addPE_.param(); - 
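// NOTE: split_axis == 0 means the filters (output channels) were split, so
// the partial outputs are disjoint channel ranges and are stitched back
// together with ConcatPE; split_axis == 1 means the *input* channels were
// split (see split_channel() in conv_process.hpp), so each partial output is
// a partial sum and the pieces are combined by the element-wise add below.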
add_param.inputs = {&params[0]->output, &params[1]->output}; - add_param.output = param_.output; - addPE_.init(); - addPE_.apply(); - addPE_.dispatch(); - } - return ret == 0; - } - - ConvParam& param() { return param_; } - - private: - ConvParam param_; - ConcatPE concatPE_; - ElementwiseAddPE addPE_; - int split_axis = 0; - InplaceArgs inplace_ = {0}; -}; - -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/pes/conv_process.hpp b/lite/backends/fpga/KD/pes/conv_process.hpp deleted file mode 100644 index fd17218d06..0000000000 --- a/lite/backends/fpga/KD/pes/conv_process.hpp +++ /dev/null @@ -1,418 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include - -#include "lite/backends/fpga/KD/float16.hpp" -#include "lite/backends/fpga/KD/llapi/bias_scale.h" -#include "lite/backends/fpga/KD/llapi/filter.h" -#include "lite/backends/fpga/KD/pe_params.hpp" -#include "lite/backends/fpga/KD/tensor.hpp" -#include "lite/backends/fpga/KD/tensor_util.hpp" - -namespace paddle { -namespace zynqmp { - -inline int get_aligned_filter_element_num(int chw) { - return align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); -} - -inline int get_filter_num_per_div(Tensor* filter, int group_num) { - auto chw = filter->shape().channel() * filter->shape().height() * - filter->shape().width(); - auto num = filter->shape().num(); - int div_capacity = filter::calc_division_capacity(chw); - return filter::calc_num_per_div(num, group_num, div_capacity); -} - -inline int get_split_num(Tensor* filter) { - auto chw = filter->shape().channel() * filter->shape().height() * - filter->shape().width(); - auto num = filter->shape().num(); - int div_capacity = filter::calc_division_capacity(chw); - return filter::calc_split_num(num, div_capacity); -} - -inline void fill_scale_bias_const(ConvParam* param_) { - int channel = param_->output->shape().channel(); - Shape sb_shape(N, {channel}); - float* new_scale_ptr = param_->scale()->mutableData(FP32, sb_shape); - float* new_bias_ptr = param_->bias()->mutableData(FP32, sb_shape); - for (int i = 0; i < channel; i++) { - new_scale_ptr[i] = 1.0f; - new_bias_ptr[i] = 0.0f; - } - param_->scale()->flush(); - param_->bias()->flush(); -} - -inline void combine_bn_params(BatchnormParam* bn, ConvParam* param_) { - int channel = param_->output->shape().channel(); - Shape sb_shape(N, {channel}); - float* new_scale_ptr = param_->scale()->mutableData(FP32, sb_shape); - float* new_bias_ptr = param_->bias()->mutableData(FP32, sb_shape); - float* bn_scale_ptr = bn->scale->data(); - float* bn_bias_ptr = bn->bias->data(); - float* bn_var_ptr = bn->variance->data(); - float* bn_mean_ptr = bn->mean->data(); - float epsilon = bn->epsilon; - for (int i = 0; i < channel; i++) { - float new_scale = bn_scale_ptr[i] / - static_cast(pow((bn_var_ptr[i] + epsilon), 0.5)); - new_scale_ptr[i] = new_scale; - new_bias_ptr[i] = bn_bias_ptr[i] + (0 - bn_mean_ptr[i]) * new_scale_ptr[i]; - } -} - -inline void 
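// NOTE: combine_bn_params() above folds batch-norm into the conv's
// per-channel scale/bias:
//   scale' = gamma / sqrt(var + eps)
//   bias'  = beta - mean * scale'
// so that scale' * x + bias' == gamma * (x - mean) / sqrt(var + eps) + beta
// for any conv output x, i.e. the BN layer disappears into the conv. A tiny
// numeric check with illustrative values: gamma = 2, beta = 1, mean = 0.5,
// var = 0.24, eps = 0.01 gives scale' = 4, bias' = -1, and x = 1 maps to 3
// on both sides of the identity.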
combine_add_bn_params(BatchnormParam* bn, - Tensor* bias, - ConvParam* param_) { - int channel = param_->output->shape().channel(); - Shape sb_shape(N, {channel}); - float* new_scale_ptr = param_->scale()->mutableData(FP32, sb_shape); - float* new_bias_ptr = param_->bias()->mutableData(FP32, sb_shape); - if (bn != nullptr) { - float* bn_scale_ptr = bn->scale->data(); - float* bn_bias_ptr = bn->bias->data(); - float* bn_var_ptr = bn->variance->data(); - float* bn_mean_ptr = bn->mean->data(); - float epsilon = bn->epsilon; - float* bias_data = bias->data(); - for (int i = 0; i < channel; i++) { - float new_scale = bn_scale_ptr[i] / - static_cast(pow((bn_var_ptr[i] + epsilon), 0.5)); - new_scale_ptr[i] = new_scale; - new_bias_ptr[i] = - bn_bias_ptr[i] + (bias_data[i] - bn_mean_ptr[i]) * new_scale_ptr[i]; - } - } else { - for (int i = 0; i < channel; i++) { - new_scale_ptr[i] = 1.0f; - new_bias_ptr[i] = 0.0f; - } - } - param_->scale()->flush(); - param_->bias()->flush(); - param_->scale()->setDataLocation(CPU); - param_->bias()->setDataLocation(CPU); -} - -inline void format_scale_bias(Tensor* scale, - Tensor* bias, - Tensor* filter, - Tensor* scale_bias, - int group) { - float* scale_data = nullptr; - float* bias_data = nullptr; - if (scale != nullptr) { - scale_data = scale->data(); - } - if (bias != nullptr) { - bias_data = bias->data(); - } - int channel = filter->shape().num(); - Shape bias_scale_shape(N, {2 * channel}); - float* bs_data = scale_bias->mutableData(FP32, bias_scale_shape); - for (int i = 0; i < channel; i++) { - float scale_value = scale_data == nullptr ? 1 : scale_data[i]; - float bias_value = bias_data == nullptr ? 0 : bias_data[i]; - bs_data[i + channel] = scale_value; - bs_data[i] = bias_value; - } - - int element_num_per_div = get_filter_num_per_div(filter, group); - bias_scale::format_bias_scale_array(&bs_data, element_num_per_div, channel); -} - -inline void format_filter(Tensor* filter, Tensor* quantized_filter, int group) { - float max_value = find_max(*filter); - Shape& filter_shape = filter->shape(); - quantized_filter->setAligned(true); - quantized_filter->mutableData(INT8, filter->shape()); - quantized_filter->scale()[0] = max_value / 127.0f; - quantized_filter->scale()[1] = 127.0f / max_value; - - auto memory_size = filter->shape().memorySize(sizeof(float)); - auto new_data = reinterpret_cast(fpga_malloc(memory_size)); - memcpy(new_data, filter->data(), memory_size); - size_t mem_size = filter::format_filter(&new_data, - filter_shape.num(), - filter_shape.channel(), - filter_shape.height(), - filter_shape.width(), - group, - max_value); - int8_t* src = quantized_filter->mutableData(INT8, filter->shape()); - memcpy(src, new_data, mem_size); - fpga_free(new_data); - quantized_filter->flush(); -} - -inline void format_dw_filter(Tensor* filter, - Tensor* quantized_filter, - float* scale) { - int num = filter->shape().num(); - int height = filter->shape().height(); - int width = filter->shape().width(); - auto memory_size = filter->shape().memorySize(sizeof(float)); - auto new_data = (float*)fpga_malloc(memory_size); // NOLINT - memcpy(new_data, filter->data(), memory_size); - - size_t size = - filter::format_dwconv_filter(&new_data, num, height, width, scale); - float16* src = quantized_filter->mutableData(FP16, filter->shape()); - - memcpy(src, new_data, size); - quantized_filter->flush(); - - fpga_free(new_data); -} - -inline void format_fc_filter(Tensor* filter, Tensor* quantized_filter) { - float max_value = find_max(*filter); - Shape& filter_shape = 
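// NOTE: as in format_filter() above, the filter is quantized symmetrically
// to int8: q = round(w * 127 / max), with scale()[0] = max / 127 kept for
// dequantization (w ~= q * max / 127) and scale()[1] = 127 / max as its
// reciprocal. E.g. max = 0.5 gives scale()[0] ~= 0.00394, and a weight of
// 0.25 quantizes to q = 64 (illustrative numbers, not from this file).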
filter->shape(); - quantized_filter->setAligned(true); - quantized_filter->mutableData(INT8, filter->shape()); - quantized_filter->scale()[0] = max_value / 127.0f; - quantized_filter->scale()[1] = 127.0f / max_value; - - size_t memory_size = filter->shape().memorySize(sizeof(float)); - auto new_data = (float*)fpga_malloc(memory_size); // NOLINT - memcpy(new_data, filter->data(), memory_size); - - int8_t* src = quantized_filter->mutableData(INT8, filter->shape()); - memcpy(src, new_data, quantized_filter->shape().memorySize(sizeof(int8_t))); - quantized_filter->flush(); - fpga_free(new_data); -} - -inline void split_filter_num(const ConvParam& c_param) { - ConvParam& param = const_cast(c_param); - Tensor* input = param.input; - Tensor* out = param.output; - Tensor* filter = param.filter; - auto channel = out->shape().channel(); - - int split_num = param.groups == 1 ? get_split_num(param.filter) : 1; - int filter_num_per_div = get_filter_num_per_div(filter, param.groups); - - Shape& out_shape = out->shape(); - for (int i = 0; i < split_num; i++) { - BasicConvParam* conv_param = new BasicConvParam(); - conv_param->output.setDataLocation(Device); - conv_param->output.setAligned(true); - - int filter_num = filter->shape().num(); - float16* out_address = nullptr; - float* out_scale_address = nullptr; - - ConvArgs& args = conv_param->args; - - if (split_num == 1) { - out_address = out->data(); - out_scale_address = out->scale(); - } - filter_num = i == split_num - 1 - ? channel - (split_num - 1) * filter_num_per_div // NOLINT - : filter_num_per_div; - - if (split_num != 1) { - Shape shape(NHWC, {1, out_shape.height(), out_shape.width(), filter_num}); - out_address = conv_param->output.mutableData(FP16, shape); - out_scale_address = conv_param->output.scale(); - } - Shape f_shape(NCHW, - {filter_num, - filter->shape().channel(), - filter->shape().height(), - filter->shape().width()}); - - Tensor new_filter; - float* new_filter_data = new_filter.mutableData(FP32, f_shape); - int filter_hwc = filter->shape().height() * filter->shape().width() * - filter->shape().channel(); - - memcpy(new_filter_data, - filter->data() + i * filter_num_per_div * filter_hwc, - filter_num * filter_hwc * sizeof(float)); - new_filter.flush(); - - conv_param->filter.mutableData(FP32, f_shape); - format_filter(&new_filter, &(conv_param->filter), param.groups); - - int sb_num = 2 * align_to_x(filter_num, BS_NUM_ALIGNMENT); - Tensor scale; - Tensor bias; - - int chnnnel_start = i * filter_num_per_div; - - Shape s_shape(N, {filter_num}); - float* scale_data = scale.mutableData(FP32, s_shape); - float* bias_data = bias.mutableData(FP32, s_shape); - for (int n = 0; n < filter_num; n++) { - scale_data[n] = param.scale()->data()[n + chnnnel_start]; - } - for (int n = 0; n < filter_num; n++) { - bias_data[n] = param.bias()->data()[n + chnnnel_start]; - } - Shape sb_shape(N, {sb_num}); - format_scale_bias(&scale, - &bias, - &conv_param->filter, - &conv_param->scaleBias, - param.groups); - conv_param->scaleBias.flush(); - - args.group_num = param.groups; - args.relu_enabled = param.relu.enabled; - args.sb_address = conv_param->scaleBias.data(); - args.kernel.stride_h = param.strides[1]; - args.kernel.stride_w = param.strides[0]; - args.kernel.height = new_filter.shape().height(); - args.kernel.width = new_filter.shape().width(); - - args.filter_address = conv_param->filter.data(); - args.filter_num = filter_num; - args.filter_scale_address = conv_param->filter.scale(); - args.image.address = input->data(); - args.image.scale_address 
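// NOTE: the filter split above carves `channel` output channels into
// `split_num` chunks of at most filter_num_per_div; only the last chunk
// carries the remainder: channel - (split_num - 1) * filter_num_per_div.
// E.g. channel = 300 with filter_num_per_div = 128 (illustrative numbers)
// gives split_num = 3 and chunk sizes 128, 128, 44. Each chunk gets its own
// BasicConvParam with per-chunk slices of the filter, scale and bias.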
= input->scale(); - args.image.channels = input->shape().channel(); - args.image.width = input->shape().width(); - args.image.height = input->shape().height(); - args.image.pad_width = param.paddings[1]; - args.image.pad_height = param.paddings[0]; - args.output.address = out_address; - args.output.scale_address = out_scale_address; - param.splitParams().push_back(conv_param); - } -} - -inline void split_channel(const ConvParam& c_param) { - ConvParam& param = const_cast(c_param); - Tensor* input = param.input; - Tensor* output = param.output; - input->syncToCPU(); - - int num = ceil(input->shape().channel() * 1.0f / 2047); - int channel = input->shape().channel() / num; - std::cout << "channel::" << channel << "num::" << num << std::endl; - Shape bs_shape(N, {channel}); - - for (int i = 0; i < num; i++) { - BasicConvParam* conv_param = new BasicConvParam(); - - // input && output; - Shape in_shape( - NCHW, {1, channel, input->shape().height(), input->shape().width()}); - conv_param->input.shareDataWith(input, in_shape, channel * i); - conv_param->output.mutableData(FP16, output->shape()); - - // filter transformation; - Shape f_shape(NCHW, {param.filter->shape().num(), channel, 1, 1}); - Tensor new_filter; - - float* dst = new_filter.mutableData(FP32, f_shape); - float* src = param.filter->data() + i * channel; - for (int n = 0; n < f_shape.num(); n++) { - memcpy(dst, src, channel * sizeof(float)); - dst += channel; - src += param.filter->shape().channel(); - } - new_filter.flush(); - format_filter(&new_filter, &(conv_param->filter), param.groups); - - Tensor bias; - Tensor scale; - - float* bias_data = bias.mutableData(FP32, bs_shape); - float* scale_data = scale.mutableData(FP32, bs_shape); - for (int c = 0; c < channel; c++) { - scale_data[c] = 1; - bias_data[c] = param.bias()->data()[c] / num; - } - scale.flush(); - bias.flush(); - format_scale_bias(&scale, - &bias, - &conv_param->filter, - &conv_param->scaleBias, - param.groups); - conv_param->scaleBias.flush(); - - ConvArgs& args = conv_param->args; - args.group_num = param.groups; - args.relu_enabled = param.relu.enabled; - args.sb_address = conv_param->scaleBias.data(); - args.kernel.stride_h = param.strides[1]; - args.kernel.stride_w = param.strides[0]; - args.kernel.height = new_filter.shape().height(); - args.kernel.width = new_filter.shape().width(); - - args.filter_address = conv_param->filter.data(); - args.filter_num = f_shape.num(); - args.filter_scale_address = conv_param->filter.scale(); - args.image.address = conv_param->input.mutableData(); - args.image.scale_address = conv_param->input.scale(); - - args.image.channels = conv_param->input.shape().channel(); - args.image.width = conv_param->input.shape().width(); - args.image.height = conv_param->input.shape().height(); - args.image.pad_width = param.paddings[1]; - args.image.pad_height = param.paddings[0]; - args.output.address = conv_param->output.mutableData(); - args.output.scale_address = conv_param->output.scale(); - param.splitParams().push_back(conv_param); - } -} - -inline int fill_split_arg(const ConvParam& c_param) { - ConvParam& param = const_cast(c_param); - Tensor* input = param.input; - Tensor* output = param.output; - if (output->shape().dimSize() == 4 && input->shape().channel() > 2047 && - input->shape().width() == 1) { - split_channel(c_param); - return 1; - } else { - split_filter_num(c_param); - return 0; - } -} - -inline bool compute_conv(const ConvParam& c_conv_params) { - ConvParam& conv_params = const_cast(c_conv_params); - std::vector& params = 
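// NOTE: split_channel() above handles inputs whose channel count exceeds the
// hardware limit of 2047: the input is cut into num = ceil(C / 2047) slices,
// each run as a separate 1x1 conv, and since the final result is the sum of
// the per-slice partial sums, every slice is given scale 1 and bias / num so
// the bias is counted exactly once overall. E.g. C = 4000 (illustrative)
// gives num = 2 slices of 2000 channels, each contributing bias / 2.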
conv_params.splitParams(); - int ret = 0; - for (auto conv_param : params) { - ret |= compute_fpga_conv_basic(conv_param->args); - } - size_t size = params.size(); - if (ret == 0 && size > 1) { - Tensor& img = params[0]->output; - for (int i = 0; i < 1; i++) { - for (int i = 0; i < img.shape().numel(); i++) { - float value = half_to_float(img.data()[i]); - std::cout << "value:" << value << std::endl; - } - } - } - return ret == 0; -} - -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/pes/crop_pe.cpp b/lite/backends/fpga/KD/pes/crop_pe.cpp deleted file mode 100644 index c29df623aa..0000000000 --- a/lite/backends/fpga/KD/pes/crop_pe.cpp +++ /dev/null @@ -1,88 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "lite/backends/fpga/KD/pes/crop_pe.hpp" - -#include - -namespace paddle { -namespace zynqmp { - -bool CropPE::dispatch() { - Tensor* input = param_.input; - input->syncToCPU(); - const auto axis = param_.axis; - std::vector shape = param_.shape; - auto* out = param_.output; - - Shape out_shape = out->shape(); - float16* src_ptr = reinterpret_cast(input->data()); - float16* dst_ptr = reinterpret_cast( - out->mutableData(DataType::FP16, out_shape)); - - std::vector offsets = param_.offsets; - - int input_c = input->shape().channel(); - int input_h = input->shape().height(); - int input_w = input->shape().width(); - - int out_c = out->shape().channel(); - int out_h = out->shape().height(); - int out_w = out->shape().width(); - if (axis == 1) { - int index = 0; - - int offset_h = offsets[0]; - int offset_w = offsets[0]; - int offset_c = offsets[0]; - - if (offsets.size() == 3) { - offset_h = offsets[1]; - offset_w = offsets[2]; - offset_c = offsets[0]; - } - - for (int h = 0; h < out_h; h++) { - for (int w = 0; w < out_w; w++) { - float16* crop_start = src_ptr + (h + offset_h) * input_w * input_c + - (offset_w * input_c) + offset_c; - std::memcpy(dst_ptr + h * (out_w * out_c) + w * out_c, - crop_start, - out_c * sizeof(float16)); - } - } - } else if (axis == 2) { - int offset_h = offsets[0]; - int offset_w = offsets[0]; - - if (offsets.size() == 2) { - offset_h = offsets[0]; - offset_w = offsets[1]; - } - - for (int h = 0; h < out_h; h++) { - float16* crop_start = - src_ptr + (h + offset_h) * input_w * input_c + (offset_w * input_c); - std::memcpy(dst_ptr + h * out_w * input_c, - crop_start, - out_w * input_c * sizeof(float16)); - } - } - out->flush(); - out->copyScaleFrom(input); - return true; -} - -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/pes/crop_pe.hpp b/lite/backends/fpga/KD/pes/crop_pe.hpp deleted file mode 100755 index 6ebbcdb31f..0000000000 --- a/lite/backends/fpga/KD/pes/crop_pe.hpp +++ /dev/null @@ -1,45 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include - -#include "lite/backends/fpga/KD/float16.hpp" -#include "lite/backends/fpga/KD/pe.hpp" -#include "lite/backends/fpga/KD/pe_params.hpp" - -namespace paddle { -namespace zynqmp { -class CropPE : public PE { - public: - bool init() { - Tensor* output = param_.output; - output->setAligned(true); - output->setDataLocation(CPU); - return true; - } - - void apply() {} - - bool dispatch(); - - CropParam& param() { return param_; } - - private: - CropParam param_; -}; -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp b/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp deleted file mode 100755 index 9d7b9b544b..0000000000 --- a/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp +++ /dev/null @@ -1,102 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "lite/backends/fpga/KD/float16.hpp" -#include "lite/backends/fpga/KD/pe.hpp" -#include "lite/backends/fpga/KD/pe_params.hpp" -#include "lite/backends/fpga/KD/pes/conv_process.hpp" - -namespace paddle { -namespace zynqmp { - -class DepthwiseConvPE : public PE { - public: - bool init() { - Tensor* output = param_.output; - output->setAligned(true); - output->setDataLocation(Device); - return true; - } - - void apply() { - DepthwiseConvParam& param = param_; - Tensor* input = param.input; - Tensor* output = param.output; - int channel = output->shape().channel(); - - float* new_scale_data = param_.scale()->data(); - float* new_bias_data = param_.bias()->data(); - - float16* b_data = bias_.mutableData(FP16, param_.bias()->shape()); - for (int i = 0; i < channel; i++) { - b_data[i] = float_to_half(new_bias_data[i]); - } - bias_.flush(); - - Tensor* quantized_filter = param.quantizedFilter(); - quantized_filter->mutableData(FP16, param.filter->shape()); - format_dw_filter(param.filter, param.quantizedFilter(), new_scale_data); - - DWconvArgs args = {0}; - args.bias_address = b_data; - args.filter_address = param.quantizedFilter()->data(); - args.kernel.width = param.filter->shape().height(); - args.kernel.height = param.filter->shape().width(); - args.kernel.stride_w = param.strides[0]; - args.kernel.stride_h = param.strides[1]; - args.image.address = input->data(); - args.image.channels = input->shape().channel(); - args.image.height = input->shape().height(); - args.image.width = input->shape().width(); - args.image.pad_width = param.paddings[0]; - args.image.pad_height = param.paddings[1]; - args.image.scale_address = input->scale(); - args.output.address = output->data(); - args.output.scale_address = output->scale(); - args.out_width = param.output->shape().width(); - args.out_height = param.output->shape().height(); - args.sub_conv_num = 1; - param.args = args; - - inplace_.relu_enable = param_.relu.enabled; - inplace_.power_enable = false; - inplace_.normalize_enable = false; - } - - bool dispatch() { - param_.input->syncToDevice(); - if (param_.relu.enabled) { - inplace_.relu_enable = param_.relu.enabled; - config_inplace(inplace_); - } - bool ret = compute_fpga_dwconv(param_.args) == 0; - if (param_.relu.enabled) { - inplace_.relu_enable = false; - config_inplace(inplace_); - } - return ret; - } - - DepthwiseConvParam& param() { return param_; } - - private: - DepthwiseConvParam param_; - Tensor bias_; - InplaceArgs inplace_ = {0}; -}; - -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/pes/elementwise_add_pe.hpp b/lite/backends/fpga/KD/pes/elementwise_add_pe.hpp deleted file mode 100755 index a498a2bde9..0000000000 --- a/lite/backends/fpga/KD/pes/elementwise_add_pe.hpp +++ /dev/null @@ -1,81 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "lite/backends/fpga/KD/pe.hpp" -#include "lite/backends/fpga/KD/pe_params.hpp" - -namespace paddle { -namespace zynqmp { - -class ElementwiseAddPE : public PE { - public: - bool init() { - Tensor* output = param_.output; - output->setAligned(true); - output->setDataLocation(Device); - return true; - } - - void apply() { - Tensor* input0 = param_.inputs[0]; - Tensor* input1 = param_.inputs[1]; - Tensor* output = param_.output; - EWAddArgs args = {0}; - args.const0 = 0x3c00; - args.const1 = 0x3c00; // =1 - args.image0.address = input0->data(); - args.image0.channels = input0->shape().channel(); - args.image0.scale_address = input0->scale(); - args.image0.height = input0->shape().height(); - args.image0.width = input0->shape().width(); - args.image0.pad_height = 0; - args.image0.pad_width = 0; - args.image1.address = input1->data(); - args.image1.channels = input1->shape().channel(); - args.image1.scale_address = input1->scale(); - args.image1.height = input1->shape().height(); - args.image1.width = input1->shape().width(); - args.image1.pad_height = 0; - args.image1.pad_width = 0; - args.output.scale_address = output->scale(); - args.output.address = output->data(); - param_.ewargs = args; - } - - bool dispatch() { - param_.inputs[0]->syncToDevice(); - param_.inputs[1]->syncToDevice(); - InplaceArgs inplace_args = {0}; - if (param_.relu.enabled) { - inplace_args.relu_enable = true; - config_inplace(inplace_args); - } - compute_fpga_ewadd(param_.ewargs); - if (param_.relu.enabled) { - inplace_args.relu_enable = false; - config_inplace(inplace_args); - } - return true; - } - - ElementwiseAddParam& param() { return param_; } - - private: - ElementwiseAddParam param_; -}; - -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/pes/fully_connected_pe.hpp b/lite/backends/fpga/KD/pes/fully_connected_pe.hpp deleted file mode 100644 index 2179a142ad..0000000000 --- a/lite/backends/fpga/KD/pes/fully_connected_pe.hpp +++ /dev/null @@ -1,94 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include - -#include "lite/backends/fpga/KD/pe.hpp" -#include "lite/backends/fpga/KD/pe_params.hpp" -#include "lite/backends/fpga/KD/pes/conv_pe.hpp" -#include "lite/backends/fpga/KD/pes/conv_process.hpp" - -namespace paddle { -namespace zynqmp { - -class FullyConnectedPE : public PE { - public: - bool init() { - Tensor* output = param_.output; - output->setAligned(true); - output->setDataLocation(Device); - return true; - } - - void apply() { - ConvParam& convParam_ = convPE_.param(); - Tensor* input = param_.input; - convParam_.input = param_.input; - convParam_.output = param_.output; - convParam_.groups = 1; - convParam_.strides = {1, 1}; - convParam_.paddings = {0, 0}; - convParam_.kernelSize = {input->shape().width(), input->shape().height()}; - convParam_.dilations = {1, 1}; - - int num = param_.filter->shape().channel(); - int chw = param_.filter->shape().num(); - - int height = param_.input->shape().height(); - int width = param_.input->shape().width(); - int filter_channel = chw / height / width; - - int channel = param_.output->shape().channel(); - Shape shape(NCHW, {num, filter_channel, height, width}); - Tensor* conv_filter = new Tensor(); - float* new_filter_data = conv_filter->mutableData(FP32, shape); - float* filter_data = param_.filter->data(); - - for (int i = 0; i < num; i++) { - for (int j = 0; j < chw; j++) { - float scale = filter_data[j * num + i]; - new_filter_data[i * chw + j] = scale; - } - } - - conv_filter->flush(); - convParam_.filter = conv_filter; - - Shape sb_shape(N, {channel}); - float* scale_data = convParam_.scale()->mutableData(FP32, sb_shape); - float* bias_data = convParam_.bias()->mutableData(FP32, sb_shape); - - for (int i = 0; i < channel; i++) { - scale_data[i] = 1.0f; - bias_data[i] = param_.bias->data()[i]; - } - convParam_.scale()->flush(); - convParam_.bias()->flush(); - - convPE_.init(); - convPE_.apply(); - } - - bool dispatch() { return convPE_.dispatch(); } - - FullyConnectedParam& param() { return param_; } - - private: - FullyConnectedParam param_; - ConvPE convPE_; -}; -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/pes/input_pe.hpp b/lite/backends/fpga/KD/pes/input_pe.hpp deleted file mode 100755 index 380c85e17e..0000000000 --- a/lite/backends/fpga/KD/pes/input_pe.hpp +++ /dev/null @@ -1,54 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "lite/backends/fpga/KD/pe.hpp" -#include "lite/backends/fpga/KD/pe_params.hpp" -namespace paddle { -namespace zynqmp { - -class InputPE : public PE { - public: - bool init() { - Tensor* output = param_.output; - output->setAligned(true); - output->setDataLocation(Device); - return true; - } - - bool dispatch() { - Tensor* input = param_.input; - Tensor* output = param_.output; - - Tensor* src = input; - input->flush(); - Tensor half_tensor; - if (input->dataType() == DataType::FP32) { - half_tensor.mutableData(DataType::FP16, input->shape()); - half_tensor.copyFrom(input); - src = &half_tensor; - } - output->mutableData(); - src->alignImage(output, true); - return true; - } - - InputParam& param() { return param_; } - - private: - InputParam param_; -}; -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/pes/norm_pe.hpp b/lite/backends/fpga/KD/pes/norm_pe.hpp deleted file mode 100644 index 3e2fd80627..0000000000 --- a/lite/backends/fpga/KD/pes/norm_pe.hpp +++ /dev/null @@ -1,121 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include - -#include "lite/backends/fpga/KD/float16.hpp" -#include "lite/backends/fpga/KD/pe.hpp" -#include "lite/backends/fpga/KD/pe_params.hpp" - -namespace paddle { -namespace zynqmp { -class NormPE : public PE { - public: - bool init() { - Tensor* output = param_.output; - output->setAligned(true); - output->setDataLocation(Device); - return true; - } - - void apply() { - inplace_args_.relu_enable = false; - inplace_args_.power_enable = false; - inplace_args_.normalize_enable = true; - - Shape& input_shape = param_.input->shape(); - - norm_param_args_.channel = input_shape.channel(); - norm_param_args_.hight_width = input_shape.height() * input_shape.width(); - - float16* mid_data = - mid_out_.mutableData(FP16, param_.output->shape()); - - bypass_args_.input_data_type = DATA_TYPE_FP16; - bypass_args_.output_data_type = DATA_TYPE_FP16; - bypass_args_.input_layout_type = LAYOUT_HWC; - bypass_args_.output_layout_type = LAYOUT_HWC; - bypass_args_.image.address = param_.input->data(); - bypass_args_.image.scale_address = param_.input->scale(); - bypass_args_.image.channels = input_shape.channel(); - bypass_args_.image.height = input_shape.height(); - bypass_args_.image.width = input_shape.width(); - bypass_args_.output.address = mid_out_.data(); - bypass_args_.output.scale_address = mid_out_.scale(); - - norm_args_.input_image_address = mid_data; - norm_args_.image_width = input_shape.width(); - norm_args_.image_height = input_shape.height(); - norm_args_.image_channel = input_shape.channel(); - norm_args_.output_image_address = param_.output->data(); - norm_args_.output_scale_address = - reinterpret_cast(param_.output->scale()); - } - - void cpuCompute() { - Tensor input_float; - Tensor float_out; - input_float.mutableData(FP32, param_.input->shape()); - float_out.mutableData(FP32, param_.output->shape()); - - 
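// NOTE: the CPU fallback below computes a per-pixel L2 normalization across
// channels: out[i][c] = in[i][c] / sqrt(epsilon + sum_c in[i][c]^2). The
// FPGA route (bypass into normalize, configured in apply() above) is set up
// but dispatch() currently takes this CPU path.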
input_float.copyFrom(param_.input); - input_float.syncToCPU(); - - int channel = input_float.shape().channel(); - int height = input_float.shape().height(); - int width = input_float.shape().width(); - int cw = channel * width; - - Tensor* input = &input_float; - float* input_ptr = input->data(); - float* out_ptr = float_out.data(); - - int loop = height * width; - for (int i = 0; i < loop; i++) { - float sum = param_.epsilon; - for (int c = 0; c < channel; c++) { - float value = input_ptr[i * channel + c]; - sum += value * value; - } - float norm = sqrtf(sum); -#pragma omp parallel for - for (int c = 0; c < channel; c++) { - out_ptr[i * channel + c] = input_ptr[i * channel + c] / norm; - } - } - float_out.flush(); - param_.output->copyFrom(&float_out); - } - - bool dispatch() { - cpuCompute(); - return true; - } - - NormParam& param() { return param_; } - - private: - NormParam param_; - Tensor mid_out_; - InplaceArgs inplace_args_ = {0}; - NormalizeParameterArgs norm_param_args_ = {0}; - BypassArgs bypass_args_; - - NormalizeArgs norm_args_ = {0}; -}; -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/pes/output_pe.hpp b/lite/backends/fpga/KD/pes/output_pe.hpp deleted file mode 100644 index 1c99386ab1..0000000000 --- a/lite/backends/fpga/KD/pes/output_pe.hpp +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "lite/backends/fpga/KD/pe.hpp" -#include "lite/backends/fpga/KD/pe_params.hpp" - -namespace paddle { -namespace zynqmp { - -class OutputPE : public PE { - public: - bool init() { - Tensor* output = param_.output; - output->setAligned(false); - return true; - } - - bool dispatch() { - Tensor* input = param_.input; - Tensor* output = param_.output; - if (input->aligned()) { - Tensor tmp; - tmp.setAligned(true); - tmp.mutableData(FP16, input->shape()); - tmp.copyFrom(input); - tmp.unalignImage(); - output->copyFrom(&tmp); - } else { - output->copyFrom(input); - } - return true; - } - - OutputParam& param() { return param_; } - - private: - OutputParam param_; -}; -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/pes/pooling_pe.hpp b/lite/backends/fpga/KD/pes/pooling_pe.hpp deleted file mode 100644 index fd3be1f463..0000000000 --- a/lite/backends/fpga/KD/pes/pooling_pe.hpp +++ /dev/null @@ -1,176 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include - -#include "lite/backends/fpga/KD/pe.hpp" -#include "lite/backends/fpga/KD/pe_params.hpp" - -namespace paddle { -namespace zynqmp { - -class PoolingPE : public PE { - public: - bool init() { - Tensor* output = param_.output; - output->setAligned(true); - output->setDataLocation(Device); - return true; - } - - void apply() { - Tensor* input = param_.input; - Tensor* output = param_.output; - - uint32_t k_width = param_.kernelSize[0]; - uint32_t k_height = param_.kernelSize[1]; - - if (param_.globalPooling) { - k_width = input->shape().width(); - k_height = input->shape().height(); - } - - PoolingArgs args = {0}; - args.mode = param_.type; - args.kernel_reciprocal = fp32_2_fp16(1.0f / (k_width * k_height)); - args.image.address = input->data(); - args.image.channels = input->shape().channel(); - args.image.height = input->shape().height(); - args.image.width = input->shape().width(); - args.image.pad_height = param_.paddings[0]; - args.image.pad_width = param_.paddings[1]; - args.image.scale_address = input->scale(); - args.output.address = output->mutableData(); - args.output.scale_address = output->scale(); - args.kernel.height = k_height; - args.kernel.width = k_width; - args.kernel.stride_h = param_.strides[0]; - args.kernel.stride_w = param_.strides[1]; - args.out_height = output->shape().height(); - args.out_width = output->shape().width(); - param_.poolingArgs = args; - - use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1 && - (k_width > 7 || k_height > 7); - } - - void compute() { - Tensor* input = param_.input; - Tensor* output = param_.output; - input->syncToCPU(); - - Tensor float_input; - float* image_addr = float_input.mutableData(FP32, input->shape()); - float_input.copyFrom(input); - float16* data_out = output->data(); - - int image_height = input->shape().height(); - int image_width = input->shape().width(); - int image_channels = input->shape().channel(); - int image_pad_h = param_.paddings[0]; - int image_pad_w = param_.paddings[1]; - int kernel_height = param_.kernelSize[1]; - int kernel_width = param_.kernelSize[0]; - int kernel_step_h = param_.strides[0]; - int kernel_step_w = param_.strides[1]; - - int pooled_height_ = output->shape().height(); - int pooled_width_ = output->shape().width(); - - int kernel = kernel_height * kernel_width; - - float max = 0; - - for (int ph = 0; ph < pooled_height_; ++ph) { - for (int pw = 0; pw < pooled_width_; ++pw) { - int hstart = ph * kernel_step_h - image_pad_h; - int wstart = pw * kernel_step_w - image_pad_w; - int hend = std::min(hstart + kernel_height, image_height); - int wend = std::min(wstart + kernel_width, image_width); - hstart = std::max(hstart, 0); - wstart = std::max(wstart, 0); - - kernel = (hend - hstart) * (wend - wstart); - for (int c = 0; c < image_channels; ++c) { - const int pool_index = (ph * pooled_width_ + pw) * image_channels + c; - float sum = 0; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - const int index = (h * image_width + w) * image_channels + c; - float value = image_addr[index]; - sum += value; - } - } - float value = sum / kernel; - if (value > max) { - max = value; - } - data_out[pool_index] = float_to_half(value); - } - } - } - output->scale()[0] = max / 127.0f; - output->scale()[1] = 127.0f / max; - output->flush(); - } - - void cpu_compute() { - Tensor* input = param_.input; - Tensor* output = param_.output; - input->syncToCPU(); - - Tensor float_input; - float_input.mutableData(FP32, input->shape()); - 
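// NOTE: compute() above implements pooling with edge-clamped windows: each
// output's window [hstart, hend) x [wstart, wend) is clipped to the image
// and the divisor `kernel` is recomputed as the clipped area, so border
// outputs average only the valid pixels. The output's fp16 scale then tracks
// the running maximum: scale[0] = max / 127, scale[1] = 127 / max.
// cpu_compute() below is the global-average special case, reducing one
// kernel-sized window per channel.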
float_input.copyFrom(input); - float16* data_out = output->data(); - - int kernel_hw = param_.kernelSize[0] * param_.kernelSize[1]; - - float scale_max = 0; - for (int i = 0; i < output->shape().channel(); i++) { - float sum = 0; - for (int j = 0; j < kernel_hw; j++) { - float value = half_to_float(input->data()[i * kernel_hw + j]); - sum += value; - } - float value = sum / kernel_hw; - data_out[i] = float_to_half(value); - scale_max = std::max(scale_max, std::abs(value)); - } - output->scale()[0] = scale_max / 127.0f; - output->scale()[1] = 127.0f / scale_max; - std::cout << "pool scale:" << scale_max / 127.0f << std::endl; - output->flush(); - } - - bool dispatch() { - if (use_cpu_) { - compute(); - return true; - } - param_.input->syncToDevice(); - return compute_fpga_pool(param_.poolingArgs) == 0; - } - - PoolingParam& param() { return param_; } - - private: - PoolingParam param_; - bool use_cpu_; -}; - -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/pes/prior_box_pe.cpp b/lite/backends/fpga/KD/pes/prior_box_pe.cpp deleted file mode 100644 index d6a503a31d..0000000000 --- a/lite/backends/fpga/KD/pes/prior_box_pe.cpp +++ /dev/null @@ -1,273 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include - -#include "lite/backends/fpga/KD/pes/prior_box_pe.hpp" - -namespace paddle { -namespace zynqmp { - -struct Transform { - template - void operator()(InputIter first, - InputIter last, - OutputIter result, - UnaryOperation op) { - std::transform(first, last, result, op); - } - - template - void operator()(InputIter1 first1, - InputIter1 last1, - InputIter2 first2, - OutputIter result, - BinaryOperation op) { - std::transform(first1, last1, first2, result, op); - } -}; - -inline void ExpandAspectRatios(const std::vector &input_aspect_ratior, - bool flip, - std::vector *output_aspect_ratior) { - constexpr float epsilon = 1e-6; - output_aspect_ratior->clear(); - output_aspect_ratior->push_back(1.0f); - for (size_t i = 0; i < input_aspect_ratior.size(); ++i) { - float ar = input_aspect_ratior[i]; - bool already_exist = false; - for (size_t j = 0; j < output_aspect_ratior->size(); ++j) { - if (fabs(ar - output_aspect_ratior->at(j)) < epsilon) { - already_exist = true; - break; - } - } - if (!already_exist) { - output_aspect_ratior->push_back(ar); - if (flip) { - output_aspect_ratior->push_back(1.0f / ar); - } - } - } -} - -template -struct ClipFunctor { - inline T operator()(T in) const { - return std::min(std::max(in, 0.), 1.); - } -}; - -void PriorBoxPE::compute_prior_box() { - PriorBoxParam ¶m = param_; - Tensor *input = param.input; - Shape &input_shape = input->shape(); - - Tensor *input_image = param.image; - Shape &image_shape = input_image->shape(); - - const auto &min_sizes = param.minSizes; - const auto &max_sizes = param.maxSizes; - const auto &input_aspect_ratio = param.aspectRatios; - const bool &flip = param.flip; - const bool &clip = param.clip; - const float &step_w = param.stepW; - const float &step_h = param.stepH; - const float &offset = param.offset; - - Tensor *output_boxes = this->cachedBoxes_; - Tensor *output_variances = this->cachedVariances_; - - Tensor boxes; - Tensor variances; - - float *output_boxes_dataptr = - boxes.mutableData(FP32, output_boxes->shape()); - memset(output_boxes_dataptr, 0, boxes.memorySize()); - float *output_variances_dataptr = - variances.mutableData(FP32, output_boxes->shape()); - - std::vector aspect_ratios; - ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios); - - auto img_width = image_shape.width(); - auto img_height = image_shape.height(); - auto feature_width = input_shape.width(); - auto feature_height = input_shape.height(); - - auto stride0 = output_boxes->shape().channel() * - output_boxes->shape().height() * output_boxes->shape().width(); - auto stride1 = output_boxes->shape().height() * output_boxes->shape().width(); - auto stride2 = output_boxes->shape().width(); - - float step_width = step_w; - float step_height = step_h; - if (step_w == 0 || step_h == 0) { - step_width = static_cast(img_width) / feature_width; - step_height = static_cast(img_height) / feature_height; - } - - int num_priors = aspect_ratios.size() * min_sizes.size(); - if (!max_sizes.empty()) { - num_priors += max_sizes.size(); - } - - for (int h = 0; h < feature_height; ++h) { - for (int w = 0; w < feature_width; ++w) { - /// map origin image - float center_x = (w + offset) * step_width; - float center_y = (h + offset) * step_height; - float box_width, box_height; - int idx = 0; - for (size_t s = 0; s < min_sizes.size(); ++s) { - auto min_size = min_sizes[s]; - if (param.minMaxAspectRatiosOrder) { - box_width = box_height = min_size / 2.; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 
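// NOTE: each prior below is stored as normalized corners
//   (cx - bw) / img_w, (cy - bh) / img_h, (cx + bw) / img_w, (cy + bh) / img_h
// where bw/bh are *half* widths/heights (hence the "/ 2." factors), and
// num_priors = expanded_aspect_ratios * min_sizes + max_sizes per location.
// E.g. 2 min sizes, ratios {1, 2, 1/2} after flip expansion, and 2 max sizes
// (illustrative counts) give 2 * 3 + 2 = 8 priors per feature-map cell.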
+ 0] = - (center_x - box_width) / img_width; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 1] = - (center_y - box_height) / img_height; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 2] = - (center_x + box_width) / img_width; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 3] = - (center_y + box_height) / img_height; - idx++; - - if (max_sizes.size() > 0) { - auto max_size = max_sizes[s]; - // square prior with size sqrt(minSize * maxSize) - box_width = box_height = sqrt(min_size * max_size) / 2.; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 0] = (center_x - box_width) / img_width; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 1] = (center_y - box_height) / img_height; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 2] = (center_x + box_width) / img_width; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 3] = (center_y + box_height) / img_height; - - idx++; - } - - // priors with different aspect ratios - for (float ar : aspect_ratios) { - if (fabs(ar - 1.) < 1e-6) { - continue; - } - box_width = min_size * sqrt(ar) / 2.; - box_height = min_size / sqrt(ar) / 2.; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 0] = (center_x - box_width) / img_width; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 1] = (center_y - box_height) / img_height; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 2] = (center_x + box_width) / img_width; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 3] = (center_y + box_height) / img_height; - - idx++; - } - - } else { - // priors with different aspect ratios - for (float ar : aspect_ratios) { - box_width = min_size * sqrt(ar) / 2.; - box_height = min_size / sqrt(ar) / 2.; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 0] = (center_x - box_width) / img_width; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 1] = (center_y - box_height) / img_height; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 2] = (center_x + box_width) / img_width; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 3] = (center_y + box_height) / img_height; - idx++; - } - if (!max_sizes.empty()) { - auto max_size = max_sizes[s]; - // square prior with size sqrt(minSize * maxSize) - box_width = box_height = sqrt(min_size * max_size) / 2.; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 0] = (center_x - box_width) / img_width; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 1] = (center_y - box_height) / img_height; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 2] = (center_x + box_width) / img_width; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + - 3] = (center_y + box_height) / img_height; - idx++; - } - } - } - } - } - if (clip) { - for (int i = 0; i < output_boxes->shape().numel(); i++) { - float value = output_boxes_dataptr[i]; - value = std::min(std::max(0.0f, value), 1.0f); - output_boxes_dataptr[i] = value; - } - } - - if ((param.variances.size() != 4)) { - // TODO(chonwhite) throw error; - } - - int64_t box_num = feature_height * feature_width * num_priors; - - for (int i = 0; i < box_num; i++) { - output_variances_dataptr[4 * i] = param.variances[0]; - output_variances_dataptr[4 * i + 1] = param.variances[1]; - output_variances_dataptr[4 * i + 2] = param.variances[2]; - output_variances_dataptr[4 
* i + 3] = param.variances[3]; - } - - boxes.flush(); - boxes.syncToCPU(); - variances.flush(); - output_boxes->copyFrom(&boxes); - output_variances->copyFrom(&variances); -} - -void PriorBoxPE::apply() {} - -bool PriorBoxPE::dispatch() { - if (cachedBoxes_ == nullptr) { - cachedBoxes_ = new Tensor(); - cachedVariances_ = new Tensor(); - cachedBoxes_->mutableData(FP16, param_.outputBoxes->shape()); - cachedVariances_->mutableData(FP16, - param_.outputVariances->shape()); - cachedBoxes_->setDataLocation(CPU); - cachedVariances_->setDataLocation(CPU); - compute_prior_box(); - } - - param_.outputBoxes->copyFrom(this->cachedBoxes_); - - param_.outputVariances->copyFrom(this->cachedVariances_); - param_.outputBoxes->flush(); - param_.outputBoxes->syncToCPU(); - param_.outputVariances->flush(); -} - -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/pes/prior_box_pe.hpp b/lite/backends/fpga/KD/pes/prior_box_pe.hpp deleted file mode 100755 index 8afe40dd30..0000000000 --- a/lite/backends/fpga/KD/pes/prior_box_pe.hpp +++ /dev/null @@ -1,46 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "lite/backends/fpga/KD/pe.hpp" -#include "lite/backends/fpga/KD/pe_params.hpp" -namespace paddle { -namespace zynqmp { - -class PriorBoxPE : public PE { - public: - bool init() { - param_.outputBoxes->setAligned(false); - param_.outputVariances->setAligned(false); - param_.outputBoxes->setDataLocation(CPU); - param_.outputVariances->setDataLocation(CPU); - return true; - } - - bool dispatch(); - - void apply(); - - PriorBoxParam& param() { return param_; } - - private: - PriorBoxParam param_; - Tensor* cachedBoxes_ = nullptr; - Tensor* cachedVariances_ = nullptr; - - void compute_prior_box(); -}; -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/pes/relu_pe.hpp b/lite/backends/fpga/KD/pes/relu_pe.hpp deleted file mode 100755 index 5c125010c2..0000000000 --- a/lite/backends/fpga/KD/pes/relu_pe.hpp +++ /dev/null @@ -1,75 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "lite/backends/fpga/KD/pe.hpp" -#include "lite/backends/fpga/KD/pe_params.hpp" -namespace paddle { -namespace zynqmp { - -class ReluPE : public PE { - public: - bool init() { - Tensor* output = param_.output; - output->setAligned(true); - output->setDataLocation(Device); - return true; - } - - void apply() { - Tensor* src = param_.input; - - args_.input_data_type = DATA_TYPE_FP16; - args_.output_data_type = DATA_TYPE_FP16; - args_.input_layout_type = LAYOUT_HWC; - args_.output_layout_type = LAYOUT_HWC; - args_.image = {.address = src->data(), - .scale_address = src->scale(), - .channels = (uint32_t)src->shape().channel(), - .width = (uint32_t)src->shape().width(), - .height = (uint32_t)src->shape().height(), - .pad_width = 0u, - .pad_height = 0u}; - args_.output = { - .address = param_.output->data(), - .scale_address = param_.output->scale(), - }; - - inplace_.relu_enable = false; - inplace_.power_enable = false; - inplace_.normalize_enable = false; - } - - bool dispatch() { - inplace_.relu_enable = true; - config_inplace(inplace_); - param_.input->syncToDevice(); - param_.output->copyFrom(param_.input); - param_.output->invalidate(); - inplace_.relu_enable = false; - config_inplace(inplace_); - return true; - } - - InputParam& param() { return param_; } - - private: - InputParam param_; - BypassArgs args_; - InplaceArgs inplace_; -}; - -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/pes/resize.hpp b/lite/backends/fpga/KD/pes/resize.hpp deleted file mode 100644 index f83896d2c7..0000000000 --- a/lite/backends/fpga/KD/pes/resize.hpp +++ /dev/null @@ -1,89 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "lite/backends/fpga/KD/pe.hpp" -#include "lite/backends/fpga/KD/pe_params.hpp" - -namespace paddle { -namespace zynqmp { -class ResizePE : public PE { - public: - bool init() { - Tensor* output = param_.output; - output->setAligned(true); - output->setDataLocation(Device); - return true; - } - - void apply() { - Tensor* input = param_.input; - Tensor* output = param_.output; - ResizeArgs& args = args_; - - int input_width = input->shape().width(); - int input_height = input->shape().height(); - int input_channel = input->shape().channel(); - - int output_width = output->shape().width(); - int output_height = output->shape().height(); - - args.input_width = input_width; - args.input_height = input_height; - args.image_channel = input_channel; - args.output_width = output_width; - args.output_height = output_height; - float height_ratio = static_cast(input_height) / - static_cast(args.output_height); - float width_ratio = - static_cast(input_width) / static_cast(args.output_width); - args.height_ratio = *reinterpret_cast(&height_ratio); - args.width_ratio = *reinterpret_cast(&width_ratio); - - args.input_image_address = input->mutableData(); - args.output_image_address = output->mutableData(); - args.output_scale_address = reinterpret_cast(output->scale()); - } - - void compute_scale(Tensor* src, float* scale) { - float16* data = src->data(); - src->invalidate(); - float max = 0; - for (int i = 0; i < src->shape().numel(); i++) { - float value = half_to_float(data[i]); - if (value < 0) { - value = -value; - } - if (value > max) { - max = value; - } - } - scale[0] = max / 127.0; - scale[1] = 127.0 / max; - } - - bool dispatch() { - bool ret = compute_fpga_resize(args_) == 0; - return true; - } - - ResizeParam& param() { return param_; } - - private: - ResizeParam param_; - ResizeArgs args_; -}; -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/pes/scale_pe.hpp b/lite/backends/fpga/KD/pes/scale_pe.hpp deleted file mode 100755 index d5e16615d9..0000000000 --- a/lite/backends/fpga/KD/pes/scale_pe.hpp +++ /dev/null @@ -1,120 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "lite/backends/fpga/KD/pe.hpp" -#include "lite/backends/fpga/KD/pe_params.hpp" - -namespace paddle { -namespace zynqmp { -class ScalePE : public PE { - public: - inline int gcd(int a, int b) { - while (b) { - int temp = a; - a = b; - b = temp % b; - } - return a; - } - - inline int lcm(int a, int b) { return a * b / gcd(a, b); } - bool init() { - Tensor* output = param_.output; - output->setAligned(true); - output->setDataLocation(Device); - return true; - } - - void apply() { - Tensor* input = param_.input; - Tensor* output = param_.output; - Shape& input_shape = input->shape(); - int channel = input_shape.channel(); - int repeat = 1; - int alignment = 16; - int length = channel; - - if (channel % alignment != 0 || channel < alignment) { - int c_lcm = lcm(channel, alignment); - repeat = c_lcm / (channel); - } - Shape shape(N, {channel * repeat}); - param_.alignedBias()->mutableData(FP16, shape); - param_.alignedScale()->mutableData(FP16, shape); - - float16* bias_data = param_.alignedBias()->data(); - float16* scale_data = param_.alignedScale()->data(); - - if (param_.bias != nullptr) { - float* bias_data_float = param_.bias->data(); - for (int i = 0; i < repeat; i++) { - for (int j = 0; j < length; j++) { - float16 value = float_to_half(bias_data_float[j]); - bias_data[i * length + j] = value; - } - } - } else { - float16 zero = float_to_half(0.0f); - for (int i = 0; i < repeat; i++) { - for (int j = 0; j < length; j++) { - bias_data[i * length + j] = zero; - } - } - } - - float* scale_data_float = param_.scale->data(); - for (int i = 0; i < repeat; i++) { - for (int j = 0; j < length; j++) { - float16 value = float_to_half(scale_data_float[j]); - scale_data[i * length + j] = value; - } - } - - param_.alignedScale()->flush(); - param_.alignedBias()->flush(); - - int wc = input_shape.width() * input_shape.channel(); - int wc_aligned = align_image(wc); - - ScaleArgs& args = param_.args; - args.scale_address = param_.alignedScale()->data(); - args.bias_address = param_.alignedBias()->data(); - args.wc_alignment = wc_aligned; - args.channel_alignment = channel * repeat; - - args.image.address = input->data(); - args.image.scale_address = input->scale(); - args.image.channels = channel; - args.image.height = input_shape.height(); - args.image.width = input_shape.width(); - args.image.pad_width = 0; - args.image.pad_height = 0; - args.output.address = output->data(); - args.output.scale_address = output->scale(); - } - - bool dispatch() { - param_.input->syncToDevice(); - return compute_fpga_scale(param_.args) == 0; - } - - ScaleParam& param() { return param_; } - - private: - ScaleParam param_; -}; -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/pes/softmax_pe.cpp b/lite/backends/fpga/KD/pes/softmax_pe.cpp deleted file mode 100755 index 099ed20b8f..0000000000 --- a/lite/backends/fpga/KD/pes/softmax_pe.cpp +++ /dev/null @@ -1,162 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "lite/backends/fpga/KD/pes/softmax_pe.hpp" - -#include - -namespace paddle { -namespace zynqmp { - -#if defined(__ARM_NEON) || defined(__ARM_NEON__) -#ifndef __aarch64__ -static inline float32_t vmaxvq_f32(const float32x4_t &r) { - float32x2_t v = vmax_f32(vget_high_f32(r), vget_low_f32(r)); - return vget_lane_f32(vpmax_f32(v, v), 0); -} - -static inline float32_t vaddvq_f32(const float32x4_t &r) { - float32x2_t v = vadd_f32(vget_high_f32(r), vget_low_f32(r)); - return vget_lane_f32(vpadd_f32(v, v), 0); -} -#endif // __aarch64__ -#endif // __ARM_NEON__ - -static float find_max(const float *input, const int num_classes) { - int remain = num_classes; - float max = -std::numeric_limits::max(); -#if defined(__ARM_NEON) || defined(__ARM_NEON__) - int loop = num_classes >> 3; - remain = num_classes & 0x7; - float32x4_t __max = vdupq_n_f32(max); - for (int i = 0; i < loop; ++i, input += 8) { - float32x4_t x0 = vld1q_f32(input); - float32x4_t x1 = vld1q_f32(input + 4); - __max = vmaxq_f32(x0, __max); - __max = vmaxq_f32(x1, __max); - } - max = vmaxvq_f32(__max); -#endif - for (int i = 0; i < remain; ++i) { - max = std::max(max, input[i]); - } - return max; -} - -static void softmax(Tensor *X, Tensor *Y) { - std::vector dims = X->shape().dims(); - int batch_size = X->shape().num(); - int num_classes = dims[X->shape().dimSize() - 1]; - int channels = X->shape().numel() / batch_size / num_classes; - float *x = X->data(); - float *y = Y->mutableData(); - -#pragma omp parallel for collapse(2) - for (int batch = 0; batch < batch_size; ++batch) { - for (int channel = 0; channel < channels; ++channel) { - size_t offset = (batch * channels + channel) * num_classes; - const float *input = x + offset; - float *output = y + offset; - // find max - float max = find_max(input, num_classes); - - // exp(x - max) - int remain = num_classes; -#if defined(__ARM_NEON) || defined(__ARM_NEON__) - int loop = num_classes >> 3; - remain = num_classes & 0x7; - float32x4_t __max = vdupq_n_f32(max); - for (int i = 0; i < loop; ++i, input += 8, output += 8) { - float32x4_t x0 = vld1q_f32(input); - float32x4_t x1 = vld1q_f32(input + 4); - x0 = vsubq_f32(x0, __max); - x1 = vsubq_f32(x1, __max); - x0 = lite::arm::math::exp_ps(x0); - x1 = lite::arm::math::exp_ps(x1); - vst1q_f32(output, x0); - vst1q_f32(output + 4, x1); - } -#endif // __ARM_NEON__ - for (int i = 0; i < remain; ++i) { - output[i] = expf(input[i] - max); - } - - // sum(exp(x - max)) - float sum = 0.f; - output = y + offset; -#if defined(__ARM_NEON) || defined(__ARM_NEON__) - float32x4_t __sum = vdupq_n_f32(0.f); - for (int i = 0; i < loop; ++i, output += 8) { - float32x4_t x0 = vld1q_f32(output); - float32x4_t x1 = vld1q_f32(output + 4); - __sum = vaddq_f32(x0, __sum); - __sum = vaddq_f32(x1, __sum); - } - sum += vaddvq_f32(__sum); -#endif // __ARM_NEON__ - for (int i = 0; i < remain; ++i) { - sum += output[i]; - } - - // exp(x - max) / sum - float inv_sum = 1.f / sum; - output = y + offset; -#if defined(__ARM_NEON) || defined(__ARM_NEON__) - float32x4_t __inv_sum = vdupq_n_f32(inv_sum); - for (int i = 0; i < loop; ++i, output += 8) { - float32x4_t x0 = vld1q_f32(output); - float32x4_t x1 = vld1q_f32(output + 4); - x0 = vmulq_f32(x0, __inv_sum); - x1 = vmulq_f32(x1, __inv_sum); - vst1q_f32(output, x0); - vst1q_f32(output + 4, x1); - } -#endif - for (int i = 0; i < remain; ++i) { - output[i] *= inv_sum; - } - } - } -} - -bool SoftmaxPE::init() { - Tensor *output = param_.output; - output->setAligned(false); - output->setDataLocation(CPU); - return 
true; -} - -bool SoftmaxPE::dispatch() { - Tensor *input = param_.input; - Tensor *output = param_.output; - input->syncToCPU(); - - Tensor float_input; - Tensor float_output; - float_input.mutableData(DataType::FP32, input->shape()); - float_input.copyFrom(input); - - float *out_data = - float_output.mutableData(DataType::FP32, input->shape()); - - softmax(&float_input, &float_output); - float_output.flush(); - - output->copyFrom(&float_output); - return true; -} - -SoftmaxParam &SoftmaxPE::param() { return param_; } -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/pes/softmax_pe.hpp b/lite/backends/fpga/KD/pes/softmax_pe.hpp deleted file mode 100644 index 5733f873a4..0000000000 --- a/lite/backends/fpga/KD/pes/softmax_pe.hpp +++ /dev/null @@ -1,44 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include - -#if defined(__ARM_NEON) || defined(__ARM_NEON__) -#include -#include "lite/backends/arm/math/funcs.h" -#endif - -#include "lite/backends/fpga/KD/pe.hpp" -#include "lite/backends/fpga/KD/pe_params.hpp" - -namespace paddle { -namespace zynqmp { - -class SoftmaxPE : public PE { - public: - bool init(); - bool dispatch(); - - SoftmaxParam& param(); - - private: - SoftmaxParam param_; -}; - -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/pes/split_pe.hpp b/lite/backends/fpga/KD/pes/split_pe.hpp deleted file mode 100644 index 26598a4c87..0000000000 --- a/lite/backends/fpga/KD/pes/split_pe.hpp +++ /dev/null @@ -1,124 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include - -#include "lite/backends/fpga/KD/pe.hpp" -#include "lite/backends/fpga/KD/pe_params.hpp" -namespace paddle { -namespace zynqmp { - -class SplitPE : public PE { - public: - bool init() { - std::vector outputs = param_.outputs; - for (size_t i = 0; i < outputs.size(); i++) { - Tensor* out = outputs[i]; - out->setAligned(false); - out->setDataLocation(CPU); - } - return true; - } - - std::vector stride_numel(std::vector ddim) { - std::vector strides(ddim.size()); - strides[ddim.size() - 1] = ddim[ddim.size() - 1]; - for (int i = ddim.size() - 2; i >= 0; --i) { - strides[i] = strides[i + 1] * ddim[i]; - } - return strides; - } - - template - inline void StridedNumelCopyWithAxis(int64_t axis, - T* dst, - const std::vector& dst_stride_numel, - T* src, - const std::vector& src_stride_numel, - int64_t size) { - int64_t before = dst_stride_numel[0] / dst_stride_numel[axis]; - int64_t src_after = src_stride_numel[axis]; - int64_t dst_after = dst_stride_numel[axis]; - - for (int64_t i = 0; i < axis; ++i) { - if (i < axis) { - } else if (i == axis) { - continue; - } else { - } - } - - for (int64_t i = 0; i < before; ++i) { - memory::Copy(dst + i * dst_after, src + i * src_after, sizeof(T) * size); - } - } - - void split3D() { int axis = param_.axis; } - - bool dispatch() { - Tensor* input = param_.input; - input->syncToCPU(); - if (input->shape().dimSize() <= 3) { - auto in_stride = stride_numel(input->shape().dims()); - int64_t axis = param_.axis; - size_t input_offset = 0; - float16* in_data = input->data(); - - for (auto& out : param_.outputs) { - float16* out_data = out->mutableData(); - auto out_stride = stride_numel(out->shape().dims()); - - StridedNumelCopyWithAxis(axis, - out_data, - out_stride, - in_data + input_offset, - in_stride, - out_stride[axis]); - input_offset += out_stride[axis]; - } - return true; - } - - std::vector outputs = param_.outputs; - - int in_channel = input->shape().channel(); - int split_channel = input->shape().channel() / param_.num; - int hw = input->shape().height() * input->shape().width(); - - float16* in_data = input->data(); - for (int i = 0; i < hw; i++) { - for (int n = 0; n < outputs.size(); n++) { - Tensor* out = outputs[n]; - float16* out_data = out->data(); - memcpy(out_data + i * split_channel, - in_data + i * in_channel + n * split_channel, - split_channel * sizeof(float16)); - } - } - for (int n = 0; n < outputs.size(); n++) { - Tensor* out = outputs[n]; - out->copyScaleFrom(input); - } - return true; - } - - SplitParam& param() { return param_; } - - private: - SplitParam param_; -}; -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/shape.hpp b/lite/backends/fpga/KD/shape.hpp deleted file mode 100755 index 566ad8e6ff..0000000000 --- a/lite/backends/fpga/KD/shape.hpp +++ /dev/null @@ -1,116 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/
-
-#pragma once
-
-#include <vector>
-#include
-
-#include "lite/backends/fpga/KD/alignment.h"
-#include "lite/backends/fpga/KD/layout.hpp"
-
-namespace paddle {
-namespace zynqmp {
-
-static struct NCHW nchw_;
-static struct NHWC nhwc_;
-static struct NC nc_;
-static struct NHW nhw_;
-static struct N n_;
-
-class Shape {
- public:
-  explicit Shape(std::vector<int> dims) { dims_ = dims; }
-
-  Shape(LayoutType type, std::vector<int> dims) {
-    dims_ = dims;
-    setLayoutType(type);
-  }
-
-  Shape(const Shape& src) {
-    dims_ = src.dims_;
-    setLayoutType(src.layoutType_);
-  }
-
-  bool shouldAlign() {
-    return layout_->alignedElementCount(dims_) != layout_->elementCount(dims_);
-  }
-
-  int num() {
-    int index = layout_->numIndex();
-    return index == -1 ? 1 : dims_[index];
-  }
-
-  int channel() {
-    int index = layout_->channelIndex();
-    return index == -1 ? 1 : dims_[index];
-  }
-
-  int height() {
-    int index = layout_->heightIndex();
-    return index == -1 ? 1 : dims_[index];
-  }
-
-  int width() {
-    int index = layout_->widthIndex();
-    return index == -1 ? 1 : dims_[index];
-  }
-
-  int dimSize() { return dims_.size(); }
-
-  std::vector<int> dims() { return dims_; }
-
-  size_t memorySize(int cellSize) {
-    return layout_->alignedElementCount(dims_) * cellSize;
-  }
-
-  int numel() { return layout_->elementCount(dims_); }
-
-  int alignedElementCount() { return layout_->alignedElementCount(dims_); }
-
-  void setLayoutType(LayoutType layout) {
-    this->layoutType_ = layout;
-    switch (layout) {
-      case NCHW:
-        layout_ = &nchw_;
-        break;
-      case NHWC:
-        layout_ = &nhwc_;
-        break;
-      case NC:
-        layout_ = &nc_;
-        break;
-      case NHW:
-        layout_ = &nhw_;
-        break;
-      case N:
-        layout_ = &n_;
-        break;
-      default:
-        break;
-    }
-  }
-
-  void print() {}
-
-  int operator[](int index) { return dims_[index]; }
-
- private:
-  LayoutType layoutType_;
-  Layout* layout_ = &nhwc_;
-  std::vector<int> dims_;
-};
-
-} // namespace zynqmp
-} // namespace paddle
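Illustrative usage sketch (not part of the deleted sources): how the zynqmp::Shape above resolves per-layout axes through the Layout indirection, and how the aligned element count can exceed the logical one. Names follow the deleted headers; the exact NHWC padding rule lives in layout.hpp, and this only compiles inside the pre-deletion Paddle-Lite tree.

    #include "lite/backends/fpga/KD/shape.hpp"

    int main() {
      // NHWC shape for one 3-channel 224x224 image.
      paddle::zynqmp::Shape s(paddle::zynqmp::NHWC, {1, 224, 224, 3});
      int n = s.num();       // 1, via Layout::numIndex()
      int c = s.channel();   // 3, NHWC keeps channel last
      int logical = s.numel();               // 1 * 224 * 224 * 3
      int padded = s.alignedElementCount();  // >= numel() when W*C is padded
      return (padded >= logical && n == 1 && c == 3) ? 0 : 1;
    }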
diff --git a/lite/backends/fpga/KD/tensor.hpp b/lite/backends/fpga/KD/tensor.hpp
deleted file mode 100644
index f003ded33e..0000000000
--- a/lite/backends/fpga/KD/tensor.hpp
+++ /dev/null
@@ -1,456 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <algorithm>
-#include <cmath>
-#include <cstdint>
-#include <cstring>
-#include <fstream>
-#include <iostream>
-#include <memory>
-#include <string>
-#include <vector>
-
-// #include "lite/core/tensor.h"
-
-#include "lite/backends/fpga/KD/dl_engine.hpp"
-#include "lite/backends/fpga/KD/float16.hpp"
-#include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
-#include "lite/backends/fpga/KD/shape.hpp"
-// #include "lite/backends/fpga/KD/types.hpp"
-
-namespace paddle {
-namespace zynqmp {
-
-enum DataType : int {
-  FP32 = 0,
-  FP16 = 1,
-  INT8 = 2,
-  INT32 = 3,
-};
-
-enum DataSyncStatus : int {
-  Synched = 0,
-  Device = 1,
-  CPU = 2,
-};
-
-typedef uint16_t float16;
-
-inline int CellSize(DataType type) {
-  switch (type) {
-    case FP32:
-      return sizeof(float);
-    case FP16:
-      return sizeof(float16);
-    case INT32:
-      return sizeof(int32_t);
-    case INT8:
-      return sizeof(int8_t);
-    default:
-      return 0;
-  }
-  return 0;
-}
-
-class PlaceHolder {
- public:
-  PlaceHolder() {}
-  explicit PlaceHolder(size_t size) {
-    size_ = size;
-    data_ = fpga_malloc(size_);
-  }
-
-  void* data() { return data_; }
-  void set_data(const void* ptr) { data_ = const_cast<void*>(ptr); }
-
-  size_t memorySize() { return size_; }
-  void set_size(size_t new_size) { size_ = new_size; }
-
-  ~PlaceHolder() { fpga_free(data_); }
-
-  float scale_[2];
-
- private:
-  void* data_ = nullptr;
-  size_t size_ = 0;
-};
-
-class Tensor {
- public:
-  Tensor() { DLEngine::get_instance(); }
-
-  int id() { return id_; }
-
-  template <typename Dtype>
-  Dtype* data() {
-    if (placeHolder_ == nullptr) {
-      return nullptr;
-    }
-    void* ptr = reinterpret_cast<char*>(this->placeHolder_->data()) +
-                offset * CellSize(dataType_);
-    return reinterpret_cast<Dtype*>(ptr);
-  }
-
-  template <typename Dtype>
-  Dtype* mutableData(DataType dataType, const Shape& shape) {
-    if (this->shape_ != nullptr) {
-      delete shape_;
-    }
-    this->shape_ = new Shape(shape);
-    this->dataType_ = dataType;
-    return mutableData<Dtype>();
-  }
-
-  template <typename Dtype>
-  Dtype* mutableData() {
-    size_t memorySize = shape_->memorySize(CellSize(dataType_));
-    if (placeHolder_ != nullptr) {
-      if (memorySize > placeHolder_->memorySize()) {
-        placeHolder_.reset(new PlaceHolder(memorySize));
-      }
-    } else {
-      placeHolder_.reset(new PlaceHolder(memorySize));
-    }
-    return data<Dtype>();
-  }
-
-  size_t memorySize() {
-    if (placeHolder_ == nullptr) {
-      return 0;
-    }
-    return placeHolder_->memorySize();
-  }
-
-  void setDataType(DataType dataType) { this->dataType_ = dataType; }
-
-  DataType dataType() { return this->dataType_; }
-
-  Shape& shape() { return *shape_; }
-
-  bool aligned() { return this->aligned_; }
-
-  void setAligned(bool aligned) { this->aligned_ = aligned; }
-
-  float* scale() { return placeHolder_->scale_; }
-
-  void alignImage(Tensor* dst = nullptr, bool copy = false) {
-    if (shape_->shouldAlign()) {
-      int cell_size = CellSize(this->dataType_);
-      char* dst_data = nullptr;
-      size_t mem_size = shape_->memorySize(cell_size);
-      if (dst == nullptr) {
-        dst_data = reinterpret_cast<char*>(fpga_malloc(mem_size));
-      } else {
-        dst_data = dst->data<char>();
-      }
-      int wc = shape_->width() * shape_->channel();
-      int wc_aligned = align_image(wc);
-      int remainder = wc_aligned - wc;
-
-      char* src_start = data<char>();
-      char* dst_start = dst_data;
-      for (int n = 0; n < shape_->num(); n++) {
-        for (int h = 0; h < shape_->height(); h++) {
-          memcpy(dst_start, src_start, wc * cell_size);
-          memset(dst_start + wc * cell_size, 0, remainder * cell_size);
-          src_start += wc * cell_size;
-          dst_start += wc_aligned * cell_size;
-        }
-      }
-      if (dst == nullptr) {
-        memcpy(data<char>(), dst_data, mem_size);
-        flush();
-        fpga_free(dst_data);
-      } else {
-        dst->flush();
-      }
-    }
else { - if (copy) { - dst->copyFrom(this); - } else { - // TODO(chonwhite) share data. - } - } - if (dst != nullptr) { - dst->copyScaleFrom(this); - } - } - - inline void copyScaleFrom(Tensor* src) { - placeHolder_->scale_[0] = src->placeHolder_->scale_[0]; - placeHolder_->scale_[1] = src->placeHolder_->scale_[1]; - } - - void unalignImage(Tensor* dst = nullptr, bool copy = false) { - Tensor* target = dst == nullptr ? this : dst; - if (!target->aligned_) { - if (copy && dst != nullptr) { - dst->copyFrom(this); - } - return; - } - target->syncToCPU(); - if (shape_->shouldAlign()) { - int cell_size = CellSize(this->dataType_); - char* dst_data = nullptr; - size_t mem_size = shape_->memorySize(cell_size); - if (dst == nullptr) { - dst_data = reinterpret_cast(fpga_malloc(mem_size)); - } else { - dst_data = dst->data(); - } - int wc = shape_->width() * shape_->channel(); - int wc_aligned = align_image(wc); - - char* src_start = data(); - char* dst_start = dst_data; - for (int n = 0; n < shape_->num(); n++) { - for (int h = 0; h < shape_->height(); h++) { - memcpy(dst_start, src_start, wc * cell_size); - src_start += wc_aligned * cell_size; - dst_start += wc * cell_size; - } - } - if (dst == nullptr) { - memcpy(data(), dst_data, mem_size); - flush(); - fpga_free(dst_data); - } else { - dst->flush(); - } - } else { - if (copy) { - dst->copyFrom(this); - } else { - // TODO(chonwhite) share data. - } - } - } - - void shareDataWith(Tensor* src) { shareDataWith(src, src->shape()); } - - void shareDataWith(Tensor* src, const Shape& shape, int offset = 0) { - if (shape_ != nullptr) { - delete shape_; - } - this->placeHolder_ = src->placeHolder_; - this->dataType_ = src->dataType_; - this->aligned_ = src->aligned_; - this->dateLocation_ = src->dateLocation_; - this->offset = offset; - shape_ = new Shape(const_cast(shape)); - } - - void copyFrom(Tensor* src) { - if (src->dataType_ == dataType_) { - src->syncToCPU(); - memcpy(data(), src->data(), memorySize()); - copyScaleFrom(src); - flush(); - return; - } - BypassArgs args; - args.input_data_type = - src->dataType_ == FP32 ? DATA_TYPE_FP32 : DATA_TYPE_FP16; - args.output_data_type = dataType_ == FP32 ? DATA_TYPE_FP32 : DATA_TYPE_FP16; - args.input_layout_type = LAYOUT_HWC; - args.output_layout_type = LAYOUT_HWC; - args.image = {.address = src->data(), - .scale_address = src->scale(), - .channels = (uint32_t)src->shape().numel(), - .width = 1, - .height = 1, - .pad_width = 0u, - .pad_height = 0u}; - args.output = { - .address = data(), .scale_address = scale(), - }; - src->syncToDevice(); - size_t aligned_remainder = src->shape().numel() % 16; - if (aligned_remainder > 0) { - size_t dtype_size = - src->dataType_ == FP32 ? 
sizeof(float) : sizeof(float16); - void* dst = src->data() + src->shape().numel() * dtype_size; - memset(dst, 0, aligned_remainder * dtype_size); - fpga_flush(dst, aligned_remainder * dtype_size); - } - src->syncToDevice(); - this->invalidate(); - perform_bypass(args); - this->invalidate(); - } - - void flush() { fpga_flush(placeHolder_->data(), placeHolder_->memorySize()); } - - void invalidate() { - fpga_invalidate(placeHolder_->data(), placeHolder_->memorySize()); - } - - void sync() { - switch (synchedStatus_) { - case CPU: - flush(); - break; - case Device: - invalidate(); - break; - default: - break; - } - } - - void syncToCPU() { - if (dateLocation_ == Device) { - invalidate(); - } - } - - void syncToDevice() { - if (dateLocation_ == CPU) { - flush(); - } - } - - DataSyncStatus synchedStatus() { return synchedStatus_; } - - void setSynchedStatus(DataSyncStatus status) { synchedStatus_ = status; } - - void setDataLocation(DataSyncStatus location) { dateLocation_ = location; } - - void print() {} - - void printScale() { - if (placeHolder_ == nullptr) { - return; - } - } - - std::string dimsFileName() { - return std::to_string(shape_->num()) + "_" + - std::to_string(shape_->channel()) + "_" + - std::to_string(shape_->height()) + "_" + - std::to_string(shape_->width()) + ".txt"; - } - - void saveToFile() { std::string path = dimsFileName(); } - - void saveToFile(std::string prefix, bool with_shape) { - std::string path = prefix; - if (with_shape) { - path = path + "_" + dimsFileName(); - } else { - path = path + ".txt"; - } - saveToFile(path); - } - - friend std::ostream& operator<<(std::ostream& os, Tensor& tensor) { - os << "tensor:" - << "\n"; - os << "dims: {"; - for (int i = 0; i < tensor.shape().dimSize(); ++i) { - os << tensor.shape()[i] << " "; - } - os << "}\n"; - for (int i = 0; i < tensor.shape().numel(); i++) { - float value = 0; - if (tensor.dataType() == FP32) { - value = tensor.data()[i]; - } else { - value = half_to_float(tensor.data()[i]); - } - os << value << " "; - } - os << "\n"; - return os; - } - - void saveToFile(std::string path) { - syncToCPU(); - std::ofstream ofs; - static int counter = 0; - std::string npath = std::to_string(counter) + "_" + path; - counter++; - save_file_with_name(npath); - } - - void save_file_with_name(std::string path) { - // return; - invalidate(); - std::ofstream ofs; - - ofs.open(path); - for (int i = 0; i < shape_->numel(); i++) { - float value = 0; - if (dataType_ == FP32) { - value = data()[i]; - } else { - value = half_to_float(data()[i]); - } - ofs << value << std::endl; - } - ofs.close(); - } - - void readFromFile(std::string path) { - std::ifstream file_stream; - file_stream.open(path); - if (!file_stream) { - return; - } - int num = shape_->numel(); - invalidate(); - float max = 0.0f; - float16* data = mutableData(); - for (int i = 0; i < num; ++i) { - float value = 0; - file_stream >> value; - max = std::max(std::abs(value), max); - data[i] = float_to_half(value); - } - flush(); - placeHolder_->scale_[0] = max / 127.0f; - placeHolder_->scale_[1] = 127.0f / max; - } - - ~Tensor() { - if (shape_ != nullptr) { - delete shape_; - shape_ = nullptr; - } - } - - private: - int offset = 0; - std::shared_ptr placeHolder_; - Shape* shape_ = nullptr; - DataType dataType_ = FP32; - bool aligned_ = false; - DataSyncStatus synchedStatus_ = Synched; - DataSyncStatus dateLocation_ = Device; - - static int generateID() { - static int sID = 0; - int id = sID++; - return id; - } - - int id_ = generateID(); -}; - -} // namespace zynqmp -} // 
namespace paddle diff --git a/lite/backends/fpga/KD/tensor_util.cpp b/lite/backends/fpga/KD/tensor_util.cpp deleted file mode 100644 index cbf5df15cd..0000000000 --- a/lite/backends/fpga/KD/tensor_util.cpp +++ /dev/null @@ -1,32 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "lite/backends/fpga/KD/tensor_util.hpp" - -namespace paddle { -namespace zynqmp { -float find_max(const Tensor& tensor) { - float max = 0; - Tensor& t = const_cast(tensor); - float* data = t.data(); - for (int i = 0; i < t.shape().numel(); i++) { - float value = data[i] > 0 ? data[i] : -data[i]; - max = std::max(value, max); - } - return max; -} -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/KD/tensor_util.hpp b/lite/backends/fpga/KD/tensor_util.hpp deleted file mode 100644 index 01f5757039..0000000000 --- a/lite/backends/fpga/KD/tensor_util.hpp +++ /dev/null @@ -1,25 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include - -#include "lite/backends/fpga/KD/tensor.hpp" - -namespace paddle { -namespace zynqmp { -float find_max(const Tensor& tensor); -} // namespace zynqmp -} // namespace paddle diff --git a/lite/backends/fpga/lite_tensor.cc b/lite/backends/fpga/lite_tensor.cc deleted file mode 100644 index 43218173fd..0000000000 --- a/lite/backends/fpga/lite_tensor.cc +++ /dev/null @@ -1,110 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
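The find_max above feeds the two-entry scale that FP16 tensors carry throughout this KD backend (compare Tensor::readFromFile and ResizePE::compute_scale earlier in the patch). A standalone sketch of that convention, assuming only the max/127 rule visible here, for illustration:

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // scale[0] = max/127 (quantize step), scale[1] = 127/max (its inverse),
    // both derived from the absolute maximum of the buffer. Like the deleted
    // code, this does not guard against an all-zero input (max == 0).
    static void compute_scale(const std::vector<float>& data, float scale[2]) {
      float max = 0.0f;
      for (float v : data) max = std::max(max, std::fabs(v));
      scale[0] = max / 127.0f;
      scale[1] = 127.0f / max;
    }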
- -#include "lite/backends/fpga/lite_tensor.h" -#include - -namespace paddle { -namespace lite { - -using value_type = int64_t; - -value_type DDimLite::production() const { - value_type res = 1; - for (size_t i = 0; i < this->size(); i++) { - res *= (*this)[i]; - } - return res; -} - -value_type DDimLite::count(int start, int end) const { - if (start < 0) { - start = 0; - } - if (end > size()) { - end = size(); - } - if (end < start) { - end = start; - } - value_type sum = 1; - for (auto i = start; i < end; ++i) { - sum *= data_[i]; - } - return sum; -} - -DDimLite DDimLite::Slice(int start, int end) const { - std::vector vec; - for (int i = start; i < end; i++) { - vec.push_back((*this)[i]); - } - return DDimLite(vec); -} - -std::string DDimLite::repr() const { - std::stringstream ss; - if (empty()) { - ss << "{}"; - return ss.str(); - } - ss << "{"; - for (size_t i = 0; i < this->size() - 1; i++) { - ss << (*this)[i] << ","; - } - if (!this->empty()) ss << (*this)[size() - 1]; - ss << "}"; - return ss.str(); -} - -void TensorLite::ShareDataWith(const TensorLite &other) { - buffer_ = other.buffer_; - dims_ = other.dims_; - zynq_tensor_ = other.zynq_tensor_; - target_ = other.target_; - lod_ = other.lod_; - memory_size_ = other.memory_size_; - throw - 1; -} - -void *TensorLite::mutable_data(size_t memory_size) { - memory_size_ = memory_size; - buffer_->ResetLazy(target_, memory_size_); - // throw -1; - std::cout << memory_size << std::endl; - return buffer_->data(); -} - -void *TensorLite::mutable_data(TargetType target, size_t memory_size) { - target_ = target; - return mutable_data(memory_size); -} - -void TensorLite::CopyDataFrom(const TensorLite &other) { - dims_ = other.dims_; - target_ = other.target_; - lod_ = other.lod_; - // memory_size_ = other.memory_size_; - // buffer_->CopyDataFrom(*other.buffer_, memory_size_); - zynq_tensor_->mutableData(other.zynq_tensor_->dataType(), - other.zynq_tensor_->shape()); -} - -// template -// void TensorLite::mutable_data_internal() { - -// } - -} // namespace lite -} // namespace paddle diff --git a/lite/backends/fpga/lite_tensor.h b/lite/backends/fpga/lite_tensor.h deleted file mode 100644 index 2f9df3abb0..0000000000 --- a/lite/backends/fpga/lite_tensor.h +++ /dev/null @@ -1,251 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
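A worked example of the DDimLite arithmetic implemented above (the class declaration follows in lite_tensor.h). Illustrative only; it assumes the surrounding pre-deletion Paddle-Lite tree for the include.

    #include "lite/backends/fpga/lite_tensor.h"

    int main() {
      paddle::lite::DDimLite d(std::vector<int64_t>({2, 3, 4, 5}));
      bool ok = d.production() == 120 &&              // 2*3*4*5
                d.count(1, 3) == 12 &&                // 3*4; start/end clamped
                d.Slice(1, 3).production() == 12 &&   // {3, 4}
                d.Flatten2D(2).production() == 120;   // {6, 20}
      return ok ? 0 : 1;
    }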
-
-#pragma once
-
-#include <algorithm>
-#include <functional>  // for multiplies
-#include <memory>
-#include <numeric>
-#include <string>
-#include <vector>
-
-#include "lite/backends/fpga/KD/tensor.hpp"
-#include "lite/core/memory.h"
-
-namespace paddle {
-namespace lite {
-
-class DDimLite;
-class TensorLite;
-
-using DDim = lite::DDimLite;
-using Tensor = lite::TensorLite;
-
-class DDimLite {
- public:
-  using value_type = int64_t;
-
-  DDimLite() = default;
-
-  explicit DDimLite(const std::vector<value_type> &x) { ConstructFrom(x); }
-
-  void ConstructFrom(const std::vector<value_type> &x) { data_ = x; }
-
-  value_type operator[](int offset) const { return data_[offset]; }
-  value_type &operator[](int offset) { return data_[offset]; }
-  std::vector<value_type> Vectorize() const { return data_; }
-
-  size_t size() const { return data_.size(); }
-  bool empty() const { return data_.empty(); }
-
-  value_type production() const;
-
-  const std::vector<value_type> &data() const { return data_; }
-  value_type count(int start, int end) const;
-
-  DDimLite Slice(int start, int end) const;
-
-  DDimLite Flatten2D(int col) const {
-    return DDimLite(std::vector<value_type>(
-        {Slice(0, col).production(), Slice(col, size()).production()}));
-  }
-
-  std::string repr() const;
-
-  friend std::ostream &operator<<(std::ostream &os, const DDimLite &dims) {
-    os << dims.repr();
-    return os;
-  }
-
-  friend bool operator==(const DDimLite &a, const DDimLite &b) {
-    if (a.size() != b.size()) return false;
-    for (size_t i = 0; i < a.size(); i++) {
-      if (a[i] != b[i]) return false;
-    }
-    return true;
-  }
-
-  friend bool operator!=(const DDimLite &a, const DDimLite &b) {
-    return !(a == b);
-  }
-
- private:
-  std::vector<value_type> data_;
-};
-
-using LoD = std::vector<std::vector<uint64_t>>;
-
-// A light-weight tensor implementation.
-class TensorLite {
- public:
-  TensorLite() : buffer_(std::make_shared<Buffer>()) {}
-
-  template <typename DType, typename DimT, TargetType Target>
-  void Assign(DType *data, const DimT &dim) {
-    Resize(dim);
-    auto *dst = mutable_data<DType>(Target);
-    CopySync<Target>(
-        dst, data, dim.production() * sizeof(DType), IoDirection::HtoD);
-  }
-
-  // T is the data type and R is the return type
-  // For OpenCL, the return type can be cl::Buffer
-  // and the data type can be float/int8_t.
-  // For other devices, T and R may be the same type.
-  template <typename T, typename R = T>
-  const R *data() const {
-    return zynq_tensor_->data<R>();
-  }
-
-  void Resize(const DDimLite &ddim) { dims_ = ddim; }
-  void Resize(const std::vector<int64_t> &x) { dims_ = DDimLite(x); }
-
-  const DDimLite &dims() const { return dims_; }
-  int64_t numel() const { return dims_.production(); }
-
-  const LoD &lod() const { return lod_; }
-  LoD *mutable_lod() { return &lod_; }
-
-  void set_lod(const LoD &lod) { lod_ = lod; }
-
-  PrecisionType precision() const { return precision_; }
-  void set_precision(PrecisionType precision) { precision_ = precision; }
-
-  bool persistable() const { return persistable_; }
-  void set_persistable(bool persistable) { persistable_ = persistable; }
-  // T is the data type and R is the return type
-  // For OpenCL, the return type can be cl::Buffer
-  // and the data type can be float/int8_t.
-  // For other devices, T and R may be the same type.
-  template <typename T, typename R = T>
-  R *mutable_data();
-
-  // T is the data type and R is the return type
-  // For OpenCL, the return type can be cl::Buffer
-  // and the data type can be float/int8_t.
-  // For other devices, T and R may be the same type.
-
-  template <typename T, typename R = T>
-  R *mutable_data(TargetType target);
-  void *mutable_data(size_t memory_size);
-  void *mutable_data(TargetType target, size_t memory_size);
-
-  const void *raw_data() const { return buffer_->data(); }
-
-  size_t data_size() const { return this->dims().production(); }
-
-  size_t memory_size() const { return zynq_tensor_->memorySize(); }
-
-  bool IsInitialized() const { return buffer_->data(); }
-
-  // Other share data to this.
-  void ShareDataWith(const TensorLite &other);
-
-  void CopyDataFrom(const TensorLite &other);
-
-  template <typename T>
-  TensorLite Slice(int64_t begin, int64_t end) const;
-
-  TargetType target() const { return target_; }
-
-  zynqmp::Tensor *ZynqTensor() const { return zynq_tensor_; }
-
-  friend std::ostream &operator<<(std::ostream &os, const TensorLite &tensor) {
-    os << "Tensor:" << '\n';
-    os << "dim: " << tensor.dims() << '\n';
-    for (int i = 0; i < tensor.dims().production(); i++) {
-      os << tensor.template data<float>()[i] << " ";
-    }
-    os << "\n";
-    return os;
-  }
-
- private:
-  TargetType target_{TargetType::kHost};
-  DDimLite dims_;
-  std::shared_ptr<Buffer> buffer_;
-  LoD lod_;
-  size_t memory_size_{};
-
-  size_t offset_{0};
-
-  PrecisionType precision_{PrecisionType::kUnk};
-  bool persistable_{false};
-
-  zynqmp::Tensor *zynq_tensor_ = new zynqmp::Tensor();
-
-  template <typename T>
-  void mutable_data_internal();
-};
-
-template <typename T, typename R>
-R *TensorLite::mutable_data() {
-  std::vector<int> v;
-  for (int i = 0; i < dims_.size(); i++) {
-    v.push_back(dims_[i]);
-  }
-  zynqmp::LayoutType layout_type = zynqmp::NCHW;
-  switch (v.size()) {
-    case 1:
-      layout_type = zynqmp::N;
-      break;
-    case 2:
-      layout_type = zynqmp::NC;
-      break;
-    case 3:
-      layout_type = zynqmp::NHW;
-      break;
-    case 4:
-      layout_type = zynqmp::NCHW;
-      break;
-  }
-  zynqmp::Shape input_shape(layout_type, v);
-
-  zynqmp::DataType data_type = zynqmp::FP32;
-  if (typeid(T) == typeid(float)) {
-    data_type = zynqmp::FP32;
-  }
-  if (typeid(T) == typeid(zynqmp::float16)) {
-    data_type = zynqmp::FP16;
-  }
-  return zynq_tensor_->mutableData<R>(data_type, input_shape);
-}
-
-template <typename T, typename R>
-R *TensorLite::mutable_data(TargetType target) {
-  target_ = target;
-  return mutable_data<T>();
-}
-
-template <typename TensorT>
-bool TensorCompareWith(const TensorT &a, const TensorT &b) {
-  if (a.dims() != b.dims()) return false;
-  if (memcmp(a.raw_data(), b.raw_data(), a.data_size()) != 0) return false;
-  return true;
-}
-template <typename T>
-TensorLite TensorLite::Slice(int64_t begin, int64_t end) const {
-  int64_t base = numel() / dims_[0];
-
-  TensorLite dst;
-  dst.buffer_ = buffer_;
-  dst.target_ = target_;
-  auto dst_dims = dims_;
-  dst_dims[0] = end - begin;
-  dst.Resize(dst_dims);
-  dst.offset_ = offset_ + static_cast<size_t>(begin * base) * sizeof(T);
-  return dst;
-}
-} // namespace lite
-} // namespace paddle
diff --git a/lite/backends/fpga/target_wrapper.cc b/lite/backends/fpga/target_wrapper.cc
deleted file mode 100644
index 653384b061..0000000000
--- a/lite/backends/fpga/target_wrapper.cc
+++ /dev/null
@@ -1,37 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/target_wrapper.h" -#include "lite/backends/fpga/KD/llapi/zynqmp_api.h" -#include "lite/utils/all.h" -#ifdef LITE_WITH_FPGA -namespace paddle { -namespace lite { - -void* TargetWrapper::Malloc(size_t size) { - return zynqmp::fpga_malloc(size); -} - -void TargetWrapper::Free(void* ptr) { zynqmp::fpga_free(ptr); } - -void TargetWrapper::MemcpySync(void* dst, - const void* src, - size_t size, - IoDirection dir) { - memcpy(dst, src, size); -} - -} // namespace lite -} // namespace paddle -#endif diff --git a/lite/backends/host/CMakeLists.txt b/lite/backends/host/CMakeLists.txt deleted file mode 100644 index 8c22d8da75..0000000000 --- a/lite/backends/host/CMakeLists.txt +++ /dev/null @@ -1,3 +0,0 @@ -lite_cc_library(target_wrapper_host SRCS target_wrapper.cc) - - diff --git a/lite/backends/host/target_wrapper.cc b/lite/backends/host/target_wrapper.cc deleted file mode 100644 index 5f020662a9..0000000000 --- a/lite/backends/host/target_wrapper.cc +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/target_wrapper.h" -#include -#include - -namespace paddle { -namespace lite { - -const int MALLOC_ALIGN = 64; - -void* TargetWrapper::Malloc(size_t size) { - size_t offset = sizeof(void*) + MALLOC_ALIGN - 1; - char* p = static_cast(malloc(offset + size)); - if (!p) { - return nullptr; - } - void* r = reinterpret_cast(reinterpret_cast(p + offset) & - (~(MALLOC_ALIGN - 1))); - static_cast(r)[-1] = p; - memset(r, 0, size); - return r; -} -void TargetWrapper::Free(void* ptr) { - if (ptr) { - free(static_cast(ptr)[-1]); - } -} -void TargetWrapper::MemcpySync(void* dst, - const void* src, - size_t size, - IoDirection dir) { - memcpy(dst, src, size); -} - -} // namespace lite -} // namespace paddle diff --git a/lite/backends/npu/CMakeLists.txt b/lite/backends/npu/CMakeLists.txt deleted file mode 100644 index abe567566b..0000000000 --- a/lite/backends/npu/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -if(NOT LITE_WITH_NPU) - return() -endif() - -lite_cc_library(npu_helper SRCS npu_helper.cc DEPS ${npu_ddk_libs}) -add_subdirectory(bridge) diff --git a/lite/backends/npu/bridge/CMakeLists.txt b/lite/backends/npu/bridge/CMakeLists.txt deleted file mode 100644 index cf3ad99055..0000000000 --- a/lite/backends/npu/bridge/CMakeLists.txt +++ /dev/null @@ -1,67 +0,0 @@ - -lite_cc_library(npu_bridge_registry SRCS registry.cc DEPS ${npu_ddk_libs}) -lite_cc_library(npu_bridge_utils SRCS utils.cc DEPS ${npu_ddk_libs} tensor op mir_node scope) - -set(npu_bridge_deps npu_bridge_registry npu_bridge_utils op) - -lite_cc_library(npu_bridge_fc_op SRCS fc_op.cc DEPS ${npu_bridge_deps}) -lite_cc_library(npu_bridge_conv_op SRCS conv_op.cc DEPS ${npu_bridge_deps}) -lite_cc_library(npu_bridge_mul_op SRCS mul_op.cc DEPS ${npu_bridge_deps}) -lite_cc_library(npu_bridge_act_op SRCS act_op.cc 
DEPS ${npu_bridge_deps})
-lite_cc_library(npu_bridge_scale_op SRCS scale_op.cc DEPS ${npu_bridge_deps})
-lite_cc_library(npu_bridge_softmax_op SRCS softmax_op.cc DEPS ${npu_bridge_deps})
-lite_cc_library(npu_bridge_pool_op SRCS pool_op.cc DEPS ${npu_bridge_deps})
-lite_cc_library(npu_bridge_batch_norm_op SRCS batch_norm_op.cc DEPS ${npu_bridge_deps})
-lite_cc_library(npu_bridge_elementwise_op SRCS elementwise_ops.cc DEPS ${npu_bridge_deps})
-lite_cc_library(npu_bridge_reshape_op SRCS reshape_op.cc DEPS ${npu_bridge_deps})
-lite_cc_library(npu_bridge_conv_transpose_op SRCS conv_transpose_op.cc DEPS ${npu_bridge_deps})
-lite_cc_library(npu_bridge_interpolate_op SRCS interpolate_op.cc DEPS ${npu_bridge_deps})
-lite_cc_library(npu_bridge_transpose_op SRCS transpose_op.cc DEPS ${npu_bridge_deps})
-lite_cc_library(npu_bridge_split_op SRCS split_op.cc DEPS ${npu_bridge_deps})
-lite_cc_library(npu_bridge_concat_op SRCS concat_op.cc DEPS ${npu_bridge_deps})
-lite_cc_library(npu_bridge_shuffle_channel_op SRCS shuffle_channel_op.cc DEPS ${npu_bridge_deps})
-lite_cc_library(npu_bridge_pad2d_op SRCS pad2d_op.cc DEPS ${npu_bridge_deps})
-
-set(npu_bridges
-  npu_bridge_registry
-  npu_bridge_utils
-  npu_bridge_fc_op
-  npu_bridge_conv_op
-  npu_bridge_mul_op
-  npu_bridge_act_op
-  npu_bridge_scale_op
-  npu_bridge_softmax_op
-  npu_bridge_pool_op
-  npu_bridge_batch_norm_op
-  npu_bridge_elementwise_op
-  npu_bridge_reshape_op
-  npu_bridge_conv_transpose_op
-  npu_bridge_interpolate_op
-  npu_bridge_transpose_op
-  npu_bridge_split_op
-  npu_bridge_concat_op
-  npu_bridge_shuffle_channel_op
-  npu_bridge_pad2d_op
-  CACHE INTERNAL "npu_bridges")
-
-lite_cc_library(npu_test_helper SRCS test_helper.cc DEPS npu_helper ${npu_ddk_libs} ${npu_bridges} ${npu_kernels} ${ops})
-
-lite_cc_test(test_npu_bridge_fc_op SRCS fc_op_test.cc DEPS npu_test_helper)
-lite_cc_test(test_npu_bridge_conv_op SRCS conv_op_test.cc DEPS npu_test_helper)
-lite_cc_test(test_npu_bridge_mul_op SRCS mul_op_test.cc DEPS npu_test_helper)
-lite_cc_test(test_npu_bridge_act_op SRCS act_op_test.cc DEPS npu_test_helper)
-lite_cc_test(test_npu_bridge_scale_op SRCS scale_op_test.cc DEPS npu_test_helper)
-lite_cc_test(test_npu_bridge_softmax_op SRCS softmax_op_test.cc DEPS npu_test_helper)
-lite_cc_test(test_npu_bridge_pool_op SRCS pool_op_test.cc DEPS npu_test_helper)
-lite_cc_test(test_npu_bridge_batch_norm_op SRCS batch_norm_op_test.cc DEPS npu_test_helper)
-lite_cc_test(test_npu_bridge_elementwise_op SRCS elementwise_ops_test.cc DEPS npu_test_helper)
-lite_cc_test(test_npu_bridge_reshape_op SRCS reshape_op_test.cc DEPS npu_test_helper)
-lite_cc_test(test_npu_bridge_conv_transpose_op SRCS conv_transpose_op_test.cc DEPS npu_test_helper)
-lite_cc_test(test_npu_bridge_interpolate_op SRCS interpolate_op_test.cc DEPS npu_test_helper)
-lite_cc_test(test_npu_bridge_transpose_op SRCS transpose_op_test.cc DEPS npu_test_helper)
-lite_cc_test(test_npu_bridge_split_op SRCS split_op_test.cc DEPS npu_test_helper)
-lite_cc_test(test_npu_bridge_concat_op SRCS concat_op_test.cc DEPS npu_test_helper)
-lite_cc_test(test_npu_bridge_shuffle_channel_op SRCS shuffle_channel_op_test.cc DEPS npu_test_helper)
-lite_cc_test(test_npu_bridge_pad2d_op SRCS pad2d_op_test.cc DEPS npu_test_helper)
-
-message(STATUS "+++++ npu_bridges: ${npu_bridges}")
diff --git a/lite/backends/npu/bridge/act_op.cc b/lite/backends/npu/bridge/act_op.cc
deleted file mode 100644
index 9573f7d7e9..0000000000
--- a/lite/backends/npu/bridge/act_op.cc
+++ /dev/null
@@ -1,88 +0,0 @@
-// Copyright (c) 2019
PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/utils.h" -#include "lite/operators/relu_op.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -node_map_type ActConverter(const std::shared_ptr act_op, - const node_map_type& inputs_map) { - auto scope = act_op->scope(); - auto op_info = act_op->op_info(); - auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); - LOG(INFO) << "Converting " + op_type + "..."; - - // create act node and set input node from inputs_map - auto x_var_name = op_info->Input("X").front(); - auto act_node = std::make_shared(unique_op_type); - CHECK(inputs_map.count(x_var_name)); - act_node->set_input_x(*inputs_map.at(x_var_name)); - OpList::Global().add(inputs_map.at(x_var_name)); - OpList::Global().add(act_node); - - // parse and set activation type - int act_mode = 1; - if (op_type == "sigmod") { - act_mode = 0; - } else if (op_type == "relu") { - act_mode = 1; - } else if (op_type == "tanh") { - act_mode = 2; - } else if (op_type == "elu") { - act_mode = 4; - } else if (op_type == "abs") { - act_mode = 6; - } else if (op_type == "softsign") { - act_mode = 8; - } else if (op_type == "softplus") { - act_mode = 9; - } else if (op_type == "hardsigmoid") { - act_mode = 10; - } else { - // TODO(hong19860320) add more activation mode, and set the coef value - // clipped ReLU, LEAKY_RELU, relu1, threshold, selu and linear - LOG(FATAL) << "Unsupported activation type " << op_type; - } - act_node->set_attr_mode(act_mode); - - node_map_type outputs_map; - outputs_map[op_info->Output("Out").front()] = act_node; - return outputs_map; -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -REGISTER_NPU_BRIDGE(sigmod, paddle::lite::npu::bridge::ActConverter); -REGISTER_NPU_BRIDGE(relu, paddle::lite::npu::bridge::ActConverter); -REGISTER_NPU_BRIDGE(tanh, paddle::lite::npu::bridge::ActConverter); -REGISTER_NPU_BRIDGE(elu, paddle::lite::npu::bridge::ActConverter); -REGISTER_NPU_BRIDGE(abs, paddle::lite::npu::bridge::ActConverter); -REGISTER_NPU_BRIDGE(softsign, paddle::lite::npu::bridge::ActConverter); -REGISTER_NPU_BRIDGE(softplus, paddle::lite::npu::bridge::ActConverter); -REGISTER_NPU_BRIDGE(hardsigmoid, paddle::lite::npu::bridge::ActConverter); diff --git a/lite/backends/npu/bridge/act_op_test.cc b/lite/backends/npu/bridge/act_op_test.cc deleted file mode 100644 index edbfbb416f..0000000000 --- a/lite/backends/npu/bridge/act_op_test.cc +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
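One note on `ActConverter` above: both the dispatch chain and the registration spell the sigmoid op type as `sigmod`, which looks like a typo for Paddle's `sigmoid` op type, so that branch would never match a real sigmoid op. A compact, table-driven equivalent of the mode dispatch, reusing the mode codes that the converter feeds into `ge::op::Activation` (hypothetical helper, not part of the patch):

```cpp
#include <map>
#include <string>

// Sketch: table-driven replacement for the if/else chain in ActConverter.
// Mode codes follow the values used in act_op.cc above.
int ActTypeToNpuMode(const std::string& op_type) {
  static const std::map<std::string, int> mode_table{
      {"sigmoid", 0},  // spelled "sigmod" in the original, likely a typo
      {"relu", 1},
      {"tanh", 2},
      {"elu", 4},
      {"abs", 6},
      {"softsign", 8},
      {"softplus", 9},
      {"hardsigmoid", 10},
  };
  auto it = mode_table.find(op_type);
  return it == mode_table.end() ? -1 : it->second;  // -1: unsupported type
}
```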
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/test_helper.h" -#include "lite/core/op_registry.h" -#include "lite/operators/relu_op.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -void relu_ref(const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); - auto out = - scope->FindVar(op_info->Output("Out").front())->GetMutable(); - auto x_data = x->data(); - auto out_data = out->mutable_data(); - DDim x_dims = x->dims(); - DDim out_dims = out->dims(); - CHECK_EQ(x_dims.production(), out_dims.production()); - for (int i = 0; i < out_dims.production(); i++) { - out_data[i] = std::max(0.f, x_data[i]); - } -} - -void test_relu(int bs, int ic, int ih, int iw) { - // prepare input&output variables - Scope scope; - std::string x_var_name("x"); - std::string out_var_name("out"); - std::string out_ref_var_name("out_ref"); - auto* x = scope.Var(x_var_name)->GetMutable(); - auto* out = scope.Var(out_var_name)->GetMutable(); - auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); - x->Resize({bs, ic, ih, iw}); - - // initialize input&output data - FillTensor(x); - - // initialize op desc - cpp::OpDesc opdesc; - opdesc.SetType("relu"); - opdesc.SetInput("X", {x_var_name}); - opdesc.SetOutput("Out", {out_var_name}); - - // create and convert op to NPU model, then run it on NPU - auto op = CreateOp(opdesc, &scope); - LauchOp(op, {x_var_name}, {out_var_name}); - out_ref->CopyDataFrom(*out); - - // execute reference implementation and save to output tensor - relu_ref(op); - - // compare results - auto* out_data = out->mutable_data(); - auto* out_ref_data = out_ref->mutable_data(); - for (int i = 0; i < out->dims().production(); i++) { - VLOG(5) << i; - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); - } -} - -TEST(NPUBridges, relu) { - for (auto bs : {1, 3}) { - for (auto ic : {3, 4}) { - for (auto ih : {2, 5}) { - for (auto iw : {5, 9}) { - VLOG(3) << "bs: " << bs << " ic: " << ic << " ih: " << ih - << " iw: " << iw; - test_relu(bs, ic, ih, iw); - } - } - } - } -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -USE_LITE_OP(relu); -USE_NPU_BRIDGE(relu); diff --git a/lite/backends/npu/bridge/batch_norm_op.cc b/lite/backends/npu/bridge/batch_norm_op.cc deleted file mode 100644 index 76b4ac3d9b..0000000000 --- a/lite/backends/npu/bridge/batch_norm_op.cc +++ /dev/null @@ -1,96 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
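The relu test above fixes the comparison pattern that every bridge test in this patch repeats: run the converted graph with `LauchOp`, snapshot the NPU output, recompute with a CPU reference, then compare elementwise (tolerances vary per op, e.g. 1e-5 for relu but 1e-2 for batch_norm). A sketch of that shared check, factored out (hypothetical helper, not in the patch):

```cpp
#include <gtest/gtest.h>
#include <cstdint>

// Elementwise comparison used implicitly by every *_op_test.cc in this patch.
void CheckTensorNear(const float* actual,
                     const float* expected,
                     int64_t numel,
                     float abs_tol) {
  for (int64_t i = 0; i < numel; ++i) {
    EXPECT_NEAR(actual[i], expected[i], abs_tol) << "mismatch at index " << i;
  }
}
```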
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/operators/batch_norm_op.h" -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/utils.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -node_map_type BatchNormConverter( - const std::shared_ptr batch_norm_op, - const node_map_type& inputs_map) { - auto scope = batch_norm_op->scope(); - auto op_info = batch_norm_op->op_info(); - auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); - LOG(INFO) << "Converting " + op_type + "..."; - - std::shared_ptr batch_norm_node = - std::make_shared(unique_op_type); - auto x_var_name = op_info->Input("X").front(); - - auto scale_var_name = op_info->Input("Scale").front(); - lite::Tensor* scale = scope->FindVar(scale_var_name)->GetMutable(); - auto npu_scale = std::make_shared(scale_var_name); - npu_scale->set_attr_value(CvtFromLiteTensor(scale)); - OpList::Global().add(npu_scale); - - auto bias_var_name = op_info->Input("Bias").front(); - lite::Tensor* bias = scope->FindVar(bias_var_name)->GetMutable(); - auto npu_bias = std::make_shared(bias_var_name); - npu_bias->set_attr_value(CvtFromLiteTensor(bias)); - OpList::Global().add(npu_bias); - - auto mean_var_name = op_info->Input("Mean").front(); - lite::Tensor* mean = scope->FindVar(mean_var_name)->GetMutable(); - auto npu_mean = std::make_shared(mean_var_name); - npu_mean->set_attr_value(CvtFromLiteTensor(mean)); - OpList::Global().add(npu_mean); - - auto variance_var_name = op_info->Input("Variance").front(); - lite::Tensor* variance = - scope->FindVar(variance_var_name)->GetMutable(); - auto npu_variance = std::make_shared(variance_var_name); - npu_variance->set_attr_value(CvtFromLiteTensor(variance)); - OpList::Global().add(npu_variance); - - float npu_momentum = op_info->GetAttr("momentum"); - float npu_epsilon = op_info->GetAttr("epsilon"); - int npu_mode = 1; // bnScale, bnBias tensor dims are 1xCx1x1 - bool npu_use_global_stats = op_info->GetAttr("use_global_stats"); - - batch_norm_node->set_input_x(*inputs_map.at(x_var_name)); - batch_norm_node->set_input_scale(*npu_scale); - batch_norm_node->set_input_b(*npu_bias); - batch_norm_node->set_input_mean(*npu_mean); - batch_norm_node->set_input_variance(*npu_variance); - batch_norm_node->set_attr_momentum(npu_momentum); - batch_norm_node->set_attr_epsilon(npu_epsilon); - batch_norm_node->set_attr_mode(npu_mode); - batch_norm_node->set_attr_use_global_stats(npu_use_global_stats); - - OpList::Global().add(inputs_map.at(x_var_name)); - OpList::Global().add(batch_norm_node); - - node_map_type outputs_map; - outputs_map[op_info->Output("Y").front()] = batch_norm_node; - return outputs_map; -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -REGISTER_NPU_BRIDGE(batch_norm, 
paddle::lite::npu::bridge::BatchNormConverter); diff --git a/lite/backends/npu/bridge/batch_norm_op_test.cc b/lite/backends/npu/bridge/batch_norm_op_test.cc deleted file mode 100644 index ec5898f6c8..0000000000 --- a/lite/backends/npu/bridge/batch_norm_op_test.cc +++ /dev/null @@ -1,166 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/operators/batch_norm_op.h" -#include -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -template -void batch_norm_ref(const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); - auto y = scope->FindVar(op_info->Output("Y").front())->GetMutable(); - auto bias = - scope->FindVar(op_info->Input("Bias").front())->GetMutable(); - auto scale = - scope->FindVar(op_info->Input("Scale").front())->GetMutable(); - auto mean = - scope->FindVar(op_info->Input("Mean").front())->GetMutable(); - auto variance = - scope->FindVar(op_info->Input("Variance").front())->GetMutable(); - - auto x_data = x->data(); - auto y_data = y->mutable_data(); - auto scale_data = scale->mutable_data(); - auto bias_data = bias->mutable_data(); - auto mean_data = mean->mutable_data(); - auto variance_data = variance->mutable_data(); - DDim x_dims = x->dims(); - - float epsilon = op_info->GetAttr("epsilon"); - float momentum = op_info->GetAttr("momentum"); - auto data_layout = op_info->GetAttr("data_layout"); - - bool global_stats = op_info->GetAttr("use_global_stats"); - if (global_stats) { - int64_t outer_size = 0; - int64_t channel_size = 0; - int64_t inner_size = 0; - if (data_layout == "NCHW") { - outer_size = x_dims[0]; - channel_size = x_dims[1]; - inner_size = x_dims.Slice(2, x_dims.size()).production(); - } else { - LOG(FATAL) << "Unknown storage order: " << data_layout; - } - auto x_ptr = x_data; - auto y_ptr = y_data; - for (int o = 0; o < outer_size; o++) { - for (int c = 0; c < channel_size; c++) { - for (int i = 0; i < inner_size; i++) { - dtype norm_x = - (*x_ptr - mean_data[c]) / std::sqrt(variance_data[c] + epsilon); - *y_ptr = norm_x * scale_data[c] + bias_data[c]; - x_ptr++; - y_ptr++; - } - } - } - } -} - -void test_batch_norm( - int bs, int ic, int ih, int iw, float epsilon, float momentum) { - // prepare input&output variables - Scope scope; - std::string x_var_name = "x"; - std::string out_var_name = "out"; - std::string out_ref_var_name = "out_ref"; - std::string scale_var_name = "scale"; - std::string bias_var_name = "bias"; - std::string mean_var_name = "mean"; - std::string variance_var_name = "variance"; - auto* x = scope.Var(x_var_name)->GetMutable(); - auto* scale = scope.Var(scale_var_name)->GetMutable(); - auto* bias = scope.Var(bias_var_name)->GetMutable(); - auto* mean = 
scope.Var(mean_var_name)->GetMutable(); - auto* variance = scope.Var(variance_var_name)->GetMutable(); - auto* out = scope.Var(out_var_name)->GetMutable(); - auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); - x->Resize({bs, ic, ih, iw}); - scale->Resize({ic}); - bias->Resize({ic}); - mean->Resize({ic}); - variance->Resize({ic}); - - // initialize input&output data - FillTensor(x); - FillTensor(scale); - FillTensor(bias); - FillTensor(mean); - // variance > 0 - FillTensor(variance, 1.f, 5.f); - - // initialize op desc - cpp::OpDesc opdesc; - opdesc.SetType("batch_norm"); - opdesc.SetInput("X", {x_var_name}); - opdesc.SetInput("Scale", {scale_var_name}); - opdesc.SetInput("Bias", {bias_var_name}); - opdesc.SetInput("Mean", {mean_var_name}); - opdesc.SetInput("Variance", {variance_var_name}); - opdesc.SetOutput("Y", {out_var_name}); - opdesc.SetAttr("is_test", 1); - opdesc.SetAttr("use_global_stats", true); - opdesc.SetAttr("epsilon", epsilon); - opdesc.SetAttr("momentum", momentum); - opdesc.SetAttr("data_layout", std::string("NCHW")); - - // create and convert op to NPU model, then run it on NPU - auto op = CreateOp(opdesc, &scope); - LauchOp(op, {x_var_name}, {out_var_name}); - out_ref->CopyDataFrom(*out); - - // execute reference implementation and save to output tensor - batch_norm_ref(op); - - // compare results - auto* out_data = out->mutable_data(); - auto* out_ref_data = out_ref->mutable_data(); - for (int i = 0; i < out->dims().production(); i++) { - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); - } -} - -TEST(NPUBridges, batch_norm) { - for (auto bs : {1, 4, 7}) { - for (auto ic : {1, 4, 7}) { - for (auto ih : {1, 4, 7}) { - for (auto iw : {1, 4, 7}) { - for (auto epsilon : {1e-4f, 1e-5f}) { - for (auto momentum : {0.9f, 0.99f}) { - test_batch_norm(bs, ic, ih, iw, epsilon, momentum); - } - } - } - } - } - } -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -USE_LITE_OP(batch_norm); -USE_NPU_BRIDGE(batch_norm); diff --git a/lite/backends/npu/bridge/concat_op.cc b/lite/backends/npu/bridge/concat_op.cc deleted file mode 100644 index 8548225181..0000000000 --- a/lite/backends/npu/bridge/concat_op.cc +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
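`batch_norm_ref` above evaluates `y = (x - mean) / sqrt(variance + epsilon) * scale + bias` per channel. At inference time those four per-channel tensors can be folded into a single affine transform, which is the standard optimization this formulation enables; a minimal sketch of that folding (my addition, not in the patch):

```cpp
#include <cmath>
#include <vector>

// Fold batch-norm stats into per-channel alpha/beta so that inference
// reduces to y = alpha[c] * x + beta[c].
void FoldBatchNorm(const std::vector<float>& scale,
                   const std::vector<float>& bias,
                   const std::vector<float>& mean,
                   const std::vector<float>& variance,
                   float epsilon,
                   std::vector<float>* alpha,
                   std::vector<float>* beta) {
  size_t channels = scale.size();
  alpha->resize(channels);
  beta->resize(channels);
  for (size_t c = 0; c < channels; ++c) {
    (*alpha)[c] = scale[c] / std::sqrt(variance[c] + epsilon);
    (*beta)[c] = bias[c] - (*alpha)[c] * mean[c];
  }
}
```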
- -#include "lite/operators/concat_op.h" -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/utils.h" -#include "lite/backends/npu/npu_helper.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -node_map_type ConcatConverter(const std::shared_ptr concat_op, - const node_map_type& inputs_map) { - lite::Scope* scope = concat_op->scope(); - const lite::OpInfo* op_info = concat_op->op_info(); - auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); - LOG(INFO) << "converting " << op_type << " ... "; - - auto x_var_names = op_info->Input("X"); - auto axis = op_info->GetAttr("axis"); - int num = x_var_names.size(); - int index = 0; - - std::shared_ptr output_node = - std::make_shared(unique_op_type); - output_node->set_attr_axis(axis); - output_node->set_attr_N(num); - output_node->create_dynamic_input_x(num); - for (auto x_var_name : x_var_names) { - if (inputs_map.find(x_var_name) != inputs_map.end()) { - output_node->set_dynamic_input_x(index + 1, *inputs_map.at(x_var_name)); - OpList::Global().add(inputs_map.at(x_var_name)); - } else { - auto consty = std::make_shared(x_var_name); - auto* x = scope->FindVar(x_var_name)->GetMutable(); - consty->set_attr_value(CvtFromLiteTensor(x)); - output_node->set_dynamic_input_x(index + 1, *consty); - OpList::Global().add(consty); - } - index++; - } - OpList::Global().add(output_node); - - node_map_type outputs_map; - outputs_map[op_info->Output("Out").front()] = output_node; - return outputs_map; -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -REGISTER_NPU_BRIDGE(concat, paddle::lite::npu::bridge::ConcatConverter); diff --git a/lite/backends/npu/bridge/concat_op_test.cc b/lite/backends/npu/bridge/concat_op_test.cc deleted file mode 100644 index f1bf3101b2..0000000000 --- a/lite/backends/npu/bridge/concat_op_test.cc +++ /dev/null @@ -1,128 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/operators/concat_op.h" -#include -#include -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -std::vector stride_numel(const DDim& ddim) { - std::vector strides(ddim.size()); - strides[ddim.size() - 1] = ddim[ddim.size() - 1]; - for (int i = ddim.size() - 2; i >= 0; --i) { - strides[i] = strides[i + 1] * ddim[i]; - } - return strides; -} - -void concat_ref(const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - auto x = op_info->Input("X"); - std::vector inputs; - for (auto var : x) { - inputs.push_back(scope->FindVar(var)->GetMutable()); - } - auto out = - scope->FindVar(op_info->Output("Out").front())->GetMutable(); - int axis = op_info->GetAttr("axis"); - std::vector inputs_concat(inputs.size()); - for (int j = 0; j < inputs.size(); ++j) { - inputs_concat[j] = inputs[j]; - } - size_t num = inputs.size(); - int rows = 1; - auto dim_0 = inputs[0]->dims(); - for (int i = 0; i < axis; ++i) { - rows *= dim_0[i]; - } - int out_rows = rows, out_cols = 0; - std::vector inputs_cols(inputs.size()); - for (int i = 0; i < num; ++i) { - int t_cols = inputs[i]->numel() / rows; - out_cols += t_cols; - inputs_cols[i] = t_cols; - } - for (int k = 0; k < out_rows; ++k) { - float* dst_ptr = out->mutable_data() + k * out_cols; - int col_idx = 0; - for (int j = 0; j < num; ++j) { - int col_len = inputs_cols[j]; - const float* src_prt = inputs[j]->data() + k * col_len; - std::memcpy(dst_ptr + col_idx, src_prt, sizeof(float) * col_len); - col_idx += col_len; - } - } -} - -void test_concat(std::vector> input, int axis) { - std::string x_var_name = "x"; - std::string y_var_name = "y"; - std::string out_var_name = "out"; - std::string out_ref_var_name = "out_ref"; - - // prepare input&output variables - Scope scope; - auto* x = scope.Var(x_var_name)->GetMutable(); - auto* y = scope.Var(y_var_name)->GetMutable(); - x->Resize(DDim(input[0])); - y->Resize(DDim(input[1])); - auto* out = scope.Var(out_var_name)->GetMutable(); - auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); - CHECK_EQ(out->dims(), out_ref->dims()); - - // initialize input&output data - FillTensor(x); - FillTensor(y); - - // initialize op desc - cpp::OpDesc opdesc; - opdesc.SetType("concat"); - opdesc.SetInput("X", {x_var_name, y_var_name}); - opdesc.SetOutput("Out", {out_var_name}); - opdesc.SetAttr("axis", axis); - - auto op = CreateOp(opdesc, &scope); - LauchOp(op, {x_var_name, y_var_name}, {out_var_name}); - out_ref->CopyDataFrom(*out); - concat_ref(op); - auto* out_data = out->mutable_data(); - auto* out_ref_data = out_ref->mutable_data(); - for (int i = 0; i < out->dims().production(); i++) { - VLOG(5) << i; - EXPECT_NEAR(out_data[i], out_ref_data[i], 5e-4); - } -} - -TEST(NPUBridges, concat) { - test_concat({{3, 3, 5, 2}, {2, 3, 5, 2}}, 0); - test_concat({{3, 5, 5, 2}, {3, 1, 5, 2}}, 1); - test_concat({{3, 3, 2, 2}, {3, 3, 4, 2}}, 2); - test_concat({{3, 3, 5, 2}, {3, 3, 5, 6}}, 3); -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -USE_LITE_OP(concat); -USE_NPU_BRIDGE(concat); diff --git a/lite/backends/npu/bridge/conv_op.cc b/lite/backends/npu/bridge/conv_op.cc deleted file mode 100644 index 1be3d17cb6..0000000000 --- a/lite/backends/npu/bridge/conv_op.cc +++ /dev/null @@ -1,216 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/operators/conv_op.h" -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/utils.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -node_map_type ConvConverter(const std::shared_ptr conv_op, - const node_map_type& inputs_map) { - auto scope = conv_op->scope(); - auto op_info = conv_op->op_info(); - auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); - LOG(INFO) << "Converting " << op_type << "... "; - - // get input, filter and op attributes - auto input_var_name = op_info->Input("Input").front(); - auto input = scope->FindVar(input_var_name)->GetMutable(); - auto input_dims = input->dims(); - auto output_var_name = op_info->Output("Output").front(); - auto output = scope->FindVar(output_var_name)->GetMutable(); - auto output_dims = output->dims(); - auto filter_var_name = op_info->Input("Filter").front(); - auto filter = scope->FindVar(filter_var_name)->GetMutable(); - auto filter_dims = filter->dims(); - auto bs = input_dims[0]; - auto ic = input_dims[1]; - auto oc = filter_dims[0]; - CHECK_EQ(input_dims.size(), 4); - CHECK_EQ(output_dims.size(), 4); - CHECK_EQ(filter_dims.size(), 4); - CHECK_EQ(output_dims[0], bs); - CHECK_EQ(output_dims[1], oc); - auto strides = op_info->GetAttr>("strides"); - auto paddings = op_info->GetAttr>("paddings"); - auto groups = op_info->GetAttr("groups"); - auto dilations = op_info->GetAttr>("dilations"); - auto fuse_relu = op_info->GetAttr("fuse_relu"); - CHECK_EQ(strides.size(), 2); - CHECK_EQ(paddings.size(), 2); - CHECK_EQ(dilations.size(), 2); - - // check depthwise mode, and decide whether use ConvolutionDepthwise Op - bool use_depthwise_conv = - false; // whether use ge::op::ConvolutionDepthwise ? 
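```cpp
// Editorial note, not part of the original source: the check that follows
// treats a conv as depthwise when ic == groups == oc, but routes it to
// ge::op::ConvolutionDepthwise only when HiAI's plain Convolution cannot
// take it, i.e. when dilation != 1 or groups is in {2, 3, 4}.
```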
- bool is_depthwise_mode = ic == groups && oc == groups; - if (is_depthwise_mode && - !((groups == 1 || groups >= 5) && dilations[0] == 1 && - dilations[1] == 1)) { - use_depthwise_conv = true; - LOG(WARNING) << "For depthwise mode, dilation = 1 and groups >= 5 (or " - "groups = 1) is only supported in " - "Convolution Op, so force to use ConvolutionDepthwise Op, " - "but may lead poor performance."; - } - - // check input - CHECK(inputs_map.count(input_var_name)); - OpList::Global().add(inputs_map.at(input_var_name)); - - // create filter node - CHECK(!inputs_map.count(filter_var_name)); - auto filter_const_node = std::make_shared(filter_var_name); - filter_const_node->set_attr_value(CvtFromLiteTensor(filter)); - OpList::Global().add(filter_const_node); - - // create bias node if has bias - // supports the bias nodes with the following dimensions - // 0: {oc} - // 1: {1, oc, oh, ow} - // 2: {n, oc, oh, ow} - std::shared_ptr bias_node = nullptr; - bool is_channel_bias = false; - if (HasInputArg(op_info, scope, "Bias")) { - auto bias_var_name = op_info->Input("Bias").front(); - auto* bias = scope->FindVar(bias_var_name)->GetMutable(); - auto bias_dims = bias->dims(); - auto bias_data_size = bias_dims.production(); - auto output_data_size = output_dims.production(); - std::vector bias_shape; - if (bias_data_size == oc) { - // 0: {oc} - bias_shape = {1, oc, 1, 1}; - is_channel_bias = true; - } else if (bias_data_size == output_data_size / bs) { - // 1: {1, oc, oh, ow} - bias_shape = {1, output_dims[1], output_dims[2], output_dims[3]}; - } else if (bias_data_size == output_data_size) { - // 2: {n, oc, oh, ow} - bias_shape = output_dims.Vectorize(); - } else { - LOG(ERROR) << "bias dimension " << bias_dims - << " isn't supported in conv2d Op when output dimension is " - << output_dims; - } - if (inputs_map.count(bias_var_name)) { - // bias node from input map - bias_node = inputs_map.at(bias_var_name); - } else { - // bias node with const data - auto bias_const_node = std::make_shared(bias_var_name); - bias_const_node->set_attr_value(CvtFromLiteTensor(bias, bias_shape)); - bias_node = bias_const_node; - } - OpList::Global().add(bias_node); - } - - // create conv node and set input, filter, bias nodes and attributes - std::shared_ptr conv_node = nullptr; - if (use_depthwise_conv && is_depthwise_mode) { - auto depthwise_conv_node = - std::make_shared(unique_op_type); - depthwise_conv_node->set_input_x(*inputs_map.at(input_var_name)); - depthwise_conv_node->set_input_filter(*filter_const_node); - depthwise_conv_node->set_attr_mode(1); - depthwise_conv_node->set_attr_algo(0); - depthwise_conv_node->set_attr_format(0); // NCHW - depthwise_conv_node->set_attr_pad_mode(5); // VALID - depthwise_conv_node->set_attr_group(groups); - depthwise_conv_node->set_attr_pad(ge::AttrValue::LIST_INT( - {paddings[0], paddings[0], paddings[1], paddings[1]})); - depthwise_conv_node->set_attr_dilation( - ge::AttrValue::LIST_INT({dilations[0], dilations[1]})); - depthwise_conv_node->set_attr_stride( - ge::AttrValue::LIST_INT({strides[0], strides[1]})); - depthwise_conv_node->set_attr_kernel( - ge::AttrValue::LIST_INT({filter_dims[2], filter_dims[3]})); - OpList::Global().add(depthwise_conv_node); - conv_node = depthwise_conv_node; - // ConvolutionDepthwise Op doesn't support bias, so append Add node to - // support bias - if (bias_node != nullptr) { - auto add_node = std::make_shared(unique_op_type + "/add"); - add_node->set_input_x1(*depthwise_conv_node); - add_node->set_input_x2(*bias_node); - 
OpList::Global().add(add_node); - conv_node = add_node; - } - } else { - auto common_conv_node = - std::make_shared(unique_op_type); - common_conv_node->set_input_x(*inputs_map.at(input_var_name)); - common_conv_node->set_input_w(*filter_const_node); - common_conv_node->set_attr_mode(1); - common_conv_node->set_attr_pad_mode(0); // NOTSET - common_conv_node->set_attr_group(groups); - common_conv_node->set_attr_pad(ge::AttrValue::LIST_INT( - {paddings[0], paddings[0], paddings[1], paddings[1]})); - common_conv_node->set_attr_dilation( - ge::AttrValue::LIST_INT({dilations[0], dilations[1]})); - common_conv_node->set_attr_stride( - ge::AttrValue::LIST_INT({strides[0], strides[1]})); - common_conv_node->set_attr_kernel( - ge::AttrValue::LIST_INT({filter_dims[2], filter_dims[3]})); - OpList::Global().add(common_conv_node); - conv_node = common_conv_node; - // Convolution Op only support bias with dimension {1, oc, 1, 1}, - // so append Add node if dimension is {1, oc, oh, ow} or (n, oc, oh, ow) - if (bias_node != nullptr) { - if (is_channel_bias) { - common_conv_node->set_input_b(*bias_node); - } else { - auto add_node = std::make_shared(unique_op_type + "/add"); - add_node->set_input_x1(*common_conv_node); - add_node->set_input_x2(*bias_node); - OpList::Global().add(add_node); - conv_node = add_node; - } - } - } - CHECK(conv_node); - - node_map_type outputs_map; - if (fuse_relu) { - // append relu node if fuse_relu is true - auto relu_node = - std::make_shared(unique_op_type + "/relu"); - relu_node->set_input_x(*conv_node); - relu_node->set_attr_mode(1); - OpList::Global().add(relu_node); - outputs_map[op_info->Output("Output").front()] = relu_node; - } else { - outputs_map[op_info->Output("Output").front()] = conv_node; - } - return outputs_map; -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -REGISTER_NPU_BRIDGE(conv2d, paddle::lite::npu::bridge::ConvConverter); -REGISTER_NPU_BRIDGE(depthwise_conv2d, paddle::lite::npu::bridge::ConvConverter); diff --git a/lite/backends/npu/bridge/conv_op_test.cc b/lite/backends/npu/bridge/conv_op_test.cc deleted file mode 100644 index 27e1226eaf..0000000000 --- a/lite/backends/npu/bridge/conv_op_test.cc +++ /dev/null @@ -1,280 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
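Both `ConvConverter` above and the `test_conv` harness that follows assume the standard output-size relation for dilated convolution. Stated once as code (reference-only helper, not part of the patch):

```cpp
#include <cassert>

// Sketch: spatial output size of a (possibly dilated) convolution, matching
// the formula used in test_conv below.
inline int ConvOutSize(int in, int pad, int kernel, int stride, int dilation) {
  int dkernel = dilation * (kernel - 1) + 1;  // effective (dilated) kernel
  assert(in + 2 * pad >= dkernel);
  return (in + 2 * pad - dkernel) / stride + 1;
}
```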
- -#include "lite/operators/conv_op.h" -#include -#include -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -void conv_ref(const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - auto input = - scope->FindVar(op_info->Input("Input").front())->GetMutable(); - auto filter = - scope->FindVar(op_info->Input("Filter").front())->GetMutable(); - auto output = - scope->FindVar(op_info->Output("Output").front())->GetMutable(); - std::vector strides = - op_info->GetAttr>("strides"); - std::vector paddings = - op_info->GetAttr>("paddings"); - int32_t groups = op_info->GetAttr("groups"); - std::vector dilations = - op_info->GetAttr>("dilations"); - bool fuse_relu = op_info->GetAttr("fuse_relu"); - auto input_dims = input->dims(); - auto filter_dims = filter->dims(); - auto output_dims = output->dims(); - auto input_data = input->mutable_data(); - auto filter_data = filter->mutable_data(); - auto output_data = output->mutable_data(); - int kernel_w = filter_dims[3]; - int kernel_h = filter_dims[2]; - int stride_w = strides[1]; - int stride_h = strides[0]; - int dila_w = dilations[1]; - int dila_h = dilations[0]; - int pad_w = paddings[1]; - int pad_h = paddings[0]; - int batch_size = input_dims[0]; - int in_ch_size = input_dims[1]; - int in_h = input_dims[2]; - int in_w = input_dims[3]; - int out_ch_size = output_dims[1]; - int out_h = output_dims[2]; - int out_w = output_dims[3]; - int out_c_group = out_ch_size / groups; - int in_c_group = in_ch_size / groups; - Tensor* bias = nullptr; - float* bias_data = nullptr; - bool is_channel_bias = false; - if (op_info->HasInput("Bias")) { - auto bias_var_names = op_info->Input("Bias"); - if (bias_var_names.size() > 0) { - auto bias_var_name = bias_var_names.front(); - bias = scope->FindVar(bias_var_name)->GetMutable(); - auto bias_dims = bias->dims(); - is_channel_bias = bias_dims.production() == out_ch_size; - bias_data = bias->mutable_data(); - } - } - for (int n = 0; n < batch_size; ++n) { - for (int g = 0; g < groups; ++g) { - for (int oc = 0; oc < out_c_group; ++oc) { - for (int oh = 0; oh < out_h; ++oh) { - for (int ow = 0; ow < out_w; ++ow) { - int out_idx = n * groups * out_c_group * out_h * out_w + - g * out_c_group * out_h * out_w + oc * out_h * out_w + - oh * out_w + ow; - float out_value = - bias_data != nullptr - ? (is_channel_bias ? bias_data[g * out_c_group + oc] - : bias_data[out_idx]) - : 0; - // + out_value *= beta; - for (int ic = 0; ic < in_c_group; ++ic) { - for (int kh = 0; kh < kernel_h; ++kh) { - for (int kw = 0; kw < kernel_w; ++kw) { - int iw = ow * stride_w - pad_w + kw * (dila_w); - int ih = oh * stride_h - pad_h + kh * (dila_h); - if (iw < 0 || iw >= in_w) continue; - if (ih < 0 || ih >= in_h) continue; - int in_idx = n * in_ch_size * in_h * in_w + - g * in_c_group * in_h * in_w + ic * in_h * in_w + - ih * in_w + iw; - int filter_idx = - g * out_c_group * in_c_group * kernel_h * kernel_w + - oc * in_c_group * kernel_h * kernel_w + - ic * kernel_h * kernel_w + kh * kernel_w + kw; - out_value += input_data[in_idx] * filter_data[filter_idx]; - } - } - } - if (fuse_relu) { - out_value = out_value > 0 ? 
out_value : 0; - } - output_data[out_idx] = out_value; - } - } - } - } - } -} - -void test_conv(int bs, - int ic, - int oc, - int ih, - int iw, - bool has_bias, - bool is_channel_bias, - bool fuse_relu, - bool depthwise, - int dilation, - int stride, - int padding, - int kernel) { - // prepare input&output variables - Scope scope; - std::string input_var_name("input"); - std::string filter_var_name("filter"); - std::string bias_var_name("bias"); - std::string output_var_name("output"); - std::string output_ref_var_name("output_ref"); - auto* input = scope.Var(input_var_name)->GetMutable(); - auto* filter = scope.Var(filter_var_name)->GetMutable(); - auto* bias = scope.Var(bias_var_name)->GetMutable(); - auto* output = scope.Var(output_var_name)->GetMutable(); - auto* output_ref = scope.Var(output_ref_var_name)->GetMutable(); - - // get group size and input&filter shape - int groups = 1; - if (depthwise) { // depthwise convolution ? - groups = oc = ic; - } - std::vector input_shape = {bs, ic, ih, iw}; - std::vector filter_shape = {oc, ic / groups, kernel, kernel}; - std::vector output_shape({bs, oc}); - for (size_t i = 0; i < 2; i++) { - const int dkernel = dilation * (kernel - 1) + 1; - int output_size = (input_shape[i + 2] + 2 * padding - dkernel) / stride + 1; - output_shape.push_back(output_size); - } - input->Resize(input_shape); - filter->Resize(filter_shape); - - // initialize input&output data - FillTensor(input); - FillTensor(filter); - - // initialize op desc - cpp::OpDesc opdesc; - opdesc.SetType(depthwise ? "depthwise_conv2d" : "conv2d"); - opdesc.SetInput("Input", {input_var_name}); - opdesc.SetInput("Filter", {filter_var_name}); - opdesc.SetOutput("Output", {output_var_name}); - opdesc.SetAttr("dilations", std::vector({dilation, dilation})); - opdesc.SetAttr("strides", std::vector({stride, stride})); - opdesc.SetAttr("paddings", std::vector({padding, padding})); - opdesc.SetAttr("groups", groups); - opdesc.SetAttr("fuse_relu", static_cast(fuse_relu)); - if (has_bias) { - if (is_channel_bias) { - bias->Resize({1, oc, 1, 1}); - } else { - bias->Resize({output_shape}); - } - FillTensor(bias); - opdesc.SetInput("Bias", {bias_var_name}); - } - - // create and convert op to NPU model, then run it on NPU - auto op = CreateOp(opdesc, &scope); - LauchOp(op, {input_var_name}, {output_var_name}); - output_ref->CopyDataFrom(*output); - - // execute reference implementation and save to output tensor('out') - conv_ref(op); - - // compare results - auto* output_data = output->mutable_data(); - auto* output_ref_data = output_ref->mutable_data(); - for (int i = 0; i < output->dims().production(); i++) { - VLOG(5) << i; - EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5); - } -} - -TEST(NPUBridges, conv) { -#if 1 - for (auto bs : {1, 2}) { - for (auto ic : {3, 6}) { - for (auto oc : {6, 9}) { - for (auto ih : {14, 28}) { - for (auto iw : {14, 28}) { - for (auto has_bias : {false, true}) { - for (auto is_channel_bias : {false, true}) { - for (auto fuse_relu : {false, true}) { - for (auto depthwise : {false, true}) { - for (auto dilation : {1, 2}) { - for (auto stride : {1, 2}) { - for (auto kernel : {1, 3, 5}) { - std::vector paddings = {kernel / 2}; - if (kernel / 2 != 0) { - paddings.push_back(0); - } - for (auto padding : paddings) { - VLOG(3) << "bs: " << bs << " ic: " << ic - << " oc: " << oc << " ih: " << ih - << " iw: " << iw - << " has_bias: " << has_bias - << " is_channel_bias: " << is_channel_bias - << " fuse_relu: " << fuse_relu - << " depthwise: " << depthwise - << " dilation: " 
<< dilation - << " stride: " << stride - << " padding: " << padding - << " kernel: " << kernel; - test_conv(bs, - ic, - oc, - ih, - iw, - has_bias, - is_channel_bias, - fuse_relu, - depthwise, - dilation, - stride, - padding, - kernel); - } - } - } - } - } - } - } - } - } - } - } - } - } -#else - test_conv(1, 3, 6, 14, 14, false, false, false, true, 2, 1, 1, 3); - test_conv(1, 3, 6, 14, 14, false, false, false, true, 2, 1, 0, 3); - test_conv(1, 3, 6, 14, 14, false, false, false, true, 2, 1, 2, 5); - test_conv(1, 3, 6, 14, 14, false, false, false, true, 2, 1, 0, 5); -#endif -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -USE_LITE_OP(conv2d); -USE_NPU_BRIDGE(conv2d); - -USE_LITE_OP(depthwise_conv2d); -USE_NPU_BRIDGE(depthwise_conv2d); diff --git a/lite/backends/npu/bridge/conv_transpose_op.cc b/lite/backends/npu/bridge/conv_transpose_op.cc deleted file mode 100644 index e27132c216..0000000000 --- a/lite/backends/npu/bridge/conv_transpose_op.cc +++ /dev/null @@ -1,146 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/operators/conv_transpose_op.h" -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/utils.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -node_map_type ConvTransposeConverter( - const std::shared_ptr conv_transpose_op, - const node_map_type& inputs_map) { - auto scope = conv_transpose_op->scope(); - auto op_info = conv_transpose_op->op_info(); - auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); - LOG(INFO) << "Converting " << op_type << "... 
"; - - // get input, output and op attributes - auto input_var_name = op_info->Input("Input").front(); - auto input = scope->FindVar(input_var_name)->GetMutable(); - auto input_shape = input->dims().Vectorize(); - auto filter_var_name = op_info->Input("Filter").front(); - auto filter = scope->FindVar(filter_var_name)->GetMutable(); - auto filter_shape = filter->dims().Vectorize(); - CHECK_EQ(input_shape.size(), 4); - CHECK_EQ(filter_shape.size(), 4); - auto strides = op_info->GetAttr>("strides"); - auto paddings = op_info->GetAttr>("paddings"); - auto groups = op_info->GetAttr("groups"); - auto dilations = op_info->GetAttr>("dilations"); - auto fuse_relu = op_info->GetAttr("fuse_relu"); - CHECK_EQ(strides.size(), 2); - CHECK_EQ(paddings.size(), 2); - CHECK_EQ(dilations.size(), 2); - - // create deconv node - auto conv_transpose_node = - std::make_shared(unique_op_type); - - // create input sizes node to describe the dimensions of input tensor - std::vector output_shape; - output_shape.push_back(input_shape[0]); - output_shape.push_back(filter_shape[1] * groups); - for (int i = 0; i < strides.size(); i++) { - int kernel_ext = dilations[i] * (filter_shape[i + 2] - 1) + 1; - int output_size = - (input_shape[i + 2] - 1) * strides[i] + kernel_ext - 2 * paddings[i]; - output_shape.push_back(output_size); - } - auto input_sizes_const_node = - std::make_shared(unique_op_type + "/input_size"); - input_sizes_const_node->set_attr_value(CreateTensorAndFillData(output_shape)); - conv_transpose_node->set_input_input_sizes(*input_sizes_const_node); - OpList::Global().add(input_sizes_const_node); - - // create filter node - CHECK(!inputs_map.count(filter_var_name)); - auto filter_const_node = std::make_shared(filter_var_name); - filter_const_node->set_attr_value(CvtFromLiteTensor(filter)); - conv_transpose_node->set_input_filter(*filter_const_node); - OpList::Global().add(filter_const_node); - - // set input node - CHECK(inputs_map.count(input_var_name)); - conv_transpose_node->set_input_x(*inputs_map.at(input_var_name)); - OpList::Global().add(inputs_map.at(input_var_name)); - - // set attributes - conv_transpose_node->set_attr_mode(1); - conv_transpose_node->set_attr_format(0); // NCHW - conv_transpose_node->set_attr_pad_mode(0); // NOTSET - conv_transpose_node->set_attr_group(groups); - conv_transpose_node->set_attr_pad(ge::AttrValue::LIST_INT( - {paddings[0], paddings[0], paddings[1], paddings[1]})); - conv_transpose_node->set_attr_dilation( - ge::AttrValue::LIST_INT({dilations[0], dilations[1]})); - conv_transpose_node->set_attr_stride( - ge::AttrValue::LIST_INT({strides[0], strides[1]})); - conv_transpose_node->set_attr_kernel( - ge::AttrValue::LIST_INT({filter_shape[2], filter_shape[3]})); - OpList::Global().add(conv_transpose_node); - - // append add node to add bias if has bias - std::shared_ptr output_node = conv_transpose_node; - if (HasInputArg(op_info, scope, "Bias")) { - // create bias node - auto bias_var_name = op_info->Input("Bias").front(); - CHECK(!inputs_map.count(bias_var_name)); - auto* bias = scope->FindVar(bias_var_name)->GetMutable(); - auto channel_size = bias->dims().production(); - CHECK_EQ(channel_size, filter_shape[1] * groups); - auto bias_const_node = std::make_shared(bias_var_name); - bias_const_node->set_attr_value( - CvtFromLiteTensor(bias, {1, channel_size, 1, 1})); - OpList::Global().add(bias_const_node); - // append add node to add bias node - auto add_node = std::make_shared(unique_op_type + "/add"); - add_node->set_input_x1(*conv_transpose_node); - 
add_node->set_input_x2(*bias_const_node); - OpList::Global().add(add_node); - output_node = add_node; - } - - node_map_type outputs_map; - if (fuse_relu) { - // append relu node if fuse_relu is true - auto relu_node = - std::make_shared(unique_op_type + "/relu"); - relu_node->set_input_x(*output_node); - relu_node->set_attr_mode(1); - OpList::Global().add(relu_node); - outputs_map[op_info->Output("Output").front()] = relu_node; - } else { - outputs_map[op_info->Output("Output").front()] = output_node; - } - return outputs_map; -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -REGISTER_NPU_BRIDGE(conv2d_transpose, - paddle::lite::npu::bridge::ConvTransposeConverter); diff --git a/lite/backends/npu/bridge/conv_transpose_op_test.cc b/lite/backends/npu/bridge/conv_transpose_op_test.cc deleted file mode 100644 index 02e3c7a1ce..0000000000 --- a/lite/backends/npu/bridge/conv_transpose_op_test.cc +++ /dev/null @@ -1,369 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/operators/conv_transpose_op.h" -#include -#include -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -template -void add_bias_with_relu(DType* data, - const DType* bias, - int channel_size, - int inner_size, - bool has_relu) { - for (int c = 0; c < channel_size; ++c) { - DType bias_val = bias != nullptr ? bias[c] : 0; - for (int i = 0; i < inner_size; i++) { - DType data_val = data[i]; - data_val += bias_val; - if (has_relu) { - data_val = data_val > 0 ? 
data_val : 0.f; - } - data[i] = data_val; - } - data += inner_size; - } -} - -template -void col2im(const DType* data_col, - const int channel_size, - const int height, - const int width, - const int kernel_h, - const int kernel_w, - const int pad_h, - const int pad_w, - const int stride_h, - const int stride_w, - const int dilation_h, - const int dilation_w, - DType* data_im) { - memset(data_im, 0, height * width * channel_size * sizeof(DType)); - const int output_h = - (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; - const int output_w = - (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; - const int inner_size = height * width; - for (int c = channel_size; c--; data_im += inner_size) { - for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) { - for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) { - int input_row = -pad_h + kernel_row * dilation_h; - for (int output_rows = output_h; output_rows; output_rows--) { - if (input_row < 0 || input_row >= height) { - data_col += output_w; - } else { - int input_col = -pad_w + kernel_col * dilation_w; - for (int output_col = output_w; output_col; output_col--) { - if (input_col >= 0 && input_col < width) { - data_im[input_row * width + input_col] += *data_col; - } - data_col++; - input_col += stride_w; - } - } - input_row += stride_h; - } - } - } - } -} - -template -void gemm(int M, - int N, - int K, - const IType* A, - const IType* B, - OType* C, - OType alpha, - OType beta, - bool is_trans_A = false, - bool is_trans_B = false) { - for (int m = 0; m < M; ++m) { - for (int n = 0; n < N; ++n) { - OType sum = static_cast(0); - for (int k = 0; k < K; ++k) { - IType a; - IType b; - if (is_trans_A) { - a = A[k * M + m]; - } else { - a = A[m * K + k]; - } - if (is_trans_B) { - b = B[n * K + k]; - } else { - b = B[k * N + n]; - } - sum += a * b; - } - C[m * N + n] = alpha * sum + beta * C[m * N + n]; - } - } -} - -template -void conv_transpose_ref( - const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - auto input = - scope->FindVar(op_info->Input("Input").front())->GetMutable(); - auto filter = - scope->FindVar(op_info->Input("Filter").front())->GetMutable(); - auto output = - scope->FindVar(op_info->Output("Output").front())->GetMutable(); - std::vector strides = - op_info->GetAttr>("strides"); - std::vector paddings = - op_info->GetAttr>("paddings"); - int32_t groups = op_info->GetAttr("groups"); - std::vector dilations = - op_info->GetAttr>("dilations"); - bool fuse_relu = op_info->GetAttr("fuse_relu"); - Tensor* bias = nullptr; - OType* bias_data = nullptr; - if (op_info->HasInput("Bias")) { - auto bias_var_names = op_info->Input("Bias"); - if (bias_var_names.size() > 0) { - auto bias_var_name = bias_var_names.front(); - bias = scope->FindVar(bias_var_name)->GetMutable(); - bias_data = bias->mutable_data(); - } - } - auto input_dims = input->dims(); - auto filter_dims = filter->dims(); - auto output_dims = output->dims(); - auto input_data = input->mutable_data(); - auto filter_data = filter->mutable_data(); - auto output_data = output->mutable_data(); - int kernel_w = filter_dims[3]; - int kernel_h = filter_dims[2]; - int stride_w = strides[1]; - int stride_h = strides[0]; - int dila_w = dilations[1]; - int dila_h = dilations[0]; - int pad_w = paddings[1]; - int pad_h = paddings[0]; - int batch_size = input_dims[0]; - int in_ch_size = input_dims[1]; - int in_h = input_dims[2]; - int in_w = input_dims[3]; - int out_ch_size = 
output_dims[1]; - int out_h = output_dims[2]; - int out_w = output_dims[3]; - - int M = out_ch_size * kernel_w * kernel_h / groups; - int N = in_h * in_w; - int K = in_ch_size / groups; - - if (in_ch_size != out_ch_size || groups != in_ch_size) { - CHECK_EQ(in_ch_size % groups, 0); - CHECK_EQ(out_ch_size % groups, 0); - } - - auto workspace = std::vector(groups * M * N); - int group_input_size = in_w * in_h * in_ch_size / groups; - int group_output_size = out_w * out_h * out_ch_size / groups; - int group_col_size = M * N; - int group_filter_size = - in_ch_size * out_ch_size * kernel_w * kernel_h / (groups * groups); - bool flag_1x1s1p1 = (kernel_w == 1) && (kernel_h == 1) && (stride_h == 1) && - (stride_w == 1) && (pad_w == 1) && (pad_h == 1) && - (dila_w == 1) && (dila_h == 1); - for (int n = 0; n < batch_size; ++n) { - input_data += n * in_ch_size * in_h * in_w; - output_data += n * out_ch_size * out_h * out_w; - auto col_data = workspace.data(); - if (flag_1x1s1p1) { - col_data = output_data; - } - memset(col_data, 0, sizeof(OType) * group_col_size); - for (int g = 0; g < groups; ++g) { - auto input_group_data = input_data + g * group_input_size; - auto filter_group_data = filter_data + g * group_filter_size; - auto col_group_data = col_data + g * group_col_size; - gemm(M, - N, - K, - filter_group_data, - input_group_data, - col_group_data, - static_cast(1), - static_cast(0), - true, - false); - } - if (!flag_1x1s1p1) { - col2im(col_data, - out_ch_size, - out_h, - out_w, - kernel_h, - kernel_w, - pad_h, - pad_w, - stride_h, - stride_w, - dila_h, - dila_w, - output_data); - } - add_bias_with_relu( - output_data, bias_data, out_ch_size, out_w * out_h, fuse_relu); - } -} - -void test_conv_transpose(int bs, - int ic, - int ih, - int iw, - bool has_bias, - bool fuse_relu, - int filters, - int groups, - int dilation, - int stride, - int padding, - int kernel) { - // prepare input&output variables - Scope scope; - std::string input_var_name("input"); - std::string filter_var_name("filter"); - std::string bias_var_name("bias"); - std::string output_var_name("output"); - std::string output_ref_var_name("output_ref"); - auto* input = scope.Var(input_var_name)->GetMutable(); - auto* filter = scope.Var(filter_var_name)->GetMutable(); - auto* bias = scope.Var(bias_var_name)->GetMutable(); - auto* output = scope.Var(output_var_name)->GetMutable(); - auto* output_ref = scope.Var(output_ref_var_name)->GetMutable(); - - // get group size and input&filter shape - std::vector input_shape = {bs, ic, ih, iw}; - std::vector filter_shape = {ic, filters, kernel, kernel}; - input->Resize(input_shape); - filter->Resize(filter_shape); - - // initialize input&output data - FillTensor(input); - FillTensor(filter); - - // initialize op desc - cpp::OpDesc opdesc; - opdesc.SetType("conv2d_transpose"); - opdesc.SetInput("Input", {input_var_name}); - opdesc.SetInput("Filter", {filter_var_name}); - opdesc.SetOutput("Output", {output_var_name}); - opdesc.SetAttr("dilations", std::vector({dilation, dilation})); - opdesc.SetAttr("strides", std::vector({stride, stride})); - opdesc.SetAttr("paddings", std::vector({padding, padding})); - opdesc.SetAttr("groups", groups); - opdesc.SetAttr("fuse_relu", static_cast(fuse_relu)); - if (has_bias) { - bias->Resize({1, filters * groups, 1, 1}); - FillTensor(bias); - opdesc.SetInput("Bias", {bias_var_name}); - } - - // create and convert op to NPU model, then run it on NPU - auto op = CreateOp(opdesc, &scope); - LauchOp(op, {input_var_name}, {output_var_name}); - 
output_ref->CopyDataFrom(*output); - - // execute reference implementation and save to output tensor('out') - conv_transpose_ref(op); - - // compare results - auto* output_data = output->mutable_data(); - auto* output_ref_data = output_ref->mutable_data(); - for (int i = 0; i < output->dims().production(); i++) { - VLOG(5) << i; - EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5); - } -} - -TEST(NPUBridges, conv_transpose) { -#if 1 - for (auto bs : {1, 2}) { - for (auto ic : {3, 6}) { - for (auto ih : {14, 28}) { - for (auto iw : {14, 28}) { - for (auto has_bias : {false, true}) { - for (auto fuse_relu : {false, true}) { - for (auto filters : {1, 2, 5}) { - for (auto groups : {1 /* , 2, 5*/}) { - for (auto dilation : {1, 2}) { - for (auto stride : {1, 2}) { - for (auto kernel : {1, 3, 5}) { - std::vector paddings = {kernel / 2}; - if (kernel / 2 != 0) { - paddings.push_back(0); - } - for (auto padding : paddings) { - VLOG(3) << "bs: " << bs << " ic: " << ic - << " ih: " << ih << " iw: " << iw - << " has_bias: " << has_bias - << " fuse_relu: " << fuse_relu - << " filters: " << filters - << " groups: " << groups - << " dilation: " << dilation - << " stride: " << stride - << " padding: " << padding - << " kernel: " << kernel; - test_conv_transpose(bs, - ic, - ih, - iw, - has_bias, - fuse_relu, - filters, - groups, - dilation, - stride, - padding, - kernel); - } - } - } - } - } - } - } - } - } - } - } - } -#else - test_conv_transpose(1, 6, 8, 8, false, false, 5, 2, 1, 1, 1, 3); -#endif -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -USE_LITE_OP(conv2d_transpose); -USE_NPU_BRIDGE(conv2d_transpose); diff --git a/lite/backends/npu/bridge/elementwise_ops.cc b/lite/backends/npu/bridge/elementwise_ops.cc deleted file mode 100644 index 5459d819bb..0000000000 --- a/lite/backends/npu/bridge/elementwise_ops.cc +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
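`conv_transpose_ref` above computes the deconvolution as a GEMM (transposed filter times input) followed by `col2im`, the exact adjoint of the usual im2col-plus-GEMM forward convolution. Correspondingly, `ConvTransposeConverter` derives its `input_sizes` const node from the inverse of the convolution size relation, `(in - 1) * stride + dilated_kernel - 2 * pad`. As a reference-only helper (not in the patch):

```cpp
// Sketch: transposed-conv output size matching ConvTransposeConverter above.
inline int DeconvOutSize(int in, int pad, int kernel, int stride, int dilation) {
  int dkernel = dilation * (kernel - 1) + 1;  // effective (dilated) kernel
  return (in - 1) * stride + dkernel - 2 * pad;
}
```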
- -#include "lite/operators/elementwise_ops.h" -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/utils.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -node_map_type ElementwiseConverter( - const std::shared_ptr elementwise_op, - const node_map_type& inputs_map) { - auto scope = elementwise_op->scope(); - auto op_info = elementwise_op->op_info(); - auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); - LOG(INFO) << "converting elementwise..."; - - std::shared_ptr elementwise_node = - std::make_shared(unique_op_type); - - auto x_var_name = op_info->Input("X").front(); - auto y_var_name = op_info->Input("Y").front(); - - CHECK_EQ(op_info->GetAttr("axis"), -1) - << "npu elementwise only support inputs with same size"; - - CHECK(inputs_map.find(x_var_name) != inputs_map.end()); - elementwise_node->set_input_x1(*inputs_map.at(x_var_name)); - OpList::Global().add(inputs_map.at(x_var_name)); - - if (inputs_map.find(y_var_name) != inputs_map.end()) { - elementwise_node->set_input_x2(*inputs_map.at(y_var_name)); - OpList::Global().add(inputs_map.at(y_var_name)); - } else { - auto consty = std::make_shared(y_var_name); - auto* y = scope->FindVar(y_var_name)->GetMutable(); - consty->set_attr_value(CvtFromLiteTensor(y)); - elementwise_node->set_input_x2(*consty); - OpList::Global().add(consty); - } - - OpList::Global().add(elementwise_node); - - // paddlelite has sum only - elementwise_node->set_attr_mode(1); - - node_map_type outputs_map; - outputs_map[op_info->Output("Out").front()] = elementwise_node; - return outputs_map; -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -REGISTER_NPU_BRIDGE(elementwise_add, - paddle::lite::npu::bridge::ElementwiseConverter); diff --git a/lite/backends/npu/bridge/elementwise_ops_test.cc b/lite/backends/npu/bridge/elementwise_ops_test.cc deleted file mode 100644 index ff82daec10..0000000000 --- a/lite/backends/npu/bridge/elementwise_ops_test.cc +++ /dev/null @@ -1,182 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/operators/elementwise_ops.h" -#include -#include -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -template -void elementwise_add_ref(const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); - auto y = scope->FindVar(op_info->Input("Y").front())->GetMutable(); - auto out = - scope->FindVar(op_info->Output("Out").front())->GetMutable(); - - auto x_data = x->data(); - auto y_data = y->data(); - dtype* out_data = out->mutable_data(); - - auto x_dims = x->dims(); - auto y_dims = y->dims(); - int axis = op_info->GetAttr("axis"); - - if (axis < 0) { - axis = x_dims.size() - y_dims.size(); - } - int batch = 1; - int channels = 1; - int num = 1; - for (int i = 0; i < axis; ++i) { - batch *= x_dims[i]; - } - for (int i = 0; i < y_dims.size(); ++i) { - channels *= y_dims[i]; - } - for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) { - num *= x_dims[i]; - } - // do elementwise add/sub/max... - std::string elt_type = "add"; - if (elt_type == "add") { - for (int i = 0; i < batch; ++i) { - for (int j = 0; j < channels; ++j) { - int offset = (i * channels + j) * num; - const dtype* din_ptr = x_data + offset; - const dtype diny_data = y_data[j]; - dtype* dout_ptr = out_data + offset; - for (int k = 0; k < num; ++k) { - *dout_ptr = *din_ptr + diny_data; - dout_ptr++; - din_ptr++; - } - } - } - } else if (elt_type == "sub") { - for (int i = 0; i < batch; ++i) { - for (int j = 0; j < channels; ++j) { - int offset = (i * channels + j) * num; - const dtype* din_ptr = x_data + offset; - const dtype diny_data = y_data[j]; - dtype* dout_ptr = out_data + offset; - for (int k = 0; k < num; ++k) { - *dout_ptr = *din_ptr - diny_data; - dout_ptr++; - din_ptr++; - } - } - } - } else if (elt_type == "mul") { - for (int i = 0; i < batch; ++i) { - for (int j = 0; j < channels; ++j) { - int offset = (i * channels + j) * num; - const dtype* din_ptr = x_data + offset; - const dtype diny_data = y_data[j]; - dtype* dout_ptr = out_data + offset; - for (int k = 0; k < num; ++k) { - *dout_ptr = *din_ptr * diny_data; - dout_ptr++; - din_ptr++; - } - } - } - } else if (elt_type == "max") { - for (int i = 0; i < batch; ++i) { - for (int j = 0; j < channels; ++j) { - int offset = (i * channels + j) * num; - const dtype* din_ptr = x_data + offset; - const dtype diny_data = y_data[j]; - dtype* dout_ptr = out_data + offset; - for (int k = 0; k < num; ++k) { - *dout_ptr = std::max(*din_ptr, diny_data); - dout_ptr++; - din_ptr++; - } - } - } - } else { - LOG(FATAL) << "unsupported Elementwise type: " << elt_type; - } -} - -void test_elementwise_add(int bs, int ic, int ih, int iw, int axis) { - // prepare input&output variables - Scope scope; - std::string x_var_name = "x"; - std::string y_var_name = "y"; - std::string out_var_name = "out"; - std::string out_ref_var_name = "out_ref"; - auto* x = scope.Var(x_var_name)->GetMutable(); - auto* y = scope.Var(y_var_name)->GetMutable(); - auto* out = scope.Var(out_var_name)->GetMutable(); - auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); - x->Resize({bs, ic, ih, iw}); - y->Resize({bs, ic, ih, iw}); - - // initialize input&output data - FillTensor(x); - FillTensor(y); - - // initialize op desc - cpp::OpDesc opdesc; - opdesc.SetType("elementwise_add"); - opdesc.SetInput("X", {x_var_name}); 
- opdesc.SetInput("Y", {y_var_name}); - opdesc.SetOutput("Out", {out_var_name}); - opdesc.SetAttr("axis", axis); - - // create and convert op to NPU model, then run it on NPU - auto op = CreateOp(opdesc, &scope); - LauchOp(op, {x_var_name}, {out_var_name}); - out_ref->CopyDataFrom(*out); - - // execute reference implementation and save to output tensor - elementwise_add_ref(op); - - // compare results - auto* out_data = out->mutable_data(); - auto* out_ref_data = out_ref->mutable_data(); - for (int i = 0; i < out->dims().production(); i++) { - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-1); - } -} - -TEST(NPUBridges, elementwise_add) { - for (auto bs : {1, 4, 7}) { - for (auto ic : {1, 4, 7}) { - for (auto ih : {1, 4, 7}) { - for (auto iw : {1, 4, 7}) { - for (auto axis : {-1}) test_elementwise_add(bs, ic, ih, iw, axis); - } - } - } - } -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -USE_LITE_OP(elementwise_add); -USE_NPU_BRIDGE(elementwise_add); diff --git a/lite/backends/npu/bridge/fc_op.cc b/lite/backends/npu/bridge/fc_op.cc deleted file mode 100644 index 1321498db6..0000000000 --- a/lite/backends/npu/bridge/fc_op.cc +++ /dev/null @@ -1,119 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/operators/fc_op.h" -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/utils.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -node_map_type FCConverter(const std::shared_ptr fc_op, - const node_map_type& inputs_map) { - LOG(INFO) << "Converting fc..."; - lite::Scope* scope = fc_op->scope(); - const lite::OpInfo* op_info = fc_op->op_info(); - auto output_node = std::make_shared(UniqueName("fc")); - - auto x_var_name = op_info->Input("Input").front(); - auto w_var_name = op_info->Input("W").front(); - - int in_num_col_dims = op_info->GetAttr("in_num_col_dims"); - auto* xtensor = scope->FindVar(x_var_name)->GetMutable(); - auto* wtensor = scope->FindVar(w_var_name)->GetMutable(); - auto x_dims = xtensor->dims(); - auto w_dims = wtensor->dims(); - - CHECK_GE(x_dims.size(), 2UL); - CHECK_EQ(w_dims.size(), 2UL); - - int m = x_dims.Slice(0, in_num_col_dims).production(); - int k = x_dims.Slice(in_num_col_dims, x_dims.size()).production(); - int n = w_dims[1]; - - CHECK(inputs_map.count(x_var_name)); - CHECK(!inputs_map.count(w_var_name)); - - LOG(INFO) << "m:" << m << ",n:" << n << ",k:" << k; - LOG(INFO) << "x_var_name:" << x_var_name - << ", is data: " << inputs_map.count(x_var_name); - LOG(INFO) << "w_var_name:" << w_var_name - << ", is data: " << inputs_map.count(w_var_name); - - auto xsrc = inputs_map.at(x_var_name); - auto reshapex = std::make_shared(x_var_name + "_reshape"); - reshapex->set_input_tensor(*xsrc); - reshapex->set_attr_shape({m, k}); - reshapex->set_attr_axis(0); - OpList::Global().add(xsrc); - OpList::Global().add(reshapex); - output_node->set_input_x(*reshapex); - - auto wconst = std::make_shared(w_var_name); - ge::TensorDesc wdesc(ge::Shape({k, n}), ge::FORMAT_NCHW, ge::DT_FLOAT); - auto size = wdesc.GetShape().GetShapeSize(); - CHECK_EQ(size, w_dims.production()); - ge::TensorPtr ptensor = std::make_shared(); - ptensor->SetTensorDesc(wdesc); - auto* pdata = reinterpret_cast(wtensor->mutable_data()); - ptensor->SetData(pdata, size * sizeof(float)); - wconst->set_attr_value(ptensor); - OpList::Global().add(wconst); - output_node->set_input_w(*wconst); - - if (HasInputArg(op_info, scope, "Bias")) { - auto b_var_name = op_info->Input("Bias").front(); - auto* btensor = scope->FindVar(b_var_name)->GetMutable(); - - LOG(INFO) << "b_var_name:" << b_var_name - << ", is data: " << inputs_map.count(b_var_name); - CHECK(!inputs_map.count(b_var_name)); - CHECK_EQ(btensor->numel(), n); - - auto bconst = std::make_shared(b_var_name); - ge::TensorDesc bdesc( - ge::Shape({1, n, 1, 1}), ge::FORMAT_NCHW, ge::DT_FLOAT); - auto size = bdesc.GetShape().GetShapeSize(); - CHECK_EQ(size, n); - ge::TensorPtr ptensor = std::make_shared(); - ptensor->SetTensorDesc(bdesc); - auto* pdata = reinterpret_cast(btensor->mutable_data()); - ptensor->SetData(pdata, size * sizeof(float)); - bconst->set_attr_value(ptensor); - OpList::Global().add(bconst); - output_node->set_input_bias(*bconst); - output_node->set_attr_has_bias(ge::AttrValue::BOOL{true}); - } - - OpList::Global().add(output_node); - - node_map_type outputs_map; - outputs_map[op_info->Output("Out").front()] = output_node; - return outputs_map; -} - -} // namespace bridge -} // 
namespace npu -} // namespace lite -} // namespace paddle - -REGISTER_NPU_BRIDGE(fc, paddle::lite::npu::bridge::FCConverter); diff --git a/lite/backends/npu/bridge/fc_op_test.cc b/lite/backends/npu/bridge/fc_op_test.cc deleted file mode 100644 index 7bfee2034f..0000000000 --- a/lite/backends/npu/bridge/fc_op_test.cc +++ /dev/null @@ -1,146 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/operators/fc_op.h" -#include <gtest/gtest.h> -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -void fc_ref(const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - auto input = - scope->FindVar(op_info->Input("Input").front())->GetMutable(); - auto w = scope->FindVar(op_info->Input("W").front())->GetMutable(); - auto out = - scope->FindVar(op_info->Output("Out").front())->GetMutable(); - int32_t in_num_col_dims = op_info->GetAttr("in_num_col_dims"); - Tensor* bias = nullptr; - float* bias_data = nullptr; - if (op_info->HasInput("Bias")) { - auto bias_var_names = op_info->Input("Bias"); - if (bias_var_names.size() > 0) { - auto bias_var_name = bias_var_names.front(); - bias = scope->FindVar(bias_var_name)->GetMutable(); - bias_data = bias->mutable_data(); - } - } - auto input_data = input->data(); - auto w_data = w->mutable_data(); - auto out_data = out->mutable_data(); - auto in_mat_dims = input->dims().Flatten2D(in_num_col_dims); - int out_num_classes = w->dims()[1]; - const int M = in_mat_dims[0]; - const int K = in_mat_dims[1]; - const int N = out_num_classes; - for (int m = 0; m < M; ++m) { - for (int n = 0; n < N; ++n) { - out_data[m * N + n] = 0; - for (int k = 0; k < K; ++k) { - out_data[m * N + n] += input_data[m * K + k] * w_data[k * N + n]; - } - } - } - if (bias_data != nullptr) { - for (int m = 0; m < M; ++m) { - for (int n = 0; n < N; ++n) { - out_data[m * N + n] += bias_data[n]; - } - } - } -} - -void test_fc(const std::vector& x_shape, - const std::vector& w_shape, - int in_num_col_dims, - bool has_bias) { - CHECK_EQ(w_shape.size(), 2UL); - - const auto& bridges = lite::npu::bridge::Factory::Instance(); - const auto& supported_lists = bridges.AllFunctions(); - CHECK(bridges.HasType("fc")); - - Scope scope; - std::string x_var_name("Input"); - std::string w_var_name("W"); - std::string bias_var_name("Bias"); - std::string out_var_name("Out"); - std::string out_ref_var_name("out_ref"); - auto* x = scope.Var(x_var_name)->GetMutable(); - auto* w = scope.Var(w_var_name)->GetMutable(); - auto* out = scope.Var(out_var_name)->GetMutable(); - auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); - x->Resize(x_shape); - 
w->Resize(w_shape); - - FillTensor(x); - FillTensor(w); - - // create fc op - cpp::OpDesc fc_op_desc; - fc_op_desc.SetType("fc"); - fc_op_desc.SetInput("Input", {x_var_name}); - fc_op_desc.SetInput("W", {w_var_name}); - fc_op_desc.SetOutput("Out", {out_var_name}); - fc_op_desc.SetAttr("in_num_col_dims", static_cast(in_num_col_dims)); - if (has_bias) { - auto* bias = scope.Var(bias_var_name)->GetMutable(); - bias->Resize({w_shape[1]}); - FillTensor(bias); - fc_op_desc.SetInput("Bias", {bias_var_name}); - } - - auto fc_op = CreateOp(fc_op_desc, &scope); - LauchOp(fc_op, {x_var_name}, {out_var_name}); - out_ref->CopyDataFrom(*out); - - // compare results - fc_ref(fc_op); - auto* out_data = out->mutable_data(); - auto* out_ref_data = out_ref->mutable_data(); - for (int i = 0; i < out->dims().production(); i++) { - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); - } - - // model release - npu::OpList::Global().clear(); - npu::DeviceInfo::Global().Clear(); -} - -TEST(NPUBridges, fc) { - for (bool use_bias : {true, false}) { - test_fc({1, 8, 8, 1}, {8, 4}, 2, use_bias); - test_fc({1, 5, 5, 1}, {5, 7}, 2, use_bias); - test_fc({1, 4, 1, 1}, {4, 8}, 1, use_bias); - } -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -USE_LITE_OP(fc); -USE_NPU_BRIDGE(fc); diff --git a/lite/backends/npu/bridge/interpolate_op.cc b/lite/backends/npu/bridge/interpolate_op.cc deleted file mode 100644 index 83cae61e3f..0000000000 --- a/lite/backends/npu/bridge/interpolate_op.cc +++ /dev/null @@ -1,143 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
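// Both interpolate converters resolve the target height/width with the
// priority OutSize (a runtime tensor) > scale attribute > out_h/out_w
// attributes. A sketch of just that resolution step (function name and
// signature are illustrative):
#include <utility>

std::pair<int, int> ResolveInterpSize(int in_h, int in_w, float scale,
                                      int attr_out_h, int attr_out_w,
                                      const int* out_size /* 2 ints or null */) {
  int out_h = attr_out_h;
  int out_w = attr_out_w;
  if (scale > 0.f) {  // scale overrides the plain attributes
    out_h = static_cast<int>(in_h * scale);
    out_w = static_cast<int>(in_w * scale);
  }
  if (out_size != nullptr) {  // an explicit OutSize tensor wins over both
    out_h = out_size[0];
    out_w = out_size[1];
  }
  return {out_h, out_w};
}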
- -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/utils.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -node_map_type InterpolateConverter( - const std::shared_ptr interpolate_op, - const node_map_type& inputs_map) { - auto scope = interpolate_op->scope(); - auto op_info = interpolate_op->op_info(); - auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); - LOG(INFO) << "Converting " + op_type + "..."; - - // get input, output and attributes from lite op - auto x_var_name = op_info->Input("X").front(); - CHECK(inputs_map.count(x_var_name)); - OpList::Global().add(inputs_map.at(x_var_name)); - - auto x = scope->FindVar(x_var_name)->GetMutable(); - auto x_dims = x->dims(); - auto x_h = x_dims[2]; - auto x_w = x_dims[3]; - CHECK_EQ(x_dims.size(), 4); - auto scale = op_info->GetAttr("scale"); - auto out_w = op_info->GetAttr("out_w"); - auto out_h = op_info->GetAttr("out_h"); - auto align_corners = op_info->GetAttr("align_corners"); - int align_mode = op_info->GetAttr("align_mode"); - CHECK(!(align_mode == 0 && !align_corners)) - << "align_mode = 0 && align_corners = false isn't supported in NPU DDK"; - - // priority: OutSize > scale > out_h/out_w - if (scale > 0) { - out_h = static_cast(x_h * scale); - out_w = static_cast(x_w * scale); - out_h = out_h > 0 ? out_h : -1; - out_w = out_w > 0 ? out_w : -1; - } - - // update out_h and out_w if has OutSize - bool inputs_map_has_w = false; - if (HasInputArg(op_info, scope, "OutSize")) { - auto out_size_var_name = op_info->Input("OutSize").front(); - if (inputs_map.count(out_size_var_name)) { - inputs_map_has_w = true; - } else { - auto out_size = - scope->FindVar(out_size_var_name)->GetMutable(); - CHECK_EQ(out_size->numel(), 2); - auto out_size_data = out_size->mutable_data(); - // update out_h and out_w if has OutSize - out_h = out_size_data[0]; - out_w = out_size_data[1]; - } - } - - node_map_type outputs_map; - auto interp_method = op_info->GetAttr("interp_method"); - if (interp_method == "bilinear") { - auto interp_node = std::make_shared(unique_op_type); - OpList::Global().add(interp_node); - interp_node->set_input_x(*inputs_map.at(x_var_name)); - if (inputs_map_has_w) { - auto out_size_var_name = op_info->Input("OutSize").front(); - interp_node->set_input_w(*inputs_map.at(out_size_var_name)); - OpList::Global().add(inputs_map.at(out_size_var_name)); - } else { - const float largest_multiple = 7.0f; - float multiple = static_cast(x_h * x_w) / (out_h * out_w); - CHECK_LT(multiple, largest_multiple) - << "multiple=(ih*iw)/(oh*ow)=" << multiple - << " is too large, should not exceed " << largest_multiple - << " in NPU DDK"; - auto w_const_node = - std::make_shared(unique_op_type + "/w"); - w_const_node->set_attr_value( - CreateTensorAndFillData(std::vector({out_h, out_w}))); - interp_node->set_input_w(*w_const_node); - OpList::Global().add(w_const_node); - } - interp_node->set_attr_output_dim_mode( - 2); // 0: zoom_factor, 1: shrink_factor, 2: height/width - interp_node->set_attr_align_corners(align_corners); - outputs_map[op_info->Output("Out").front()] = interp_node; - } else if (interp_method == "nearest") { - auto interp_node = - std::make_shared(unique_op_type); - 
OpList::Global().add(interp_node); - interp_node->set_input_image(*inputs_map.at(x_var_name)); - if (inputs_map_has_w) { - auto out_size_var_name = op_info->Input("OutSize").front(); - interp_node->set_input_size(*inputs_map.at(out_size_var_name)); - OpList::Global().add(inputs_map.at(out_size_var_name)); - } else { - auto w_const_node = - std::make_shared(unique_op_type + "/w"); - w_const_node->set_attr_value( - CreateTensorAndFillData(std::vector({out_h, out_w}))); - interp_node->set_input_size(*w_const_node); - OpList::Global().add(w_const_node); - } - interp_node->set_attr_align_corners(align_corners); - outputs_map[op_info->Output("Out").front()] = interp_node; - } else { - LOG(FATAL) << "unsupported interpolate method: " << interp_method; - } - - return outputs_map; -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -REGISTER_NPU_BRIDGE(bilinear_interp, - paddle::lite::npu::bridge::InterpolateConverter); -REGISTER_NPU_BRIDGE(nearest_interp, - paddle::lite::npu::bridge::InterpolateConverter); diff --git a/lite/backends/npu/bridge/interpolate_op_test.cc b/lite/backends/npu/bridge/interpolate_op_test.cc deleted file mode 100644 index 79dd612c59..0000000000 --- a/lite/backends/npu/bridge/interpolate_op_test.cc +++ /dev/null @@ -1,405 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
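// The bilinear reference below rests on two coordinate conventions: the
// ratio is (in - 1) / (out - 1) with align_corners and in / out without,
// and the source coordinate is ratio * (dst + 0.5) - 0.5 when
// align_mode == 0 with align_corners off, else ratio * dst, clamped at
// zero. That mapping in isolation (a sketch; the upper-neighbor clamp to
// in - 1 happens when the four surrounding pixels are gathered):
#include <algorithm>

float BilinearRatio(int in, int out, bool align_corners) {
  if (out <= 1) return 0.f;
  return align_corners ? static_cast<float>(in - 1) / (out - 1)
                       : static_cast<float>(in) / out;
}

float BilinearSrcCoord(int dst, float ratio, bool half_pixel /* align_flag */) {
  float src = half_pixel ? ratio * (dst + 0.5f) - 0.5f : ratio * dst;
  return std::max(src, 0.f);
}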
- -#include "lite/operators/interpolate_op.h" -#include -#include -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -template -void bilinear_interp_ref(const std::shared_ptr op) { - auto scope = op->scope(); - auto op_info = op->op_info(); - auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); - auto out = - scope->FindVar(op_info->Output("Out").front())->GetMutable(); - auto x_dims = x->dims(); - int batch_size = x_dims[0]; - int channel_size = x_dims[1]; - auto x_h = x_dims[2]; - auto x_w = x_dims[3]; - CHECK_EQ(x_dims.size(), 4); - auto scale = op_info->GetAttr("scale"); - auto out_w = op_info->GetAttr("out_w"); - auto out_h = op_info->GetAttr("out_h"); - auto align_corners = op_info->GetAttr("align_corners"); - int align_mode = op_info->GetAttr("align_mode"); - auto interp_method = op_info->GetAttr("interp_method"); - - // calc real out_h and out_w - if (scale > 0) { - out_h = static_cast(x_h * scale); - out_w = static_cast(x_w * scale); - } - if (op_info->HasInput("OutSize")) { - auto out_size_var_names = op_info->Input("OutSize"); - if (out_size_var_names.size() > 0) { - auto out_size_var_name = out_size_var_names.front(); - auto out_size = - scope->FindVar(out_size_var_name)->GetMutable(); - auto out_size_dims = out_size->dims(); - CHECK_EQ(out_size_dims.size(), 1); - CHECK_EQ(out_size_dims.production(), 2); - auto out_size_data = out_size->mutable_data(); - out_h = out_size_data[0]; - out_w = out_size_data[1]; - } - } - CHECK_GT(out_h, 0); - CHECK_GT(out_w, 0); - out->Resize({batch_size, channel_size, out_h, out_w}); - - // copy from x if no change - if (x_h == out_h && x_w == out_w) { - out->CopyDataFrom(*x); - return; - } - - float ratio_h = 0.f; - float ratio_w = 0.f; - if (out_h > 1) { - ratio_h = (align_corners) ? static_cast(x_h - 1) / (out_h - 1) - : static_cast(x_h) / out_h; - } - if (out_w > 1) { - ratio_w = (align_corners) ? static_cast(x_w - 1) / (out_w - 1) - : static_cast(x_w) / out_w; - } - - // naive bilinear interpolation - auto x_data = x->mutable_data(); - auto out_data = out->mutable_data(); - bool align_flag = (align_mode == 0 && !align_corners); - - std::vector vy_n, vy_s; - std::vector vd_n, vd_s; - vy_n.reserve(out_h); - vy_s.reserve(out_h); - vd_n.reserve(out_h); - vd_s.reserve(out_h); - for (int k = 0; k < out_h; k++) { - int yn = align_flag ? static_cast(ratio_h * (k + 0.5) - 0.5) - : static_cast(ratio_h * k); - yn = (yn > 0) ? yn : 0; - int ys = (yn + 1) < (x_h - 1) ? (yn + 1) : (x_h - 1); - float idx_src_y = ratio_h * (k + 0.5) - 0.5; - idx_src_y = (idx_src_y > 0) ? idx_src_y : 0; - float dn = align_flag ? idx_src_y - yn : ratio_h * k - yn; - float ds = 1.f - dn; - { - vy_n[k] = yn; - vy_s[k] = ys; - vd_n[k] = dn; - vd_s[k] = ds; - } - } - - std::vector vx_w, vx_e; - std::vector vd_w, vd_e; - vx_w.reserve(out_w); - vx_e.reserve(out_w); - vd_w.reserve(out_w); - vd_e.reserve(out_w); - for (int l = 0; l < out_w; l++) { - int xw = (align_mode == 0 && !align_corners) - ? static_cast(ratio_w * (l + 0.5) - 0.5) - : static_cast(ratio_w * l); - xw = (xw > 0) ? xw : 0; - int xe = (xw + 1) < (x_w - 1) ? (xw + 1) : (x_w - 1); - float idx_src_x = ratio_w * (l + 0.5) - 0.5; - idx_src_x = (idx_src_x > 0) ? idx_src_x : 0; - float dw = align_flag ? 
idx_src_x - xw : ratio_w * l - xw; - float de = 1.f - dw; - { - vx_w[l] = xw; - vx_e[l] = xe; - vd_w[l] = dw; - vd_e[l] = de; - } - } - - std::vector x_strides(x_dims.size(), 1); - for (int idx = x_strides.size() - 2; idx >= 0; idx--) { - x_strides[idx] = x_strides[idx + 1] * x_dims[idx + 1]; - } - for (int i = 0; i < batch_size; i++) { - for (int j = 0; j < channel_size; j++) { - for (int k = 0; k < out_h; k++) { - for (int l = 0; l < out_w; l++) { - DType x0 = x_data[i * x_strides[0] + j * x_strides[1] + - vy_n[k] * x_strides[2] + vx_w[l] * x_strides[3]]; - DType x1 = x_data[i * x_strides[0] + j * x_strides[1] + - vy_s[k] * x_strides[2] + vx_w[l] * x_strides[3]]; - DType x2 = x_data[i * x_strides[0] + j * x_strides[1] + - vy_n[k] * x_strides[2] + vx_e[l] * x_strides[3]]; - DType x3 = x_data[i * x_strides[0] + j * x_strides[1] + - vy_s[k] * x_strides[2] + vx_e[l] * x_strides[3]]; - *out_data = x0 * vd_s[k] * vd_e[l] + x1 * vd_n[k] * vd_e[l] + - x2 * vd_s[k] * vd_w[l] + x3 * vd_n[k] * vd_w[l]; - out_data++; - } - } - } - } -} - -template -void nearest_interp_ref(const std::shared_ptr op) { - auto scope = op->scope(); - auto op_info = op->op_info(); - auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); - auto out = - scope->FindVar(op_info->Output("Out").front())->GetMutable(); - auto x_dims = x->dims(); - CHECK_EQ(x_dims.size(), 4); - auto scale = op_info->GetAttr("scale"); - auto out_w = op_info->GetAttr("out_w"); - auto out_h = op_info->GetAttr("out_h"); - auto align_corners = op_info->GetAttr("align_corners"); - // int align_mode = op_info->GetAttr("align_mode"); - auto interp_method = op_info->GetAttr("interp_method"); - CHECK_EQ(interp_method, "nearest"); - - int x_h = x_dims[2]; - int x_w = x_dims[3]; - if (scale > 0) { - out_h = static_cast(x_h * scale); - out_w = static_cast(x_w * scale); - } - if (op_info->HasInput("OutSize")) { - auto out_size_var_names = op_info->Input("OutSize"); - if (out_size_var_names.size() > 0) { - auto out_size_var_name = out_size_var_names.front(); - auto out_size = - scope->FindVar(out_size_var_name)->GetMutable(); - CHECK_EQ(out_size->numel(), 2); - auto out_size_data = out_size->mutable_data(); - out_h = out_size_data[0]; - out_w = out_size_data[1]; - } - } - CHECK_GT(out_h, 0); - CHECK_GT(out_w, 0); - out->Resize({x_dims[0], x_dims[1], out_h, out_w}); - - float ratio_h = 0.f; - float ratio_w = 0.f; - if (out_h > 1) { - ratio_h = align_corners ? static_cast(x_h - 1.0) / (out_h - 1.0) - : static_cast(x_h) / out_h; - } - if (out_w > 1) { - ratio_w = align_corners ? 
static_cast(x_w - 1.0) / (out_w - 1.0) - : static_cast(x_w) / out_w; - } - - auto x_data = x->data(); - auto out_data = out->mutable_data(); - auto out_dims = out->dims(); - std::vector x_strides(x_dims.size(), 1); - for (int idx = x_strides.size() - 2; idx >= 0; idx--) { - x_strides[idx] = x_strides[idx + 1] * x_dims[idx + 1]; - } - - for (int n = 0; n < out_dims[0]; n++) { - for (int c = 0; c < out_dims[1]; c++) { - for (int h = 0; h < out_dims[2]; h++) { - for (int w = 0; w < out_dims[3]; w++) { - int in_i = ratio_h * h; - int in_j = ratio_w * w; - if (align_corners) { - in_i = ratio_h * h + 0.5; - in_j = ratio_w * w + 0.5; - } - *out_data = x_data[n * x_strides[0] + c * x_strides[1] + - in_i * x_strides[2] + in_j * x_strides[3]]; - out_data++; - } - } - } - } -} - -void test_interpolate(int bs, - int ic, - int ih, - int iw, - int oh, - int ow, - float scale, - int out_size_h, - int out_size_w, - bool align_corners, - int align_mode, - std::string interp_method) { - // prepare input&output variables - Scope scope; - std::string x_var_name("x"); - std::string out_size_var_name("out_size"); - std::string out_var_name("out"); - std::string out_ref_var_name("out_ref"); - auto x = scope.Var(x_var_name)->GetMutable(); - auto out_size = scope.Var(out_size_var_name)->GetMutable(); - auto out = scope.Var(out_var_name)->GetMutable(); - auto out_ref = scope.Var(out_ref_var_name)->GetMutable(); - x->Resize({bs, ic, ih, iw}); - out_size->Resize({2}); - - // initialize input&output data - FillTensor(x); - - // initialize op desc - cpp::OpDesc opdesc; - opdesc.SetType(interp_method + "_interp"); - opdesc.SetInput("X", {x_var_name}); - opdesc.SetOutput("Out", {out_var_name}); - opdesc.SetAttr("out_h", oh); - opdesc.SetAttr("out_w", ow); - opdesc.SetAttr("scale", scale); - opdesc.SetAttr("align_corners", static_cast(align_corners)); - opdesc.SetAttr("align_mode", static_cast(align_mode)); - opdesc.SetAttr("interp_method", interp_method); - if (out_size_h > 0 && out_size_w > 0) { - auto out_size_dims = out_size->dims(); - CHECK_EQ(out_size_dims.size(), 1); - CHECK_EQ(out_size_dims.production(), 2); - auto out_size_data = out_size->mutable_data(); - out_size_data[0] = out_size_h; - out_size_data[1] = out_size_w; - opdesc.SetInput("OutSize", {out_size_var_name}); - } - - // create op and execute reference implementation - auto op = CreateOp(opdesc, &scope); - if (interp_method == "bilinear") { - bilinear_interp_ref(op); - } else { - nearest_interp_ref(op); - } - out_ref->CopyDataFrom(*out); - - // convert op to NPU model, then run it on NPU - LauchOp(op, {x_var_name}, {out_var_name}); - - // compare results - auto out_dims = out->dims(); - auto out_ref_dims = out_ref->dims(); - CHECK_EQ(out_dims.size(), out_ref_dims.size()); - for (int i = 0; i < out_dims.size(); i++) { - CHECK_EQ(out_dims[i], out_ref_dims[i]); - } - auto* out_data = out->mutable_data(); - auto* out_ref_data = out_ref->mutable_data(); - for (int i = 0; i < out->dims().production(); i++) { - VLOG(5) << i; - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2f); - } -} - -TEST(NPUBridges, bilinear_interp) { -#if 1 - for (auto bs : {1, 3}) { - for (auto ic : {3, 4}) { - for (auto ih : {4, 5}) { - for (auto iw : {3, 6}) { - for (auto oh : {0, 3, 8}) { - for (auto ow : {0, 4, 9}) { - for (auto scale : {0.f, 0.5f, 0.6f, 2.0f, 2.2f}) { - for (auto out_size_h : {0, 3, 11}) { - for (auto out_size_w : {0, 2, 12}) { - for (auto align_corners : {true, false}) { - for (auto align_mode : {0, 1}) { - for (auto interp_method : {"bilinear", "nearest"}) { - int 
act_oh = 0, act_ow = 0; - if (out_size_h > 0 && out_size_w > 0) { - act_oh = out_size_h; - act_ow = out_size_w; - } else if (scale > 1e-5) { - act_oh = static_cast(ih * scale); - act_ow = static_cast(iw * scale); - } else if (oh > 0 && ow > 0) { - act_oh = oh; - act_ow = ow; - } - if (act_oh <= 0 || act_ow <= 0) { - continue; - } - // TODO(hong19860320) multiple=(ih*iw)/(oh*ow) - // should - // not exceed 7.0 in NPU DDK, delete the following - // lines - // if the limination is removed. - const float largest_multiple = 7.0f; - float multiple = - static_cast(ih * iw) / (act_oh * act_ow); - if (multiple > largest_multiple) { - continue; - } - if (align_mode == 0 && !align_corners) { - continue; - } - VLOG(3) << "bs: " << bs << " ic: " << ic - << " ih: " << ih << " iw: " << iw - << " oh: " << oh << " ow: " << ow - << " scale: " << scale - << " out_size: " << out_size_h << "," - << out_size_w - << " align_corners: " << align_corners - << " align_mode: " << align_mode; - test_interpolate(bs, - ic, - ih, - iw, - oh, - ow, - scale, - out_size_h, - out_size_w, - align_corners, - align_mode, - interp_method); - } - } - } - } - } - } - } - } - } - } - } - } -#else - test_interpolate(1, 1, 4, 3, 0, 0, 1.f, 3, 6, false, 1, "nearest"); -#endif -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -USE_LITE_OP(bilinear_interp); -USE_NPU_BRIDGE(bilinear_interp); - -USE_LITE_OP(nearest_interp); -USE_NPU_BRIDGE(nearest_interp); diff --git a/lite/backends/npu/bridge/mul_op.cc b/lite/backends/npu/bridge/mul_op.cc deleted file mode 100644 index 290f3d88f8..0000000000 --- a/lite/backends/npu/bridge/mul_op.cc +++ /dev/null @@ -1,122 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
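// mul views X as an (m, k) matrix by flattening the dims before and after
// x_num_col_dims, and Y as (k, n) via y_num_col_dims; the converter checks
// that the two inner sizes agree. A sketch of that dim-splitting helper,
// mirroring the Flatten2D semantics used below (names are illustrative):
#include <cstdint>
#include <utility>
#include <vector>

std::pair<int64_t, int64_t> FlattenTo2D(const std::vector<int64_t>& dims,
                                        int num_col_dims) {
  int64_t rows = 1;
  int64_t cols = 1;
  for (int i = 0; i < num_col_dims; ++i) rows *= dims[i];
  for (int i = num_col_dims; i < static_cast<int>(dims.size()); ++i) {
    cols *= dims[i];
  }
  return {rows, cols};  // e.g. {1, 8, 8, 1} with num_col_dims = 2 -> (8, 8)
}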
- -#include "lite/operators/mul_op.h" -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/utils.h" -#include "lite/backends/npu/npu_helper.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -// Note: inputs_map the var_name contains only the data, the weight should be -// handle in this converter -node_map_type MulConverter(const std::shared_ptr mul_op, - const node_map_type& inputs_map) { - LOG(INFO) << "converting mul..."; - lite::Scope* scope = mul_op->scope(); - const lite::OpInfo* op_info = mul_op->op_info(); - auto output_node = std::make_shared(UniqueName("mul")); - - auto x_var_name = op_info->Input("X").front(); - auto y_var_name = op_info->Input("Y").front(); - int x_num_col_dims = op_info->GetAttr("x_num_col_dims"); - int y_num_col_dims = op_info->GetAttr("y_num_col_dims"); - auto* xtensor = scope->FindVar(x_var_name)->GetMutable(); - auto* ytensor = scope->FindVar(y_var_name)->GetMutable(); - - int m = xtensor->dims().Slice(0, x_num_col_dims).production(); - int x_w = xtensor->dims() - .Slice(x_num_col_dims, xtensor->dims().size()) - .production(); - int y_h = ytensor->dims().Slice(0, y_num_col_dims).production(); - int n = ytensor->dims() - .Slice(y_num_col_dims, ytensor->dims().size()) - .production(); - CHECK_EQ(x_w, y_h) << "x_w must be equal with y_h"; - int k = x_w; - LOG(INFO) << "m:" << m << ",n:" << n << ",k:" << k; - LOG(INFO) << "x_var_name:" << x_var_name - << ", is data: " << inputs_map.count(x_var_name); - LOG(INFO) << "y_var_name:" << y_var_name - << ", is data: " << inputs_map.count(y_var_name); - CHECK(inputs_map.count(x_var_name)) - << "[NPU] MatMul only support X is data, Y is const yet"; - if (inputs_map.count(x_var_name)) { - auto xsrc = inputs_map.at(x_var_name); - auto reshapex = std::make_shared(x_var_name + "_reshape"); - reshapex->set_input_tensor(*xsrc); - reshapex->set_attr_shape({m, k}); - reshapex->set_attr_axis(0); - OpList::Global().add(xsrc); - OpList::Global().add(reshapex); - output_node->set_input_x(*reshapex); - } else { - auto constx = std::make_shared(x_var_name); - ge::TensorDesc desc(ge::Shape({m, k}), ge::FORMAT_NCHW, ge::DT_FLOAT); - auto size = desc.GetShape().GetShapeSize(); - CHECK_EQ(size, xtensor->dims().production()); - ge::TensorPtr ptensor = std::make_shared(); - ptensor->SetTensorDesc(desc); - auto* pdata = reinterpret_cast(xtensor->mutable_data()); - ptensor->SetData(pdata, size * sizeof(float)); - constx->set_attr_value(ptensor); - OpList::Global().add(constx); - output_node->set_input_x(*constx); - } - - if (inputs_map.count(y_var_name)) { - auto ysrc = inputs_map.at(y_var_name); - auto reshapey = std::make_shared(y_var_name + "_reshape"); - reshapey->set_input_tensor(*ysrc); - reshapey->set_attr_shape({k, n}); - reshapey->set_attr_axis(0); - OpList::Global().add(ysrc); - OpList::Global().add(reshapey); - output_node->set_input_w(*reshapey); - } else { - auto consty = std::make_shared(y_var_name); - ge::TensorDesc desc(ge::Shape({k, n}), ge::FORMAT_NCHW, ge::DT_FLOAT); - auto size = desc.GetShape().GetShapeSize(); - CHECK_EQ(size, ytensor->dims().production()); - ge::TensorPtr ptensor = std::make_shared(); - ptensor->SetTensorDesc(desc); - auto* pdata = 
reinterpret_cast(ytensor->mutable_data()); - ptensor->SetData(pdata, size * sizeof(float)); - consty->set_attr_value(ptensor); - OpList::Global().add(consty); - output_node->set_input_w(*consty); - } - - OpList::Global().add(output_node); - - node_map_type outputs_map; - outputs_map[op_info->Output("Out").front()] = output_node; - return outputs_map; -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -REGISTER_NPU_BRIDGE(mul, paddle::lite::npu::bridge::MulConverter); diff --git a/lite/backends/npu/bridge/mul_op_test.cc b/lite/backends/npu/bridge/mul_op_test.cc deleted file mode 100644 index c28d0487cc..0000000000 --- a/lite/backends/npu/bridge/mul_op_test.cc +++ /dev/null @@ -1,125 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/operators/mul_op.h" -#include <gtest/gtest.h> -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -void mul_ref(const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); - auto y = scope->FindVar(op_info->Input("Y").front())->GetMutable(); - auto out = - scope->FindVar(op_info->Output("Out").front())->GetMutable(); - int32_t x_num_col_dims = op_info->GetAttr("x_num_col_dims"); - int32_t y_num_col_dims = op_info->GetAttr("y_num_col_dims"); - auto x_data = x->mutable_data(); - auto y_data = y->mutable_data(); - auto out_data = out->mutable_data(); - auto x_mat_dims = x->dims().Flatten2D(x_num_col_dims); - auto y_mat_dims = y->dims().Flatten2D(y_num_col_dims); - CHECK_EQ(x_mat_dims[1], y_mat_dims[0]); - const int M = x_mat_dims[0]; - const int K = x_mat_dims[1]; - const int N = y_mat_dims[1]; - for (int m = 0; m < M; ++m) { - for (int n = 0; n < N; ++n) { - out_data[m * N + n] = 0; - for (int k = 0; k < K; ++k) { - out_data[m * N + n] += x_data[m * K + k] * y_data[k * N + n]; - } - } - } -} - -void test_mul(const std::vector& x_shape, - const std::vector& y_shape, - int x_num_col_dims, - int y_num_col_dims) { - const auto& bridges = lite::npu::bridge::Factory::Instance(); - const auto& supported_lists = bridges.AllFunctions(); - CHECK(bridges.HasType("mul")); - - Scope scope; - std::string x_var_name("X"); - std::string y_var_name("Y"); - std::string out_var_name("Out"); - std::string out_ref_var_name("out_ref"); - auto* x = scope.Var(x_var_name)->GetMutable(); - auto* y = scope.Var(y_var_name)->GetMutable(); - auto* out = scope.Var(out_var_name)->GetMutable(); - auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); - x->Resize(x_shape); - 
y->Resize(y_shape); - - FillTensor(x); - FillTensor(y); - - // create mul op - cpp::OpDesc mul_op_desc; - mul_op_desc.SetType("mul"); - mul_op_desc.SetInput("X", {x_var_name}); - mul_op_desc.SetInput("Y", {y_var_name}); - mul_op_desc.SetOutput("Out", {out_var_name}); - mul_op_desc.SetAttr("x_num_col_dims", static_cast(x_num_col_dims)); - mul_op_desc.SetAttr("y_num_col_dims", static_cast(y_num_col_dims)); - - auto mul_op = CreateOp(mul_op_desc, &scope); - LauchOp(mul_op, {x_var_name}, {out_var_name}); - out_ref->CopyDataFrom(*out); - - mul_ref(mul_op); - - // compare results - auto* out_data = out->mutable_data(); - auto* out_ref_data = out_ref->mutable_data(); - for (int i = 0; i < out->dims().production(); i++) { - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); - } - - // model release - npu::OpList::Global().clear(); - npu::DeviceInfo::Global().Clear(); -} - -TEST(NPUBridges, mul) { - test_mul({1, 8, 8, 1}, {1, 8, 2, 2}, 2, 2); - test_mul({1, 5, 5, 1}, {1, 5, 7, 7}, 2, 2); - test_mul({1, 4, 1, 1}, {4, 8}, 1, 1); -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -USE_LITE_OP(mul); -USE_NPU_BRIDGE(mul); diff --git a/lite/backends/npu/bridge/pad2d_op.cc b/lite/backends/npu/bridge/pad2d_op.cc deleted file mode 100644 index 2c67383c0c..0000000000 --- a/lite/backends/npu/bridge/pad2d_op.cc +++ /dev/null @@ -1,86 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
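// Paddle's pad2d stores paddings as [top, bottom, left, right]; the NPU Pad
// op instead takes a rank x 2 table of (before, after) pairs per axis, with
// zeros for the batch and channel axes. The converter below builds that
// table by prepending zeros; a sketch of the expansion for an NCHW input:
#include <vector>

std::vector<int> ExpandPad2dForNCHW(const std::vector<int>& p /* t,b,l,r */) {
  // Rows of the 4 x 2 table: {N: 0,0}, {C: 0,0}, {H: top,bottom},
  // {W: left,right}, flattened row-major.
  return {0, 0, 0, 0, p[0], p[1], p[2], p[3]};
}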
- -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/utils.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -node_map_type Pad2dConverter(const std::shared_ptr pad2d_op, - const node_map_type& inputs_map) { - auto scope = pad2d_op->scope(); - auto op_info = pad2d_op->op_info(); - auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); - LOG(INFO) << "Converting " + op_type + "..."; - - std::shared_ptr pad2d_node = - std::make_shared(unique_op_type); - auto x_var_name = op_info->Input("X").front(); - pad2d_node->set_input_x(*inputs_map.at(x_var_name)); - OpList::Global().add(inputs_map.at(x_var_name)); - OpList::Global().add(pad2d_node); - - auto mode = op_info->GetAttr("mode"); - if (mode == "constant") { - pad2d_node->set_attr_mode(0); - } else if (mode == "reflect") { - LOG(FATAL) << "NPU doesn't support this pad mod: " << mode; - pad2d_node->set_attr_mode(1); - } else { - LOG(FATAL) << "NPU doesn't support this pad mod: " << mode; - } - - auto x_dims = scope->FindTensor(x_var_name)->dims(); - auto padding = op_info->GetAttr>("paddings"); - CHECK_EQ(padding.size(), 4); - int xds = x_dims.size(); - padding.insert(padding.begin(), xds * 2 - 4, 0); - auto npu_padding = - std::make_shared(unique_op_type + "/padding"); - npu_padding->set_attr_value(CreateTensorAndFillData(padding, {xds, 2})); - pad2d_node->set_input_padding(*npu_padding); - OpList::Global().add(npu_padding); - - if (mode == "constant") { - auto pad_value = op_info->GetAttr("pad_value"); - auto npu_pad_value = - std::make_shared(unique_op_type + "/pad_value"); - npu_pad_value->set_attr_value(CreateTensorAndFillData({pad_value})); - pad2d_node->set_input_constant_values(*npu_pad_value); - OpList::Global().add(npu_pad_value); - - pad2d_node->set_attr_T(0); // type of pad_value: 0:float 3:int32 - } - - node_map_type outputs_map; - outputs_map[op_info->Output("Out").front()] = pad2d_node; - return outputs_map; -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -REGISTER_NPU_BRIDGE(pad2d, paddle::lite::npu::bridge::Pad2dConverter); diff --git a/lite/backends/npu/bridge/pad2d_op_test.cc b/lite/backends/npu/bridge/pad2d_op_test.cc deleted file mode 100644 index 7a10e0a559..0000000000 --- a/lite/backends/npu/bridge/pad2d_op_test.cc +++ /dev/null @@ -1,189 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/operators/pad2d_op.h" -#include -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -template -void pad2d_ref(const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - auto x = scope->FindMutableTensor(op_info->Input("X").front()); - auto out = scope->FindMutableTensor(op_info->Output("Out").front()); - - auto paddings = op_info->GetAttr>("paddings"); - int pad_top = paddings[0]; - int pad_bottom = paddings[1]; - int pad_left = paddings[2]; - int pad_right = paddings[3]; - - auto mode = op_info->GetAttr("mode"); - int pad_mode; - if (mode == "constant") { - pad_mode = 0; - } else if (mode == "reflect") { - pad_mode = 1; - } else if (mode == "edge") { - pad_mode = 2; - } else { - LOG(FATAL) << "Unknown mode type"; - } - float pad_value = op_info->GetAttr("pad_value"); - - auto out_dims = out->dims(); - int n = out_dims[0]; - int c = out_dims[1]; - int h = out_dims[2]; - int w = out_dims[3]; - - int in_w = w - pad_left - pad_right; - int in_h = h - pad_bottom - pad_top; - int spatial_size_out = w * h; - int spatial_size_in = in_w * in_h; - - auto x_data = x->data(); - auto out_data = out->mutable_data(); -#pragma omp parallel for - for (int i = 0; i < n * c; ++i) { - const float* din_batch = x_data + i * spatial_size_in; - float* dout_batch = out_data + i * spatial_size_out; - int in_y = 0; - int in_x = 0; - for (int y = 0; y < h; ++y) { - for (int x = 0; x < w; ++x) { - switch (pad_mode) { - case 0: - in_y = y - pad_top; - in_x = x - pad_left; - dout_batch[y * w + x] = - (in_x >= 0 && in_x < in_w) && (in_y >= 0 && in_y < in_h) - ? 
din_batch[in_y * in_w + in_x] - : pad_value; - break; - case 1: - in_x = - std::min(std::max(pad_left, x), in_w + pad_left - 1) - pad_left; - in_y = std::min(std::max(pad_top, y), in_h + pad_top - 1) - pad_top; - dout_batch[y * w + x] = din_batch[in_y * in_w + in_x]; - break; - case 2: - in_y = y - pad_top; - in_x = x - pad_left; - in_y = std::max(in_y, -in_y); - in_y = std::min(in_y, 2 * in_h - in_y - 2); - in_x = std::max(in_x, -in_x); - in_x = std::min(in_x, 2 * in_w - in_x - 2); - dout_batch[y * w + x] = din_batch[in_y * in_w + in_x]; - break; - default: - LOG(ERROR) << "ERROR: unknown pad mode:" << pad_mode; - } - } - } - } -} - -void test_pad2d(int bs, - int ic, - int ih, - int iw, - std::vector paddings, - float pad_value, - std::string mode) { - // prepare input&output variables - Scope scope; - std::string x_var_name = "x"; - std::string out_var_name = "out"; - std::string out_ref_var_name = "out_ref"; - auto* x = scope.NewTensor(x_var_name); - auto* out = scope.NewTensor(out_var_name); - auto* out_ref = scope.NewTensor(out_ref_var_name); - x->Resize({bs, ic, ih, iw}); - - // initialize input&output data - // FillTensor(x); - auto x_data = x->mutable_data(); - - // initialize op desc - cpp::OpDesc opdesc; - opdesc.SetType("pad2d"); - opdesc.SetInput("X", {x_var_name}); - opdesc.SetOutput("Out", {out_var_name}); - opdesc.SetAttr("paddings", paddings); - opdesc.SetAttr("pad_value", pad_value); - opdesc.SetAttr("mode", mode); - opdesc.SetAttr("data_format", std::string("NCHW")); - - auto op = CreateOp(opdesc, &scope); - pad2d_ref(op); - out_ref->CopyDataFrom(*out); - - LauchOp(op, {x_var_name}, {out_var_name}); - - // compare results - auto* out_data = out->mutable_data(); - auto* out_ref_data = out_ref->mutable_data(); - for (int i = 0; i < out->numel(); i++) { - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2) << "-----" << i; - } -} - -TEST(NPUBridges, pad2d) { -#if 1 - for (auto bs : {1, 4, 7}) { - for (auto ic : {1, 4, 7}) { - for (auto ih : {1, 4, 7}) { - for (auto iw : {1, 4, 7}) { - for (auto paddings : {/*std::vector{0, 0, 0, 0},*/ - std::vector{0, 0, 0, 1}, - std::vector{0, 1, 0, 2}, - std::vector{1, 2, 3, 4}}) { - // npu not support pad_value!=0 - for (auto pad_value : {0.f /*,1.f*/}) { - // npu only support constant - for (auto mode : {"constant" /*, "reflect", "edge"*/}) { - if (mode == "edge") continue; - VLOG(3) << "bs: " << bs << " ic: " << ic << " ih: " << ih - << " iw: " << iw << " paddings: {" << paddings[0] - << "," << paddings[1] << "," << paddings[2] << "," - << paddings[3] << "}" - << " pad_value: " << pad_value << " mode: " << mode; - test_pad2d(bs, ic, ih, iw, paddings, pad_value, mode); - } - } - } - } - } - } - } -#else - test_pad2d(1, 1, 1, 1, {0, 0, 0, 1}, 0, "constant"); -#endif -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -USE_LITE_OP(pad2d); -USE_NPU_BRIDGE(pad2d); diff --git a/lite/backends/npu/bridge/paddle_use_npu_bridges.h b/lite/backends/npu/bridge/paddle_use_npu_bridges.h deleted file mode 100644 index 404d003954..0000000000 --- a/lite/backends/npu/bridge/paddle_use_npu_bridges.h +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "lite/backends/npu/bridge/registry.h" - -USE_NPU_BRIDGE(mul); -USE_NPU_BRIDGE(fc); -USE_NPU_BRIDGE(conv2d); -USE_NPU_BRIDGE(depthwise_conv2d); -USE_NPU_BRIDGE(pool2d); -USE_NPU_BRIDGE(relu); -USE_NPU_BRIDGE(elementwise_add); -USE_NPU_BRIDGE(scale); -USE_NPU_BRIDGE(softmax); -USE_NPU_BRIDGE(concat); -USE_NPU_BRIDGE(split); -USE_NPU_BRIDGE(transpose); -USE_NPU_BRIDGE(transpose2); -USE_NPU_BRIDGE(shuffle_channel); -USE_NPU_BRIDGE(batch_norm); -USE_NPU_BRIDGE(bilinear_interp); -USE_NPU_BRIDGE(conv2d_transpose); -USE_NPU_BRIDGE(reshape); -USE_NPU_BRIDGE(reshape2); diff --git a/lite/backends/npu/bridge/pool_op.cc b/lite/backends/npu/bridge/pool_op.cc deleted file mode 100644 index aebfd68856..0000000000 --- a/lite/backends/npu/bridge/pool_op.cc +++ /dev/null @@ -1,89 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
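// The USE_NPU_BRIDGE block above pulls each converter's static registrar
// into the final link; REGISTER_NPU_BRIDGE pairs with it on the definition
// side. The underlying idiom is a singleton factory map plus a file-scope
// object whose constructor inserts the converter, compressed here into a
// hypothetical sketch (the real Factory/registry API differs in detail):
#include <functional>
#include <map>
#include <string>
#include <utility>

using ConverterFn = std::function<void()>;  // real one takes the op + inputs

std::map<std::string, ConverterFn>& BridgeRegistry() {
  static std::map<std::string, ConverterFn> registry;  // built on first use
  return registry;
}

struct BridgeRegistrar {
  BridgeRegistrar(const std::string& op_type, ConverterFn fn) {
    BridgeRegistry().emplace(op_type, std::move(fn));
  }
};

// A USE_* macro then only needs to reference a symbol from the registering
// file so the linker cannot drop that object file.
#define REGISTER_BRIDGE(op_type, fn) \
  static BridgeRegistrar op_type##_registrar(#op_type, fn)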
- -#include "lite/operators/pool_op.h" -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/utils.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -node_map_type PoolConverter(const std::shared_ptr pool_op, - const node_map_type& inputs_map) { - auto scope = pool_op->scope(); - auto op_info = pool_op->op_info(); - auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); - LOG(INFO) << "Converting " + op_type + "..."; - - std::shared_ptr pool_node = - std::make_shared(unique_op_type); - auto x_var_name = op_info->Input("X").front(); - auto pooling_type = op_info->GetAttr("pooling_type"); - int npu_mode = 0; - if (pooling_type == "max") { - npu_mode = 0; - } else if (pooling_type == "avg") { - npu_mode = 1; - CHECK(op_info->GetAttr("exclusive")) - << "exclusive must be true when use npu"; - } else { - LOG(FATAL) << "Unsupported pooling type: " << pooling_type; - } - bool npu_global_pooling = op_info->GetAttr("global_pooling"); - auto ksize = op_info->GetAttr>("ksize"); - auto npu_window = ge::AttrValue::LIST_INT(ksize.begin(), ksize.end()); - - auto padding = op_info->GetAttr>("paddings"); - auto npu_pad = - ge::AttrValue::LIST_INT{padding[0], padding[0], padding[1], padding[1]}; - auto strides = op_info->GetAttr>("strides"); - auto npu_stride = ge::AttrValue::LIST_INT(strides.begin(), strides.end()); - int npu_ceil_mode = 0; - if (op_info->HasAttr("ceil_mode")) { - npu_ceil_mode = op_info->GetAttr("ceil_mode") ? 1 : 0; - } - - pool_node->set_input_x(*inputs_map.at(x_var_name)); - pool_node->set_attr_mode(npu_mode); - pool_node->set_attr_pad_mode(0); - pool_node->set_attr_global_pooling(npu_global_pooling); - pool_node->set_attr_window(npu_window); - pool_node->set_attr_pad(npu_pad); - pool_node->set_attr_stride(npu_stride); - pool_node->set_attr_ceil_mode(npu_ceil_mode); - // output_node->set_attr_data_mode(npu_data_mode); - - OpList::Global().add(inputs_map.at(x_var_name)); - OpList::Global().add(pool_node); - - node_map_type outputs_map; - outputs_map[op_info->Output("Out").front()] = pool_node; - return outputs_map; -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -REGISTER_NPU_BRIDGE(pool2d, paddle::lite::npu::bridge::PoolConverter); diff --git a/lite/backends/npu/bridge/pool_op_test.cc b/lite/backends/npu/bridge/pool_op_test.cc deleted file mode 100644 index 86ad893084..0000000000 --- a/lite/backends/npu/bridge/pool_op_test.cc +++ /dev/null @@ -1,249 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/operators/pool_op.h" -#include -#include -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -void pool_ref(const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); - auto out = - scope->FindVar(op_info->Output("Out").front())->GetMutable(); - auto& in_dims = x->dims(); - auto& out_dims = out->dims(); - - const float* src_ptr = x->data(); - float* dst_ptr = out->mutable_data(); - - std::vector ksize = op_info->GetAttr>("ksize"); - std::vector strides = op_info->GetAttr>("strides"); - std::vector paddings = op_info->GetAttr>("paddings"); - bool exclusive = op_info->GetAttr("exclusive"); - std::string pooling_type = op_info->GetAttr("pooling_type"); - bool global_pooling = op_info->GetAttr("global_pooling"); - - int in_n = in_dims[0]; - int in_c = in_dims[1]; - int in_h = in_dims[2]; - int in_w = in_dims[3]; - int size_in_n = in_c * in_h * in_w; - int size_in_c = in_h * in_w; - - int out_h = out_dims[2]; - int out_w = out_dims[3]; - int size_out_n = in_c * out_h * out_w; - int size_out_c = out_h * out_w; - - int window_h = ksize[0]; - int window_w = ksize[1]; - int stride_h = strides[0]; - int stride_w = strides[1]; - int pad_h = paddings[0]; - int pad_w = paddings[1]; - - if (global_pooling == true) { - for (int n = 0; n < in_n; ++n) { - for (int c = 0; c < in_c; ++c) { - const float* src = src_ptr + n * size_in_n + c * size_in_c; - float res = src[0]; - if (pooling_type == "max") { - for (int i = 1; i < size_in_c; ++i) { - float cur_val = src[i]; - res = cur_val > res ? cur_val : res; - } - } else if (pooling_type == "avg") { - for (int i = 1; i < size_in_c; ++i) { - float cur_val = src[i]; - res += cur_val; - } - res /= size_in_c; - } - dst_ptr[n * size_out_n + c] = res; - } - } - } else { - for (int n = 0; n < in_n; ++n) { - for (int c = 0; c < in_c; ++c) { - for (int h = 0; h < out_h; ++h) { - int sh = h * stride_h; - int eh = sh + window_h; - sh = (sh - pad_h) < 0 ? 0 : sh - pad_h; - eh = (eh - pad_h) > in_h ? in_h : eh - pad_h; - for (int w = 0; w < out_w; ++w) { - int sw = w * stride_w; - int ew = sw + window_w; - sw = (sw - pad_w) < 0 ? 0 : sw - pad_w; - ew = (ew - pad_w) > in_w ? in_w : ew - pad_w; - int pooling_size = (ew - sw) * (eh - sh); - if (pooling_size == 0) continue; - float res = 0.f; - for (int kh = sh; kh < eh; ++kh) { - for (int kw = sw; kw < ew; ++kw) { - int src_idx = n * size_in_n + c * size_in_c + kh * in_w + kw; - if (kh == sh && kw == sw) { - res = src_ptr[src_idx]; - } else { - if (pooling_type == "max") { - res = res >= src_ptr[src_idx] ? 
res : src_ptr[src_idx]; - } - if (pooling_type == "avg") { - res += src_ptr[src_idx]; - } - } - } - } - if (pooling_type == "avg") { - if (exclusive) { - res /= pooling_size; - } else { - res /= window_h * window_w; - } - } - dst_ptr[n * size_out_n + c * size_out_c + h * out_w + w] = res; - } - } - } - } - } -} - -void test_pool(int bs, - int ic, - int ih, - int iw, - std::string pooling_type, - bool ceil_mode, - bool global_pooling, - bool exclusive, - int ksize, - int stride, - int padding) { - // prepare input&output variables - Scope scope; - std::string x_var_name = "x"; - std::string out_var_name = "out"; - std::string out_ref_var_name = "out_ref"; - auto* x = scope.Var(x_var_name)->GetMutable(); - auto* out = scope.Var(out_var_name)->GetMutable(); - auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); - x->Resize({bs, ic, ih, iw}); - - // initialize input&output data - FillTensor(x); - - // initialize op desc - cpp::OpDesc opdesc; - opdesc.SetType("pool2d"); - opdesc.SetInput("X", {x_var_name}); - opdesc.SetOutput("Out", {out_var_name}); - opdesc.SetAttr("pooling_type", pooling_type); - opdesc.SetAttr("ksize", std::vector({ksize, ksize})); - opdesc.SetAttr("global_pooling", global_pooling); - opdesc.SetAttr("exclusive", exclusive); - opdesc.SetAttr("strides", std::vector({stride, stride})); - opdesc.SetAttr("paddings", std::vector({padding, padding})); - - // create and convert op to NPU model, then run it on NPU - auto op = CreateOp(opdesc, &scope); - LauchOp(op, {x_var_name}, {out_var_name}); - out_ref->CopyDataFrom(*out); - - // execute reference implementation and save to output tensor - pool_ref(op); - - // compare results - auto* out_data = out->mutable_data(); - auto* out_ref_data = out_ref->mutable_data(); - for (int i = 0; i < out->dims().production(); i++) { - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); - } -} - -TEST(NPUBridges, pool) { - for (auto pooling_type : {"max", "avg"}) { - for (auto ceil_mode : {true, false}) { - for (auto global_pooling : {/*true, */ false}) { - for (auto exclusive : {true /*, false*/}) { - for (auto ksize : {2, 3}) { - for (auto stride : {1, 2}) { - for (auto padding : {0, 1}) { - for (auto bs : {1, 3}) { - for (auto ic : {1, 3}) { - for (auto ih : {3, 7}) { - for (auto iw : {3, 7}) { - test_pool(bs, - ic, - ih, - iw, - pooling_type, - ceil_mode, - global_pooling, - exclusive, - ksize, - stride, - padding); - } - } - } - } - } - } - } - } - } - } - } - for (auto pooling_type : {"max", "avg"}) { - for (auto ceil_mode : {true, false}) { - bool global_pooling = true; - bool exclusive = true; - int ksize = 2; - int stride = 1; - int padding = 0; - int bs = 6; - int ic = 6; - int ih = 6; - int iw = 6; - test_pool(bs, - ic, - ih, - iw, - pooling_type, - ceil_mode, - global_pooling, - exclusive, - ksize, - stride, - padding); - } - } -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -USE_LITE_OP(pool2d); -USE_NPU_BRIDGE(pool2d); diff --git a/lite/backends/npu/bridge/registry.cc b/lite/backends/npu/bridge/registry.cc deleted file mode 100644 index 180e0aa46e..0000000000 --- a/lite/backends/npu/bridge/registry.cc +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/npu/bridge/registry.h" -#include <utility> - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -Factory& Factory::Instance() { - static Factory g_npu_bridge; - return g_npu_bridge; -} - -bool Factory::HasType(const std::string& op_type) const { - return map_.count(op_type); -} - -void Factory::Insert(const std::string& op_type, const func_type& func_name) { - map_.insert(std::make_pair(op_type, func_name)); -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle diff --git a/lite/backends/npu/bridge/registry.h b/lite/backends/npu/bridge/registry.h deleted file mode 100644 index 979760c816..0000000000 --- a/lite/backends/npu/bridge/registry.h +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License.
- -#pragma once - -#include <functional> -#include <memory> -#include <string> -#include <type_traits> -#include <unordered_map> -#include "ai_ddk_lib/include/graph/operator_reg.h" -#include "lite/core/op_lite.h" -#include "lite/utils/macros.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -// var_name -> npu node pointer -using node_map_type = - std::unordered_map<std::string, std::shared_ptr<ge::Operator>>; - -using func_type = std::function<node_map_type(const std::shared_ptr<lite::OpLite>, - const node_map_type&)>; -using cvt_map_type = std::unordered_map<std::string, func_type>; -class Factory { - public: - static Factory& Instance(); - - const cvt_map_type& AllFunctions() const { return map_; } - bool HasType(const std::string& op_type) const; - void Insert(const std::string& op_type, const func_type& func_name); - Factory() = default; - - private: - cvt_map_type map_; - DISALLOW_COPY_AND_ASSIGN(Factory); -}; - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -// some platform-independent definitions -#if defined(_WIN32) -#define UNUSED -#define __builtin_expect(EXP, C) (EXP) -#else -#define UNUSED __attribute__((unused)) -#endif - -#define STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE(uniq_name, msg) \ - struct __test_global_namespace_##uniq_name##__ {}; \ - static_assert(std::is_same<::__test_global_namespace_##uniq_name##__, \ - __test_global_namespace_##uniq_name##__>::value, \ - msg) - -#define REGISTER_NPU_BRIDGE(op_type, cvt_func_name) \ - STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ - __reg_npu_bridge_##op_type##__, \ - "REGISTER_NPU_BRIDGE must be called in global namespace only once!"); \ - int __reg_npu_bridge_##op_type##_Insert() { \ - paddle::lite::npu::bridge::Factory::Instance().Insert(#op_type, \ - cvt_func_name); \ - return 0; \ - } - -#define USE_NPU_BRIDGE(op_type) \ - extern int __reg_npu_bridge_##op_type##_Insert(); \ - static int __reg_npu_bridge_##op_type##_Insert_return UNUSED = \ - __reg_npu_bridge_##op_type##_Insert(); diff --git a/lite/backends/npu/bridge/reshape_op.cc b/lite/backends/npu/bridge/reshape_op.cc deleted file mode 100644 index af160f9c72..0000000000 --- a/lite/backends/npu/bridge/reshape_op.cc +++ /dev/null @@ -1,121 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License.
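The REGISTER_NPU_BRIDGE and USE_NPU_BRIDGE macros in registry.h above split registration from linkage: the first defines a global function that inserts the converter into the Factory singleton, the second forces that function to run via a static initializer in whatever binary links the bridge. A minimal sketch of how a new converter would plug into this registry; MyOpConverter and "my_op" are hypothetical names, not part of this patch:

    #include "lite/backends/npu/bridge/registry.h"

    namespace paddle {
    namespace lite {
    namespace npu {
    namespace bridge {

    // Hypothetical converter: consumes the lite op and the already-converted
    // input nodes, returns the NPU output nodes keyed by output variable name.
    node_map_type MyOpConverter(const std::shared_ptr<lite::OpLite> op,
                                const node_map_type& inputs_map) {
      node_map_type outputs_map;
      // ... build ge::Operator nodes for the op here ...
      return outputs_map;
    }

    }  // namespace bridge
    }  // namespace npu
    }  // namespace lite
    }  // namespace paddle

    // Registers the converter under the key "my_op" at static-init time.
    REGISTER_NPU_BRIDGE(my_op, paddle::lite::npu::bridge::MyOpConverter);

    // A client (the NPU graph pass, or LauchOp in test_helper.cc below) then
    // looks the converter up again through the same singleton:
    //   const auto& bridges = paddle::lite::npu::bridge::Factory::Instance();
    //   CHECK(bridges.HasType("my_op"));
    //   auto outputs_map = bridges.AllFunctions().at("my_op")(op, inputs_map);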
- -#include "lite/operators/reshape_op.h" -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/utils.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -node_map_type ReshapeConverter(const std::shared_ptr reshape_op, - const node_map_type& inputs_map) { - auto scope = reshape_op->scope(); - auto op_info = reshape_op->op_info(); - auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); - LOG(INFO) << "Converting " + op_type + "..."; - - // get input, output and op attributes - auto x_var_name = op_info->Input("X").front(); - auto x = scope->FindVar(x_var_name)->GetMutable(); - auto x_dims = x->dims(); - - // create reshape node and set input node from inputs_map - auto reshape_node = std::make_shared(unique_op_type); - CHECK(inputs_map.count(x_var_name)); - reshape_node->set_input_tensor(*inputs_map.at(x_var_name)); - OpList::Global().add(inputs_map.at(x_var_name)); - - // read shape from actual shape tensor as input "w" if 'Shape' is found - if (HasInputArg(op_info, scope, "Shape")) { - auto actual_shape_var_name = op_info->Input("Shape").front(); - if (!inputs_map.count(actual_shape_var_name)) { - auto actual_shape = - scope->FindVar(actual_shape_var_name)->GetMutable(); - auto actual_shape_dims = actual_shape->dims(); - auto actual_shape_data = actual_shape->mutable_data(); - auto shape = - std::vector(actual_shape_data, - actual_shape_data + actual_shape_dims.production()); - auto out_dims = operators::ValidateShape(shape, x_dims); - auto out_shape = out_dims.Vectorize(); - if (out_shape.size() > 4) { - LOG(WARNING) - << "NPU DDK only supports less than 4 dimensions, but Shape has " - << out_shape.size(); - } - auto actual_shape_const_node = - std::make_shared(actual_shape_var_name); - actual_shape_const_node->set_attr_value(CreateTensorAndFillData( - std::vector(out_shape.begin(), out_shape.end()))); - reshape_node->set_input_w(*actual_shape_const_node); - OpList::Global().add(actual_shape_const_node); - } else { - reshape_node->set_input_w(*inputs_map.at(actual_shape_var_name)); - OpList::Global().add(inputs_map.at(actual_shape_var_name)); - } - } else { - auto shape = op_info->GetAttr>("shape"); - auto out_dims = operators::ValidateShape(shape, x_dims); - auto out_shape = out_dims.Vectorize(); - if (out_shape.size() > 4) { - LOG(WARNING) - << "NPU DDK only supports less than 4 dimensions, but shape has " - << out_shape.size(); - } - reshape_node->set_attr_shape( - ge::AttrValue::LIST_INT(out_shape.begin(), out_shape.end())); - } - OpList::Global().add(reshape_node); - - node_map_type outputs_map; - outputs_map[op_info->Output("Out").front()] = reshape_node; - if (op_type == "reshape2") { - // append an extra reshape node to calc XShape - std::vector xshape_dims(x_dims.size() + 1, 1); - for (size_t i = 0; i < x_dims.size(); i++) { - xshape_dims[i + 1] = x_dims[i]; - } - if (xshape_dims.size() > 4) { - LOG(WARNING) - << "NPU DDK only supports less than 4 dimensions, but XShape has " - << xshape_dims.size(); - } - auto xshape_node = - std::make_shared(unique_op_type + "/xshape"); - xshape_node->set_input_tensor(*inputs_map.at(x_var_name)); - xshape_node->set_attr_shape( - ge::AttrValue::LIST_INT(xshape_dims.begin(), 
xshape_dims.end())); - OpList::Global().add(xshape_node); - outputs_map[op_info->Output("XShape").front()] = xshape_node; - } - return outputs_map; -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -REGISTER_NPU_BRIDGE(reshape, paddle::lite::npu::bridge::ReshapeConverter); -REGISTER_NPU_BRIDGE(reshape2, paddle::lite::npu::bridge::ReshapeConverter); diff --git a/lite/backends/npu/bridge/reshape_op_test.cc b/lite/backends/npu/bridge/reshape_op_test.cc deleted file mode 100644 index 4a75961fdf..0000000000 --- a/lite/backends/npu/bridge/reshape_op_test.cc +++ /dev/null @@ -1,202 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/operators/reshape_op.h" -#include -#include -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -void reshape_ref(const std::shared_ptr op) { - auto scope = op->scope(); - auto op_info = op->op_info(); - auto op_type = op_info->Type(); - auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); - auto out = - scope->FindVar(op_info->Output("Out").front())->GetMutable(); - auto x_dims = x->dims(); - auto shape = op_info->GetAttr>("shape"); - auto inplace = op_info->GetAttr("inplace"); - if (op_info->HasInput("Shape")) { - auto actual_shape_var_names = op_info->Input("Shape"); - if (actual_shape_var_names.size() > 0) { - auto actual_shape = scope->FindVar(actual_shape_var_names.front()) - ->GetMutable(); - auto actual_shape_dims = actual_shape->dims(); - auto* actual_shape_data = actual_shape->data(); - shape = - std::vector(actual_shape_data, - actual_shape_data + actual_shape_dims.production()); - } - } - if (inplace) { - out->ShareDataWith(*x); - } else { - out->CopyDataFrom(*x); - } - auto out_dims = operators::ValidateShape(shape, x_dims); - out->Resize(out_dims); -} - -void test_reshape(const std::vector& x_shape, - const std::vector& shape, - const std::vector& act_shape, - bool inplace, - bool reshape2) { - // prepare input&output variables - Scope scope; - std::string x_var_name("x"); - std::string actual_shape_var_name("actual_shape"); - std::string out_var_name("out"); - std::string out_ref_var_name("out_ref"); - std::string xshape_var_name("xshape"); - std::string xshape_ref_var_name("xshape_ref"); - auto x = scope.Var(x_var_name)->GetMutable(); - auto actual_shape = scope.Var(actual_shape_var_name)->GetMutable(); - auto out = scope.Var(out_var_name)->GetMutable(); - auto out_ref = scope.Var(out_ref_var_name)->GetMutable(); - auto xshape = scope.Var(xshape_var_name)->GetMutable(); - auto xshape_ref = scope.Var(xshape_ref_var_name)->GetMutable(); - - x->Resize(x_shape); - - // initialize input&output data - FillTensor(x); - - // initialize op desc - cpp::OpDesc opdesc; - opdesc.SetType(reshape2 ? 
"reshape2" : "reshape"); - opdesc.SetInput("X", {x_var_name}); - opdesc.SetOutput("Out", {out_var_name}); - opdesc.SetAttr("shape", shape); - opdesc.SetAttr("inplace", inplace); - if (!act_shape.empty()) { - int64_t act_shape_size = act_shape.size(); - actual_shape->Resize({act_shape_size}); - memcpy(actual_shape->mutable_data(), - act_shape.data(), - act_shape_size * sizeof(int)); - opdesc.SetInput("Shape", {actual_shape_var_name}); - } - if (reshape2) { - opdesc.SetOutput("XShape", {xshape_var_name}); - } - - // create op and execute reference implementation - auto op = reshape2 ? CreateOp(opdesc, &scope) - : CreateOp(opdesc, &scope); - reshape_ref(op); - out_ref->CopyDataFrom(*out); - if (reshape2) { - xshape_ref->CopyDataFrom(*xshape); - } - - // convert op to NPU model, then run it on NPU - LauchOp(op, - {x_var_name}, - {out_var_name}); // TODO(hong19860320) support XShape for reshape2 - - // compare results - auto out_dims = out->dims(); - auto out_ref_dims = out_ref->dims(); - CHECK_EQ(out_dims.size(), out_ref_dims.size()); - for (int i = 0; i < out_dims.size(); i++) { - CHECK_EQ(out_dims[i], out_ref_dims[i]); - } - auto out_data = out->mutable_data(); - auto out_ref_data = out_ref->mutable_data(); - for (int i = 0; i < out->dims().production(); i++) { - VLOG(5) << i; - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); - } - // if (reshape2) { - // auto xshape_dims = xshape->dims(); - // auto xshape_ref_dims = xshape_ref->dims(); - // CHECK_EQ(xshape_dims.size(), xshape_ref_dims.size()); - // for (size_t i = 0; i < xshape_dims.size(); i++) { - // CHECK_EQ(xshape_dims[i], xshape_ref_dims[i]); - // } - // } -} - -TEST(NPUBridges, reshape) { -#if 1 - std::map, std::vector>> tests = { - {{1, 2, 4, 6}, - {{}, - {-1}, - {48}, - {-1, 48}, - {1, 48}, - {0, 48}, - {48, -1}, - {48, 1}, - {-1, 24}, - {2, 24}, - {24, 0}, - {-1, 0, 3, 2}, - {4, 2, 3, 2}, - {0, -1, 3, 2}, - {1, 8, 3, 2}}}}; - for (auto& i : tests) { - for (auto& shape : i.second) { - if (shape.empty()) { - continue; - } - for (auto& act_shape : i.second) { - for (auto& inplace : {true, false}) { - for (auto& reshape2 : {true, false}) { - std::stringstream ss; - ss << "x:{ "; - for (auto s : i.first) { - ss << s << " "; - } - ss << "} shape:{ "; - for (auto s : shape) { - ss << s << " "; - } - ss << "} act_shape:{ "; - for (auto s : act_shape) { - ss << s << " "; - } - VLOG(3) << ss.str() << "} inplace:" << inplace - << " reshape2:" << reshape2; - test_reshape(i.first, shape, act_shape, inplace, reshape2); - } - } - } - } - } -#else - test_reshape({2, 4, 6}, {-1, 0, 4, 3}, {}, true, true); - test_reshape({1, 232, 14, 14}, {-1, 2, 116, 14, 14}, {}, true, true); -#endif -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -USE_LITE_OP(reshape); -USE_NPU_BRIDGE(reshape); - -USE_LITE_OP(reshape2); -USE_NPU_BRIDGE(reshape2); diff --git a/lite/backends/npu/bridge/scale_op.cc b/lite/backends/npu/bridge/scale_op.cc deleted file mode 100644 index a884b34856..0000000000 --- a/lite/backends/npu/bridge/scale_op.cc +++ /dev/null @@ -1,89 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/operators/scale_op.h" -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/utils.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -node_map_type ScaleConverter(const std::shared_ptr scale_op, - const node_map_type& inputs_map) { - auto scope = scale_op->scope(); - auto op_info = scale_op->op_info(); - auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); - LOG(INFO) << "Converting " + op_type + "..."; - - // get input, output and op attributes - auto x_var_name = op_info->Input("X").front(); - auto x = scope->FindVar(x_var_name)->GetMutable(); - auto x_dims = x->dims().Vectorize(); - CHECK_GE(x_dims.size(), 2); - std::vector scale_bias_shape = {x_dims[1]}; - float scale = op_info->GetAttr("scale"); - float bias = op_info->GetAttr("bias"); - bool bias_after_scale = op_info->GetAttr("bias_after_scale"); - if (!bias_after_scale) { - bias *= scale; - } - - // create scale node and set input node from inputs_map - auto scale_node = std::make_shared(unique_op_type); - CHECK(inputs_map.count(x_var_name)); - scale_node->set_input_x(*inputs_map.at(x_var_name)); - OpList::Global().add(inputs_map.at(x_var_name)); - OpList::Global().add(scale_node); - - // add filter node(fill with scale) - auto filter_const_node = - std::make_shared(unique_op_type + "/filter"); - filter_const_node->set_attr_value( - CreateTensorAndFillData(scale, scale_bias_shape)); - scale_node->set_input_filter(*filter_const_node); - OpList::Global().add(filter_const_node); - - // add bias node(fill with bias) - if (fabs(bias) > 1e-6f) { - auto bias_const_node = - std::make_shared(unique_op_type + "/bias"); - bias_const_node->set_attr_value( - CreateTensorAndFillData(bias, scale_bias_shape)); - scale_node->set_input_bias(*bias_const_node); - scale_node->set_attr_has_bias_value(true); - OpList::Global().add(bias_const_node); - } - - scale_node->set_attr_axis(1); - - node_map_type outputs_map; - outputs_map[op_info->Output("Out").front()] = scale_node; - return outputs_map; -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -REGISTER_NPU_BRIDGE(scale, paddle::lite::npu::bridge::ScaleConverter); diff --git a/lite/backends/npu/bridge/scale_op_test.cc b/lite/backends/npu/bridge/scale_op_test.cc deleted file mode 100644 index f4a241c8d9..0000000000 --- a/lite/backends/npu/bridge/scale_op_test.cc +++ /dev/null @@ -1,123 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/operators/scale_op.h" -#include -#include -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -void scale_ref(const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); - auto out = - scope->FindVar(op_info->Output("Out").front())->GetMutable(); - float scale = op_info->GetAttr("scale"); - float bias = op_info->GetAttr("bias"); - bool bias_after_scale = op_info->GetAttr("bias_after_scale"); - if (!bias_after_scale) { - bias *= scale; - } - auto x_data = x->data(); - auto out_data = out->mutable_data(); - DDim x_dims = x->dims(); - DDim out_dims = out->dims(); - CHECK_EQ(x_dims.production(), out_dims.production()); - for (int i = 0; i < out_dims.production(); i++) { - out_data[i] = x_data[i] * scale + bias; - } -} - -void test_scale(int bs, - int ic, - int ih, - int iw, - bool bias_after_scale, - float scale, - float bias) { - // prepare input&output variables - Scope scope; - std::string x_var_name("x"); - std::string out_var_name("out"); - std::string out_ref_var_name("out_ref"); - auto* x = scope.Var(x_var_name)->GetMutable(); - auto* out = scope.Var(out_var_name)->GetMutable(); - auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); - x->Resize({bs, ic, ih, iw}); - - // initialize input&output data - FillTensor(x); - - // initialize op desc - cpp::OpDesc opdesc; - opdesc.SetType("scale"); - opdesc.SetInput("X", {x_var_name}); - opdesc.SetOutput("Out", {out_var_name}); - opdesc.SetAttr("bias_after_scale", bias_after_scale); - opdesc.SetAttr("scale", scale); - opdesc.SetAttr("bias", bias); - - // create and convert op to NPU model, then run it on NPU - auto op = CreateOp(opdesc, &scope); - LauchOp(op, {x_var_name}, {out_var_name}); - out_ref->CopyDataFrom(*out); - - // execute reference implementation and save to output tensor('out') - scale_ref(op); - - // compare results - auto* out_data = out->mutable_data(); - auto* out_ref_data = out_ref->mutable_data(); - for (int i = 0; i < out->dims().production(); i++) { - VLOG(5) << i; - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); - } -} - -TEST(NPUBridges, scale) { - for (auto bs : {1, 3}) { - for (auto ic : {1, 3}) { - for (auto ih : {3, 4}) { - for (auto iw : {4, 3}) { - for (auto bias_after_scale : {true, false}) { - for (auto scale : {-1.0f, 5.0f}) { - for (auto bias : {-2.0f, 30.0f}) { - VLOG(3) << "bs: " << bs << " ic: " << ic << " ih: " << ih - << " iw: " << iw - << " bias_after_scale: " << bias_after_scale - << " scale: " << scale << " bias: " << bias; - test_scale(bs, ic, ih, iw, bias_after_scale, scale, bias); - } - } - } - } - } - } - } -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -USE_LITE_OP(scale); -USE_NPU_BRIDGE(scale); diff --git a/lite/backends/npu/bridge/shuffle_channel_op.cc b/lite/backends/npu/bridge/shuffle_channel_op.cc deleted file mode 100644 index 
ac4ae58d34..0000000000 --- a/lite/backends/npu/bridge/shuffle_channel_op.cc +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/operators/shuffle_channel_op.h" -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/utils.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -node_map_type ShuffleChannelConverter( - const std::shared_ptr<lite::OpLite> shuffle_channel_op, - const node_map_type& inputs_map) { - auto scope = shuffle_channel_op->scope(); - auto op_info = shuffle_channel_op->op_info(); - auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); - LOG(INFO) << "Converting " + op_type + "..."; - - std::shared_ptr<ge::op::ShuffleChannel> shuffle_channel_node = - std::make_shared<ge::op::ShuffleChannel>(unique_op_type); - auto x_var_name = op_info->Input("X").front(); - - shuffle_channel_node->set_input_x(*inputs_map.at(x_var_name)); - shuffle_channel_node->set_attr_group(op_info->GetAttr<int>("group")); - - OpList::Global().add(inputs_map.at(x_var_name)); - OpList::Global().add(shuffle_channel_node); - - node_map_type outputs_map; - outputs_map[op_info->Output("Out").front()] = shuffle_channel_node; - return outputs_map; -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -REGISTER_NPU_BRIDGE(shuffle_channel, - paddle::lite::npu::bridge::ShuffleChannelConverter); diff --git a/lite/backends/npu/bridge/shuffle_channel_op_test.cc b/lite/backends/npu/bridge/shuffle_channel_op_test.cc deleted file mode 100644 index c37c97a3b4..0000000000 --- a/lite/backends/npu/bridge/shuffle_channel_op_test.cc +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License.
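The group attribute read by the converter above selects an interleaving that is equivalent to transposing a group x (C/group) view of the channel axis: with C = 6 channels and group = 2, channel order 0 1 2 3 4 5 becomes 0 3 1 4 2 5. A standalone sketch of just that index mapping (illustration only, consistent with the reference implementation in the test below):

    #include <cstdio>
    #include <vector>

    int main() {
      const int C = 6, group = 2, per_group = C / group;
      std::vector<int> shuffled(C);
      for (int i = 0; i < group; ++i) {
        for (int j = 0; j < per_group; ++j) {
          // output channel j * group + i reads from input channel
          // i * per_group + j, i.e. the group x per_group grid is transposed
          shuffled[j * group + i] = i * per_group + j;
        }
      }
      for (int c : shuffled) std::printf("%d ", c);  // prints: 0 3 1 4 2 5
      return 0;
    }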
- -#include "lite/operators/shuffle_channel_op.h" -#include -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -void shuffle_channel_ref( - const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); - auto out = - scope->FindVar(op_info->Output("Out").front())->GetMutable(); - auto x_data = x->mutable_data(); - auto out_data = out->mutable_data(); - int group = op_info->GetAttr("group"); - auto x_dims = x->dims(); - - int n_size = x_dims.production() / x_dims[0]; - int c_size = n_size / x_dims[1]; - for (int n = 0; n < x_dims[0]; n++) { - int g_num = x_dims[1] / group; - auto tmp_out_data = out_data; - for (int g = 0; g < g_num; g++) { - auto tmp_x_data = x_data + g * c_size; - for (int i = 0; i < group; i++) { - std::memcpy(tmp_out_data, - tmp_x_data + i * g_num * c_size, - c_size * sizeof(float)); - tmp_out_data += c_size; - } - } - x_data += n_size; - out_data += n_size; - } -} - -void test_shuffle_channel(int bs, int ic, int ih, int iw, int group) { - // prepare input&output variables - Scope scope; - std::string x_var_name = "x"; - std::string out_var_name = "out"; - std::string out_ref_var_name = "out_ref"; - auto* x = scope.Var(x_var_name)->GetMutable(); - auto* out = scope.Var(out_var_name)->GetMutable(); - auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); - x->Resize({bs, ic, ih, iw}); - - // initialize input&output data - FillTensor(x); - - // initialize op desc - cpp::OpDesc opdesc; - opdesc.SetType("shuffle_channel"); - opdesc.SetInput("X", {x_var_name}); - opdesc.SetOutput("Out", {out_var_name}); - opdesc.SetAttr("group", group); - - // create and convert op to NPU model, then run it on NPU - auto op = CreateOp(opdesc, &scope); - LauchOp(op, {x_var_name}, {out_var_name}); - out_ref->CopyDataFrom(*out); - - // execute reference implementation and save to output tensor - shuffle_channel_ref(op); - - // compare results - auto* out_data = out->mutable_data(); - auto* out_ref_data = out_ref->mutable_data(); - for (int i = 0; i < out->dims().production(); i++) { - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); - } -} - -TEST(NPUBridges, softmax) { - for (auto bs : {1, 4}) { - for (auto ic : {1, 24, 35}) { - for (auto ih : {1, 4}) { - for (auto iw : {1, 4}) { - for (auto group : {1, 3, 7, 24, 35}) { - if (ic % group != 0) continue; - test_shuffle_channel(bs, ic, ih, iw, group); - } - } - } - } - } -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -USE_LITE_OP(shuffle_channel); -USE_NPU_BRIDGE(shuffle_channel); diff --git a/lite/backends/npu/bridge/softmax_op.cc b/lite/backends/npu/bridge/softmax_op.cc deleted file mode 100644 index 6c556e6ca7..0000000000 --- a/lite/backends/npu/bridge/softmax_op.cc +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/operators/softmax_op.h" -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/utils.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -node_map_type SoftmaxConverter(const std::shared_ptr<lite::OpLite> softmax_op, - const node_map_type& inputs_map) { - auto scope = softmax_op->scope(); - auto op_info = softmax_op->op_info(); - auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); - LOG(INFO) << "Converting " + op_type + "..."; - - std::shared_ptr<ge::op::Softmax> softmax_node = - std::make_shared<ge::op::Softmax>(unique_op_type); - auto x_var_name = op_info->Input("X").front(); - - auto x_dims = scope->FindVar(x_var_name)->GetMutable<Tensor>()->dims(); - auto axis = op_info->GetAttr<int>("axis"); - if (x_dims.size() > 3) { - CHECK(!(axis == 2 && x_dims[3] > 1)) - << "unsupported npu softmax params: axis = " << axis - << ", x_w = " << x_dims[3]; - } - - CHECK(inputs_map.count(x_var_name)); - softmax_node->set_input_x(*inputs_map.at(x_var_name)); - softmax_node->set_attr_axis(axis); - - OpList::Global().add(inputs_map.at(x_var_name)); - OpList::Global().add(softmax_node); - - node_map_type outputs_map; - outputs_map[op_info->Output("Out").front()] = softmax_node; - return outputs_map; -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -REGISTER_NPU_BRIDGE(softmax, paddle::lite::npu::bridge::SoftmaxConverter); diff --git a/lite/backends/npu/bridge/softmax_op_test.cc b/lite/backends/npu/bridge/softmax_op_test.cc deleted file mode 100644 index c3114f5360..0000000000 --- a/lite/backends/npu/bridge/softmax_op_test.cc +++ /dev/null @@ -1,134 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License.
- -#include "lite/operators/softmax_op.h" -#include -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -template -void softmax_ref(const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); - auto out = - scope->FindVar(op_info->Output("Out").front())->GetMutable(); - auto x_data = x->data(); - auto out_data = out->mutable_data(); - DDim x_dims = x->dims(); - - auto x_rank = x_dims.size(); - int axis = op_info->GetAttr("axis"); - if (axis < 0) { - axis += x_rank; - } - int axis_size = x_dims[axis]; - int outer_num = x_dims.Slice(0, axis).production(); - int inner_num = x_dims.Slice(axis + 1, x_rank).production(); - int compute_size = outer_num * inner_num; - for (int i = 0; i < compute_size; i++) { - int idx_inner = i % inner_num; - int idx_outer = (i / inner_num) * axis_size; - int start = idx_outer * inner_num + idx_inner; - int offset; - - offset = start; - dtype max_data = std::numeric_limits::lowest(); - for (int j = 0; j < axis_size; j++) { - max_data = x_data[offset] > max_data ? x_data[offset] : max_data; - offset += inner_num; - } - - offset = start; - dtype sum_data = (dtype)0; - for (int j = 0; j < axis_size; j++) { - out_data[offset] = exp(x_data[offset] - max_data); - sum_data += out_data[offset]; - offset += inner_num; - } - - offset = start; - for (int j = 0; j < axis_size; j++) { - out_data[offset] /= sum_data; - offset += inner_num; - } - } -} - -void test_softmax(int bs, int ic, int ih, int iw, int axis) { - // prepare input&output variables - Scope scope; - std::string x_var_name = "x"; - std::string out_var_name = "out"; - std::string out_ref_var_name = "out_ref"; - auto* x = scope.Var(x_var_name)->GetMutable(); - auto* out = scope.Var(out_var_name)->GetMutable(); - auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); - x->Resize({bs, ic, ih, iw}); - - // initialize input&output data - FillTensor(x); - - // initialize op desc - cpp::OpDesc opdesc; - opdesc.SetType("softmax"); - opdesc.SetInput("X", {x_var_name}); - opdesc.SetOutput("Out", {out_var_name}); - opdesc.SetAttr("axis", axis); - - // create and convert op to NPU model, then run it on NPU - auto op = CreateOp(opdesc, &scope); - LauchOp(op, {x_var_name}, {out_var_name}); - out_ref->CopyDataFrom(*out); - - // execute reference implementation and save to output tensor - softmax_ref(op); - - // compare results - auto* out_data = out->mutable_data(); - auto* out_ref_data = out_ref->mutable_data(); - for (int i = 0; i < out->dims().production(); i++) { - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); - } -} - -TEST(NPUBridges, softmax) { - for (auto bs : {1, 4, 7}) { - for (auto ic : {1, 4, 7}) { - for (auto ih : {1, 4, 7}) { - for (auto iw : {1, 4, 7}) { - for (auto axis : {-3, -1, 0, 1, 2, 3}) { - // npu softmax exists bugs when axis is 2 and iw > 1 - if (axis == 2 && iw > 1) continue; - test_softmax(bs, ic, ih, iw, axis); - } - } - } - } - } -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -USE_LITE_OP(softmax); -USE_NPU_BRIDGE(softmax); diff --git a/lite/backends/npu/bridge/split_op.cc b/lite/backends/npu/bridge/split_op.cc deleted file mode 100644 index 86de45fedf..0000000000 --- a/lite/backends/npu/bridge/split_op.cc +++ /dev/null @@ -1,86 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. 
All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/operators/split_op.h" -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/utils.h" -#include "lite/backends/npu/npu_helper.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { -node_map_type SplitConverter(const std::shared_ptr split_op, - const node_map_type& inputs_map) { - lite::Scope* scope = split_op->scope(); - const lite::OpInfo* op_info = split_op->op_info(); - auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); - LOG(INFO) << "Converting " << op_type << " ... "; - - auto x_var_name = op_info->Input("X").front(); - auto axis = op_info->GetAttr("axis"); - auto num = op_info->GetAttr("num"); - auto sections = op_info->GetAttr>("sections"); - int64_t sections_num = static_cast(sections.size()); - - std::shared_ptr output_node = - std::make_shared(unique_op_type); - CHECK(inputs_map.count(x_var_name)); - output_node->set_input_x(*inputs_map.at(x_var_name)); - OpList::Global().add(inputs_map.at(x_var_name)); - - output_node->set_attr_axis(static_cast(axis)); - if (num > 0) { - output_node->set_attr_output_num(static_cast(num)); - } else { - output_node->set_attr_output_num(sections_num); - auto size_split = ge::AttrValue::LIST_INT(sections.begin(), sections.end()); - output_node->set_attr_size_split(size_split); - } - - node_map_type outputs_map; - auto out_var_names = op_info->Output("Out"); - output_node->create_dynamic_output_y(out_var_names.size()); - int index = 1; - for (auto out_var_name : out_var_names) { - auto const_node = std::make_shared( - unique_op_type + "/const_zero" + std::to_string(index)); - const_node->set_attr_value(CreateTensorAndFillData(0)); - OpList::Global().add(const_node); - auto add_node = std::make_shared(unique_op_type + "/add" + - std::to_string(index)); - add_node->set_input_x1(*output_node, "y" + std::to_string(index)); - add_node->set_input_x2(*const_node); - outputs_map[out_var_name] = add_node; - OpList::Global().add(add_node); - index++; - } - - OpList::Global().add(output_node); - return outputs_map; -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -REGISTER_NPU_BRIDGE(split, paddle::lite::npu::bridge::SplitConverter); diff --git a/lite/backends/npu/bridge/split_op_test.cc b/lite/backends/npu/bridge/split_op_test.cc deleted file mode 100644 index 91629a70fc..0000000000 --- a/lite/backends/npu/bridge/split_op_test.cc +++ /dev/null @@ -1,170 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/operators/split_op.h" -#include -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/test_helper.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -template -void split_ref(const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); - int num = op_info->GetAttr("num"); - int axis = op_info->GetAttr("axis"); - std::vector sections = op_info->GetAttr>("sections"); - std::vector output_vec; - auto output = op_info->Output("Out"); - for (auto out_var : output) { - output_vec.push_back(scope->Var(out_var)->GetMutable()); - } - auto in_dims = x->dims(); - auto rank = in_dims.size(); - int outs_number = output_vec.size(); - std::vector outs_dims; - outs_dims.reserve(outs_number); - if (axis < 0) { - axis += rank; - } - if (num > 0) { - int out_axis_dim = in_dims[axis] / num; - for (int i = 0; i < outs_number; ++i) { - auto dim = in_dims; - dim[axis] = out_axis_dim; - outs_dims.push_back(dim); - } - } else if (sections.size() > 0) { - for (size_t i = 0; i < outs_number; ++i) { - auto dim = in_dims; - dim[axis] = sections[i]; - outs_dims.push_back(dim); - } - } - for (int j = 0; j < outs_dims.size(); ++j) { - output_vec[j]->Resize(outs_dims[j]); - } - - const dtype* din = x->mutable_data(); - std::vector in_strides(in_dims.size()); - in_strides[in_dims.size() - 1] = in_dims[in_dims.size() - 1]; - for (int i = in_dims.size() - 2; i >= 0; --i) { - in_strides[i] = in_strides[i + 1] * in_dims[i]; - } - - int input_offset = 0; - for (auto out : output_vec) { - auto out_dim = out->dims(); - std::vector out_strides(out_dim.size()); - out_strides[out_dim.size() - 1] = out_dim[out_dim.size() - 1]; - for (int i = out_dim.size() - 2; i >= 0; --i) { - out_strides[i] = out_strides[i + 1] * out_dim[i]; - } - - dtype* out_data = out->mutable_data(); - int before = out_strides[0] / out_strides[axis]; - int in_after = in_strides[axis]; - int out_after = out_strides[axis]; - - for (int i = 0; i < before; ++i) { - std::memcpy(out_data + i * out_after, - din + input_offset + i * in_after, - sizeof(dtype) * out_after); - } - input_offset += out_strides[axis]; - } -} - -void test_split(int bs, - int ic, - int ih, - int iw, - int axis, - int num, - std::vector sections) { - const auto& bridges = lite::npu::bridge::Factory::Instance(); - const auto& supported_lists = bridges.AllFunctions(); - CHECK(bridges.HasType("split")); - // prepare input&output variables - std::string x_var_name = "x"; - std::string out_var_name_1 = "out_1"; - std::string out_var_name_2 = "out_2"; - std::string out_ref_var_name_1 = "out_ref_1"; - std::string out_ref_var_name_2 = "out_ref_2"; - - Scope scope; - auto* x = scope.Var(x_var_name)->GetMutable(); - auto* out_1 = scope.Var(out_var_name_1)->GetMutable(); - auto* out_2 = scope.Var(out_var_name_2)->GetMutable(); - 
auto* out_ref_1 = scope.Var(out_ref_var_name_1)->GetMutable(); - auto* out_ref_2 = scope.Var(out_ref_var_name_2)->GetMutable(); - x->Resize({bs, ic, ih, iw}); - // initialize input&output data - FillTensor(x); - - // initialize op desc - cpp::OpDesc opdesc; - opdesc.SetType("split"); - opdesc.SetInput("X", {x_var_name}); - opdesc.SetOutput("Out", {out_var_name_1, out_var_name_2}); - opdesc.SetAttr("axis", axis); - opdesc.SetAttr("sections", sections); - opdesc.SetAttr("num", num); - // create and convert op to NPU model, then run it on NPU - auto op = CreateOp(opdesc, &scope); - LauchOp(op, {x_var_name}, {out_var_name_1, out_var_name_2}); - out_ref_1->CopyDataFrom(*out_1); - out_ref_2->CopyDataFrom(*out_2); - // execute reference implementation and save to output tensor - split_ref(op); - - // compare results - auto* out_data_1 = out_1->mutable_data(); - auto* out_data_2 = out_2->mutable_data(); - auto* out_ref_data_1 = out_ref_1->mutable_data(); - auto* out_ref_data_2 = out_ref_2->mutable_data(); - for (int i = 0; i < out_1->dims().production(); i++) { - VLOG(5) << i; - EXPECT_NEAR(out_data_1[i], out_ref_data_1[i], 5e-4); - } - for (int i = 0; i < out_2->dims().production(); i++) { - VLOG(5) << i; - EXPECT_NEAR(out_data_2[i], out_ref_data_2[i], 5e-4); - } -} - -TEST(NPUBridges, split) { - test_split(4, 2, 3, 1, 0, 2, {}); - test_split(4, 2, 3, 1, 0, 0, {3, 1}); - test_split(4, 6, 3, 1, 1, 2, {}); - test_split(4, 6, 3, 1, 1, 0, {2, 4}); - test_split(4, 2, 2, 1, 2, 2, {}); - test_split(4, 2, 6, 1, 2, 0, {3, 3}); - test_split(4, 2, 3, 4, 3, 2, {}); - test_split(4, 2, 3, 6, 3, 0, {5, 1}); -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -USE_LITE_OP(split); -USE_NPU_BRIDGE(split); diff --git a/lite/backends/npu/bridge/test_helper.cc b/lite/backends/npu/bridge/test_helper.cc deleted file mode 100644 index 3d6dc03481..0000000000 --- a/lite/backends/npu/bridge/test_helper.cc +++ /dev/null @@ -1,101 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/backends/npu/bridge/test_helper.h" -#include -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/utils.h" -#include "lite/core/op_registry.h" -#include "lite/operators/graph_op.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -void LauchOp(const std::shared_ptr op, - const std::vector& input_var_names, - const std::vector& output_var_names) { - auto scope = op->scope(); - auto op_type = op->op_info()->Type(); - - // convert op to IR graph - const auto& bridges = lite::npu::bridge::Factory::Instance(); - const auto& supported_lists = bridges.AllFunctions(); - CHECK(bridges.HasType(op_type)); - - node_map_type inputs_map; - for (auto input_var_name : input_var_names) { - auto input = scope->FindVar(input_var_name)->GetMutable(); - ge::TensorDesc input_desc( - ge::Shape(input->dims().Vectorize()), ge::FORMAT_NCHW, ge::DT_FLOAT); - auto input_node = std::make_shared(input_var_name); - input_node->update_input_desc_x(input_desc); - npu::OpList::Global().add(input_node); - inputs_map[input_var_name] = input_node; - } - auto outputs_map = supported_lists.at(op_type)(op, inputs_map); - CHECK_GT(outputs_map.size(), 0); - - // compile IR graph to om model - std::vector graph_inputs; - for (auto input_var_name : input_var_names) { - graph_inputs.push_back(*inputs_map[input_var_name]); - } - std::vector graph_outputs; - for (auto output_var_name : output_var_names) { - graph_outputs.push_back(*outputs_map[output_var_name]); - } - std::string model_name(UniqueName("test_" + op_type) + ".om"); - CHECK(npu::BuildNPUClient(graph_inputs, graph_outputs, model_name)); - - // create graph op and set inputs and outputs - cpp::OpDesc graph_op_desc; - graph_op_desc.SetType("graph_op"); - graph_op_desc.SetInput("Inputs", input_var_names); - graph_op_desc.SetOutput("Outputs", output_var_names); - graph_op_desc.SetAttr("model_name", model_name); - - auto graph_op = - std::make_shared(graph_op_desc.Type()); - graph_op->SetValidPlaces({Place{TARGET(kNPU), PRECISION(kFloat)}}); - CHECK(graph_op->Attach(graph_op_desc, scope)); - CHECK(graph_op->CheckShape()); - CHECK(graph_op->InferShape()); - - // create graph op kernel and set NPU context - auto graph_kernels = - graph_op->CreateKernels({Place{TARGET(kNPU), PRECISION(kFloat)}}); - CHECK(!graph_kernels.empty()); - auto graph_kernel = - std::move(graph_kernels.front()); // use the first kernel by default - auto graph_ctx = ContextScheduler::Global().NewContext(TARGET(kNPU)); - graph_kernel->SetContext(std::move(graph_ctx)); - - // perform graph op kernel and store to output variables - graph_kernel->Launch(); - - // release all of resources of generated model - npu::OpList::Global().clear(); - npu::DeviceInfo::Global().Clear(); -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -USE_LITE_OP(graph_op); -USE_LITE_KERNEL(graph_op, kNPU, kFloat, kNCHW, def); diff --git a/lite/backends/npu/bridge/test_helper.h b/lite/backends/npu/bridge/test_helper.h deleted file mode 100644 index 537f737640..0000000000 --- a/lite/backends/npu/bridge/test_helper.h +++ /dev/null @@ -1,64 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include <memory> -#include <random> -#include <string> -#include <vector> -#include "lite/core/op_lite.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -template <typename T> -std::shared_ptr<T> CreateOp(const cpp::OpDesc& opdesc, lite::Scope* scope) { - auto op = std::make_shared<T>(opdesc.Type()); - op->SetValidPlaces({Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}, - Place{TARGET(kNPU), PRECISION(kFloat)}}); - CHECK(op->Attach(opdesc, scope)); - CHECK(op->CheckShape()); - CHECK(op->InferShape()); - return op; -} - -// T is the target data type -// R is the range data type, e.g. int, half -template <typename T, typename R = T> -void FillTensor(Tensor* x, - T lower = static_cast<T>(-2), - T upper = static_cast<T>(2)) { - static unsigned int seed = 100; - std::mt19937 rng(seed++); - std::uniform_real_distribution<double> uniform_dist(0, 1); - - T* x_data = x->mutable_data<T>(); - for (int i = 0; i < x->dims().production(); ++i) { - auto r = uniform_dist(rng) * (upper - lower) + lower; - x_data[i] = static_cast<T>(static_cast<R>(r)); - } -} - -void LauchOp(const std::shared_ptr<lite::OpLite> op, - const std::vector<std::string>& input_var_names, - const std::vector<std::string>& output_var_names); - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle diff --git a/lite/backends/npu/bridge/transpose_op.cc b/lite/backends/npu/bridge/transpose_op.cc deleted file mode 100644 index ad00e599ce..0000000000 --- a/lite/backends/npu/bridge/transpose_op.cc +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License.
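Every bridge test combines the helpers declared in test_helper.h above in the same way: build the op from a cpp::OpDesc, convert and run it on the NPU via LauchOp, then diff the device output against a host reference. A condensed sketch of that pattern for a hypothetical relu test (the ReluOp class name and a registered relu bridge are assumptions here; the per-op *_op_test.cc files in this patch are the authoritative versions):

    #include <gtest/gtest.h>
    #include "lite/backends/npu/bridge/registry.h"
    #include "lite/backends/npu/bridge/test_helper.h"
    #include "lite/core/op_registry.h"
    #include "lite/operators/relu_op.h"

    namespace paddle {
    namespace lite {
    namespace npu {
    namespace bridge {

    TEST(NPUBridges, relu_sketch) {
      // prepare input/output variables
      Scope scope;
      auto* x = scope.Var("x")->GetMutable<Tensor>();
      auto* out = scope.Var("out")->GetMutable<Tensor>();
      auto* out_ref = scope.Var("out_ref")->GetMutable<Tensor>();
      x->Resize({1, 2, 3, 4});
      FillTensor<float>(x);

      // describe the op exactly as the optimizer would
      cpp::OpDesc opdesc;
      opdesc.SetType("relu");
      opdesc.SetInput("X", {"x"});
      opdesc.SetOutput("Out", {"out"});

      // create the op, convert it to an NPU model, then run it on the device
      auto op = CreateOp<operators::ReluOp>(opdesc, &scope);
      LauchOp(op, {"x"}, {"out"});
      out_ref->CopyDataFrom(*out);

      // a host-side relu reference would overwrite `out` here, after which
      // each element is compared, e.g.
      // EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5);
    }

    }  // namespace bridge
    }  // namespace npu
    }  // namespace lite
    }  // namespace paddle

    USE_LITE_OP(relu);
    USE_NPU_BRIDGE(relu);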
- -#include "lite/operators/transpose_op.h" -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/utils.h" - -namespace paddle { -namespace lite { -namespace npu { -namespace bridge { - -node_map_type TransposeConverter( - const std::shared_ptr transpose_op, - const node_map_type& inputs_map) { - auto scope = transpose_op->scope(); - auto op_info = transpose_op->op_info(); - auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); - LOG(INFO) << "Converting " + op_type + "..."; - - std::shared_ptr transpose_node = - std::make_shared(unique_op_type); - auto x_var_name = op_info->Input("X").front(); - - // paddlelite doesn't have this input - // w must be set, but it does nothing - auto w_var_name = unique_op_type + "/w"; - auto* w = scope->Var(w_var_name)->GetMutable(); - w->Resize({1}); - auto* w_data = w->mutable_data(); - for (int i = 0; i < w->numel(); i++) { - w_data[i] = 1.f; - } - auto npu_w = std::make_shared(w_var_name); - npu_w->set_attr_value(CvtFromLiteTensor(w)); - OpList::Global().add(npu_w); - - auto axis = op_info->GetAttr>("axis"); - auto npu_axis = ge::AttrValue::LIST_INT(axis.begin(), axis.end()); - - CHECK(inputs_map.count(x_var_name)); - transpose_node->set_input_x(*inputs_map.at(x_var_name)); - transpose_node->set_input_w(*npu_w); - transpose_node->set_attr_order(npu_axis); - - OpList::Global().add(inputs_map.at(x_var_name)); - OpList::Global().add(transpose_node); - - node_map_type outputs_map; - outputs_map[op_info->Output("Out").front()] = transpose_node; - return outputs_map; -} - -} // namespace bridge -} // namespace npu -} // namespace lite -} // namespace paddle - -REGISTER_NPU_BRIDGE(transpose, paddle::lite::npu::bridge::TransposeConverter); -REGISTER_NPU_BRIDGE(transpose2, paddle::lite::npu::bridge::TransposeConverter); diff --git a/lite/backends/npu/bridge/transpose_op_test.cc b/lite/backends/npu/bridge/transpose_op_test.cc deleted file mode 100644 index 9bbfb11123..0000000000 --- a/lite/backends/npu/bridge/transpose_op_test.cc +++ /dev/null @@ -1,151 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
diff --git a/lite/backends/npu/bridge/transpose_op_test.cc b/lite/backends/npu/bridge/transpose_op_test.cc
deleted file mode 100644
index 9bbfb11123..0000000000
--- a/lite/backends/npu/bridge/transpose_op_test.cc
+++ /dev/null
@@ -1,151 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/operators/transpose_op.h"
-#include <gtest/gtest.h>
-#include "lite/backends/npu/bridge/registry.h"
-#include "lite/backends/npu/bridge/test_helper.h"
-#include "lite/core/op_registry.h"
-
-namespace paddle {
-namespace lite {
-namespace npu {
-namespace bridge {
-
-int data_index(std::vector<int> pos, DDimLite dims) {
-  int d1 = dims[1];
-  int d2 = dims[2];
-  int d3 = dims[3];
-  return pos[3] + pos[2] * d3 + pos[1] * d3 * d2 + pos[0] * d3 * d2 * d1;
-}
-
-std::vector<int> pos_trans(std::vector<int> in_pos, std::vector<int> axis) {
-  std::vector<int> out_pos(in_pos.size());
-  for (int i = 0; i < axis.size(); i++) {
-    out_pos[axis[i]] = in_pos[i];
-  }
-  return out_pos;
-}
-
-void transpose_ref(const std::shared_ptr<operators::TransposeOp> op) {
-  Scope* scope = op->scope();
-  const OpInfo* op_info = op->op_info();
-  auto input =
-      scope->FindVar(op_info->Input("X").front())->GetMutable<Tensor>();
-  auto output =
-      scope->FindVar(op_info->Output("Out").front())->GetMutable<Tensor>();
-  auto x_dims = input->dims();
-  auto y_dims = output->dims();
-  auto axis = op_info->GetAttr<std::vector<int>>("axis");
-
-  auto* input_data = input->data<float>();
-  auto* output_data = output->mutable_data<float>();
-
-  int input_n = x_dims[0];
-  int input_c = x_dims[1];
-  int input_h = x_dims[2];
-  int input_w = x_dims[3];
-  int output_n = y_dims[0];
-  int output_c = y_dims[1];
-  int output_h = y_dims[2];
-  int output_w = y_dims[3];
-
-  for (int n = 0; n < input_n; ++n) {
-    for (int c = 0; c < input_c; ++c) {
-      for (int h = 0; h < input_h; ++h) {
-        for (int w = 0; w < input_w; ++w) {
-          std::vector<int> in_pos{n, c, h, w};
-          std::vector<int> out_pos = pos_trans(in_pos, axis);
-          int in_index = data_index(in_pos, x_dims);
-          int out_index = data_index(out_pos, y_dims);
-          output_data[out_index] = input_data[in_index];
-        }
-      }
-    }
-  }
-}
-
-void test_transpose(int bs, int ic, int ih, int iw, std::vector<int> axis) {
-  // prepare input&output variables
-  Scope scope;
-  std::string x_var_name = "x";
-  std::string out_var_name = "out";
-  std::string out_ref_var_name = "out_ref";
-  auto* x = scope.Var(x_var_name)->GetMutable<Tensor>();
-  auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
-  auto* out_ref = scope.Var(out_ref_var_name)->GetMutable<Tensor>();
-  x->Resize({bs, ic, ih, iw});
-
-  // initialize input&output data
-  FillTensor<float>(x);
-
-  // initialize op desc
-  cpp::OpDesc opdesc;
-  opdesc.SetType("transpose");
-  opdesc.SetInput("X", {x_var_name});
-  opdesc.SetOutput("Out", {out_var_name});
-  opdesc.SetAttr("axis", axis);
-
-  // create and convert op to NPU model, then run it on NPU
-  auto op = CreateOp<operators::TransposeOp>(opdesc, &scope);
-  LauchOp(op, {x_var_name}, {out_var_name});
-  out_ref->CopyDataFrom(*out);
-
-  // execute reference implementation and save to output tensor
-  transpose_ref(op);
-
-  // compare results
-  auto* out_data = out->mutable_data<float>();
-  auto* out_ref_data = out_ref->mutable_data<float>();
-  for (int i = 0; i < out->dims().production(); i++) {
-    EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2);
-  }
-}
-
-TEST(NPUBridges, transpose) {
-#if 0
-  for (auto bs : {1, 4, 7}) {
-    for (auto ic : {1, 4, 7}) {
-      for (auto ih : {1, 4, 7}) {
-        for (auto iw : {1, 4, 7}) {
-          for (auto axis : {std::vector<int>{0, 1, 2, 3},
-                            std::vector<int>{0, 1, 3, 2},
-                            std::vector<int>{0, 3, 1, 2},
-                            std::vector<int>{1, 2, 3, 0},
-                            std::vector<int>{3, 2, 1, 0},
-                            std::vector<int>{2, 3, 1, 0}}) {
-            test_transpose(bs, ic, ih, iw, axis);
-          }
-        }
-      }
-    }
-  }
-#endif
-  test_transpose(2, 3, 4, 5, std::vector<int>{0, 1, 3, 2});
-  // test_transpose(2, 3, 4, 5, std::vector<int>{0, 1, 2, 3});
-  // test_transpose(2, 2, 2, 2, std::vector<int>{0,1,3,2});
-  // test_transpose(1, 1, 2, 2, std::vector<int>{0,1,3,2});
-  // test_transpose(1, 1, 1, 2, std::vector<int>{0,1,2,3});
-}
-
-}  // namespace bridge
-}  // namespace npu
-}  // namespace lite
-}  // namespace paddle
-
-USE_LITE_OP(transpose);
-USE_NPU_BRIDGE(transpose);
-
-USE_LITE_OP(transpose2);
-USE_NPU_BRIDGE(transpose2);
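Two remarks on the reference implementation above, plus a rank-generic sketch of its index helper. First, pos_trans writes out_pos[axis[i]] = in_pos[i], which is the inverse of the out_dims[i] = in_dims[axis[i]] convention; the one enabled case, axis = {0, 1, 3, 2}, is its own inverse, so the two conventions coincide there, which may be why the broader axis sweep is compiled out under #if 0. Second, data_index hard-codes 4-D strides; the same offset can be computed for any rank:

#include <vector>

// Row-major linear offset of `pos` in a tensor of shape `dims`; for 4-D
// shapes this matches data_index() above. Sketch only, not in the patch.
int LinearIndex(const std::vector<int>& pos, const std::vector<int>& dims) {
  int index = 0;
  for (size_t i = 0; i < dims.size(); ++i) {
    index = index * dims[i] + pos[i];
  }
  return index;
}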
diff --git a/lite/backends/npu/bridge/utils.cc b/lite/backends/npu/bridge/utils.cc
deleted file mode 100644
index 8abd7dbda4..0000000000
--- a/lite/backends/npu/bridge/utils.cc
+++ /dev/null
@@ -1,137 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/backends/npu/bridge/utils.h"
-#include <algorithm>
-#include <mutex>  // NOLINT
-#include <string>
-#include <unordered_map>
-#include "ai_ddk_lib/include/graph/op/all_ops.h"  // for ge::op::Data
-#include "ai_ddk_lib/include/graph/tensor.h"      // for ge::TensorUtils
-#include "lite/core/op_lite.h"
-
-namespace paddle {
-namespace lite {
-namespace npu {
-namespace bridge {
-
-std::string UniqueName(const std::string& prefix) {
-  static std::mutex counter_mtx;
-  static std::unordered_map<std::string, int> counter_map;
-  std::unique_lock<std::mutex> counter_lck(counter_mtx);
-  int counter = 1;
-  auto it = counter_map.find(prefix);
-  if (it == counter_map.end()) {
-    counter_map[prefix] = counter;
-  } else {
-    counter = ++(it->second);
-  }
-  return prefix + "_" + std::to_string(counter);
-}
-
-ge::DataType PrecisionConverter(PrecisionType itype) {
-  ge::DataType otype = ge::DT_FLOAT;
-  switch (itype) {
-    case PRECISION(kFloat):
-      otype = ge::DT_FLOAT;
-      break;
-    case PRECISION(kInt8):
-      otype = ge::DT_INT8;
-      break;
-    case PRECISION(kInt32):
-      otype = ge::DT_INT32;
-      break;
-    default:
-      LOG(FATAL) << "Cannot convert precision type(" << PrecisionToStr(itype)
-                 << ") from Lite to NPU";
-      break;
-  }
-  return otype;
-}
-
-ge::Format DataLayoutConverter(DataLayoutType itype) {
-  ge::Format otype = ge::FORMAT_NCHW;
-  switch (itype) {
-    case DATALAYOUT(kNCHW):
-      otype = ge::FORMAT_NCHW;
-      break;
-    // TODO(hong19860320) support more data layout type
-    default:
-      LOG(FATAL) << "Cannot convert data layout type("
-                 << DataLayoutToStr(itype) << ") from Lite to NPU";
-      break;
-  }
-  return otype;
-}
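Illustration only, and it assumes a fresh process, since the counters above are static: UniqueName appends a per-prefix counter, so repeated conversions of the same op type get distinct NPU node names.

// Hypothetical demo of the counter behavior; not part of the sources.
void UniqueNameDemo() {
  using paddle::lite::npu::bridge::UniqueName;
  CHECK_EQ(UniqueName("transpose"), "transpose_1");  // first use of prefix
  CHECK_EQ(UniqueName("transpose"), "transpose_2");  // counter advances
  CHECK_EQ(UniqueName("conv2d"), "conv2d_1");        // independent prefix
}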
-
-ge::TensorPtr CvtFromLiteTensor(lite::Tensor* in_tensor,
-                                std::vector<int64_t> out_shape,
-                                PrecisionType in_ptype,
-                                DataLayoutType in_ltype) {
-  uint8_t* in_data = nullptr;
-  auto in_size = in_tensor->dims().production();
-  auto in_shape = in_tensor->dims().Vectorize();
-  if (out_shape.empty()) {
-    out_shape = in_shape;
-  }
-  int in_bytes;
-  if (in_ptype == PRECISION(kFloat)) {
-    in_data = reinterpret_cast<uint8_t*>(in_tensor->mutable_data<float>());
-    in_bytes = in_size * sizeof(float);
-  } else if (in_ptype == PRECISION(kInt32)) {
-    in_data = reinterpret_cast<uint8_t*>(in_tensor->mutable_data<int32_t>());
-    in_bytes = in_size * sizeof(int32_t);
-  } else if (in_ptype == PRECISION(kInt8)) {
-    in_data = reinterpret_cast<uint8_t*>(in_tensor->mutable_data<int8_t>());
-    in_bytes = in_size * sizeof(int8_t);
-  } else {
-    LOG(FATAL) << "Unknown precision type " << PrecisionToStr(in_ptype);
-  }
-  ge::DataType out_ptype = PrecisionConverter(in_ptype);
-  ge::Format out_ltype = DataLayoutConverter(in_ltype);
-
-  ge::TensorDesc out_desc(ge::Shape(out_shape), out_ltype, out_ptype);
-  CHECK_EQ(out_ltype, ge::FORMAT_NCHW);
-
-  auto out_size = out_desc.GetShape().GetShapeSize();
-  CHECK_EQ(out_size, in_size);
-
-  ge::TensorPtr out_tensor = std::make_shared<ge::Tensor>();
-  out_tensor->SetTensorDesc(out_desc);
-  out_tensor->SetData(in_data, in_bytes);
-  return out_tensor;
-}
-
-bool HasInputArg(const OpInfo* op_info,
-                 const Scope* scope,
-                 const std::string& argname) {
-  auto iarg_names = op_info->input_argnames();
-  if (std::find(iarg_names.begin(), iarg_names.end(), argname) !=
-      iarg_names.end()) {
-    auto inputs = op_info->Input(argname);
-    if (inputs.empty()) {
-      return false;
-    }
-    auto var_name = inputs.front();
-    auto var = scope->FindVar(var_name);
-    return var != nullptr;
-  } else {
-    return false;
-  }
-}
-
-}  // namespace bridge
-}  // namespace npu
-}  // namespace lite
-}  // namespace paddle
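A short usage sketch for the converter above; the shape and values are illustrative. The defaults declared in utils.h below are kFloat precision, NCHW layout, and an empty out_shape, which falls back to the tensor's own dims.

// Wrap a lite::Tensor as a ge::TensorPtr for the HiAI IR builder.
lite::Tensor t;
t.Resize({1, 3, 2, 2});
float* data = t.mutable_data<float>();
for (int i = 0; i < t.dims().production(); ++i) {
  data[i] = 0.5f * i;
}
ge::TensorPtr ge_tensor = paddle::lite::npu::bridge::CvtFromLiteTensor(&t);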
diff --git a/lite/backends/npu/bridge/utils.h b/lite/backends/npu/bridge/utils.h
deleted file mode 100644
index 169b7ca80c..0000000000
--- a/lite/backends/npu/bridge/utils.h
+++ /dev/null
@@ -1,94 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <typeinfo>
-#include <vector>
-#include "ai_ddk_lib/include/graph/operator_reg.h"
-#include "lite/core/mir/node.h"
-#include "lite/core/op_lite.h"
-#include "lite/core/target_wrapper.h"
-#include "lite/core/tensor.h"
-
-namespace paddle {
-namespace lite {
-namespace npu {
-namespace bridge {
-
-std::string UniqueName(const std::string& prefix);
-
-ge::DataType PrecisionConverter(PrecisionType itype);
-
-ge::Format DataLayoutConverter(DataLayoutType itype);
-
-ge::TensorPtr CvtFromLiteTensor(Tensor* in_tensor,
-                                std::vector<int64_t> out_shape = {},
-                                PrecisionType in_ptype = PRECISION(kFloat),
-                                DataLayoutType in_ltype = DATALAYOUT(kNCHW));
-
-template <typename T>
-ge::TensorPtr CreateTensorAndFillData(std::vector<T> data,
-                                      std::vector<int64_t> shape = {},
-                                      ge::Format format = ge::FORMAT_NCHW) {
-  const std::type_info& info = typeid(T);
-  ge::DataType type = ge::DT_FLOAT;
-  if (info == typeid(float)) {
-    type = ge::DT_FLOAT;
-  } else if (info == typeid(int8_t)) {
-    type = ge::DT_INT8;
-  } else if (info == typeid(int32_t)) {
-    type = ge::DT_INT32;
-  } else {
-    LOG(FATAL) << "Unknown value type " << info.name();
-  }
-  if (shape.empty()) {
-    shape = {static_cast<int64_t>(data.size())};
-  } else {
-    int size = 1;
-    for (auto i : shape) {
-      size *= i;
-    }
-    CHECK_EQ(data.size(), size);
-  }
-  ge::TensorDesc desc(ge::Shape(shape), format, type);
-  ge::TensorPtr tensor = std::make_shared<ge::Tensor>();
-  tensor->SetTensorDesc(desc);
-  tensor->SetData(reinterpret_cast<uint8_t*>(data.data()),
-                  data.size() * sizeof(T));
-  return tensor;
-}
-
-template <typename T>
-ge::TensorPtr CreateTensorAndFillData(T value,
-                                      std::vector<int64_t> shape = {1},
-                                      ge::Format format = ge::FORMAT_NCHW) {
-  int64_t size = 1;
-  for (auto i : shape) {
-    size *= i;
-  }
-  std::vector<T> data(size, value);
-  return CreateTensorAndFillData(data, shape, format);
-}
-
-bool HasInputArg(const OpInfo* op_info,
-                 const Scope* scope,
-                 const std::string& argname);
-
-}  // namespace bridge
-}  // namespace npu
-}  // namespace lite
-}  // namespace paddle
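Usage sketch for the two overloads above; the names and shapes are illustrative.

using paddle::lite::npu::bridge::CreateTensorAndFillData;

// Scalar overload: broadcast one value over a shape, e.g. a per-channel
// bias constant of 0.1f for 8 channels.
ge::TensorPtr bias = CreateTensorAndFillData(0.1f, {1, 8, 1, 1});

// Vector overload: an empty shape defaults to the 1-D shape {data.size()}.
ge::TensorPtr table =
    CreateTensorAndFillData(std::vector<float>{1.f, 2.f, 3.f});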
diff --git a/lite/backends/npu/npu_helper.cc b/lite/backends/npu/npu_helper.cc
deleted file mode 100644
index 688c62c7f6..0000000000
--- a/lite/backends/npu/npu_helper.cc
+++ /dev/null
@@ -1,139 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/backends/npu/npu_helper.h"
-#include <fstream>
-#include <memory>
-#include <string>
-#include <utility>
-#include <vector>
-#include "ai_ddk_lib/include/HiAiModelManagerService.h"
-#include "ai_ddk_lib/include/graph/buffer.h"
-#include "ai_ddk_lib/include/graph/model.h"
-#include "ai_ddk_lib/include/hiai_ir_build.h"
-
-namespace paddle {
-namespace lite {
-namespace npu {
-
-bool SaveNPUModel(const void* om_model_data,
-                  const size_t om_model_size,
-                  const std::string& om_file_path) {
-  std::FILE* fp;
-  fp = std::fopen(om_file_path.c_str(), "wb");
-  if (fp == NULL) {
-    LOG(WARNING) << "[NPU] " << om_file_path << " open failed!";
-    return false;
-  }
-
-  size_t write_size = std::fwrite(om_model_data, 1, om_model_size, fp);
-  if (write_size != om_model_size) {
-    std::fclose(fp);
-    LOG(WARNING) << "[NPU] Write NPU model failed: " << om_file_path;
-    return false;
-  }
-  std::fclose(fp);
-  return true;
-}
-
-bool BuildNPUClient(const void* om_model_data,
-                    const size_t om_model_size,
-                    const std::string& name) {
-  std::unique_ptr<hiai::AiModelMngerClient> client(
-      new hiai::AiModelMngerClient);
-  int ret = client->Init(nullptr);
-  if (ret != hiai::AI_SUCCESS) {
-    LOG(WARNING) << "[NPU] Failed building NPU client " << name
-                 << ", ret: " << ret;
-    throw std::runtime_error("");
-    return false;
-  }
-
-  auto desc = std::make_shared<hiai::AiModelDescription>(
-      name,
-      DeviceInfo::Global().freq_level(),
-      DeviceInfo::Global().framework_type(),
-      DeviceInfo::Global().model_type(),
-      DeviceInfo::Global().device_type());
-  desc->SetModelBuffer(om_model_data, om_model_size);
-
-  std::vector<std::shared_ptr<hiai::AiModelDescription>> model_desc;
-  model_desc.push_back(desc);
-  if (client->Load(model_desc) != hiai::AI_SUCCESS) {
-    LOG(WARNING) << "[NPU] Model Load Failed: " << desc->GetName();
-    throw std::runtime_error("");
-    return false;
-  }
-
-  DeviceInfo::Global().Insert(name, std::move(client));
-  return true;
-}
-
-// If built from inputs and outputs, the npu offline model will also be saved.
-bool BuildNPUClient(std::vector<ge::Operator>& inputs,   // NOLINT
                    std::vector<ge::Operator>& outputs,  // NOLINT
-                    const std::string& name) {
-  LOG(INFO) << "[NPU] Building Client";
-  ge::Graph npu_subgraph("npu_subgraph" + name);
-  npu_subgraph.SetInputs(inputs).SetOutputs(outputs);
-
-  ge::Model npu_model("model", "npu_model" + name);
-  npu_model.SetGraph(npu_subgraph);
-
-  // compile IR graph and output om model to memory
-  domi::HiaiIrBuild ir_build;
-  domi::ModelBufferData om_model_buffer;
-  if (!ir_build.CreateModelBuff(npu_model, om_model_buffer)) {
-    LOG(WARNING) << "[NPU] Failed CreateModelBuff: " << npu_model.GetName();
-    return false;
-  }
-  if (!ir_build.BuildIRModel(npu_model, om_model_buffer)) {
-    LOG(WARNING) << "[NPU] Failed BuildIRModel: " << npu_model.GetName();
-    return false;
-  }
-
-  if (BuildNPUClient(om_model_buffer.data, om_model_buffer.length, name)) {
-    // save npu offline model
-    if (!SaveNPUModel(om_model_buffer.data, om_model_buffer.length, name)) {
-      LOG(WARNING) << "[NPU] Save model " << name << " failed.";
-    }
-    ir_build.ReleaseModelBuff(om_model_buffer);
-    return true;
-  }
-  return false;
-}
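A sketch of the intended caching round trip, built only from functions defined in this file; the "demo" client name, which doubles as the file path (as in the function above, where the client name is passed to SaveNPUModel), is illustrative.

#include <string>
#include "lite/backends/npu/npu_helper.h"

// Persist an already-compiled om blob, then rebuild a client from the file
// and look it up in the registry.
void CacheAndReload(const void* om_data, size_t om_size) {
  const std::string name = "demo";
  if (paddle::lite::npu::SaveNPUModel(om_data, om_size, name)) {
    // BuildNPUClient(path, name) loads the file and registers the client.
    paddle::lite::npu::BuildNPUClient(name, name);
    CHECK(paddle::lite::npu::DeviceInfo::Global().client(name) != nullptr);
  }
}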
-
-// If built from a file path, the npu offline model will not be saved again.
-bool BuildNPUClient(const std::string& om_model_file_path,
-                    const std::string& name) {
-  // load om model from file
-  std::ifstream file(om_model_file_path, std::ios::binary);
-  CHECK(file.is_open()) << "[NPU] Unable to open om model file: "
-                        << om_model_file_path;
-  const auto fbegin = file.tellg();
-  file.seekg(0, std::ios::end);
-  const auto fend = file.tellg();
-  size_t om_model_size = fend - fbegin;
-  VLOG(5) << "[NPU] om model file size: " << om_model_size;
-  file.seekg(0, std::ios::beg);
-  std::vector<char> om_model_data(om_model_size);
-  file.read(om_model_data.data(), om_model_size);
-
-  return BuildNPUClient(
-      reinterpret_cast<void*>(om_model_data.data()), om_model_size, name);
-}
-
-}  // namespace npu
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/npu/npu_helper.h b/lite/backends/npu/npu_helper.h
deleted file mode 100644
index 95c290315b..0000000000
--- a/lite/backends/npu/npu_helper.h
+++ /dev/null
@@ -1,110 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-#include "ai_ddk_lib/include/HiAiModelManagerService.h"
-#include "ai_ddk_lib/include/graph/graph.h"
-#include "ai_ddk_lib/include/graph/operator_reg.h"
-#include "lite/utils/cp_logging.h"
-
-namespace paddle {
-namespace lite {
-namespace npu {
-
-class DeviceInfo {
- public:
-  static DeviceInfo& Global() {
-    static DeviceInfo x;
-    return x;
-  }
-  DeviceInfo() {}
-  void Insert(const std::string& name,
-              std::unique_ptr<hiai::AiModelMngerClient> client) {
-    if (clients_.find(name) != clients_.end()) {
-      LOG(WARNING) << "[NPU] Already inserted " << name;
-      return;
-    }
-    clients_.emplace(std::make_pair(name, std::move(client)));
-  }
-
-  void Clear() { clients_.clear(); }
-
-  hiai::AiModelMngerClient* client(const std::string& model_name) const {
-    if (clients_.find(model_name) != clients_.end()) {
-      return clients_.at(model_name).get();
-    } else {
-      return nullptr;
-    }
-  }
-  std::vector<std::string> AllClientNames() {
-    std::vector<std::string> names;
-    for (auto& i : clients_) {
-      names.push_back(i.first);
-    }
-    return names;
-  }
-
-  int freq_level() { return freq_level_; }
-  int framework_type() { return framework_type_; }
-  int model_type() { return model_type_; }
-  int device_type() { return device_type_; }
-
- private:
-  int freq_level_{3};
-  int framework_type_{0};
-  int model_type_{0};
-  int device_type_{0};
-  // TODO(TJ): find better place
-  std::unordered_map<std::string, std::unique_ptr<hiai::AiModelMngerClient>>
-      clients_;
-};
-
-class OpList {
- public:
-  static OpList& Global() {
-    static thread_local OpList x;
-    return x;
-  }
-  void clear() { lists_.clear(); }
-  void add(std::shared_ptr<ge::Operator> p) { lists_.push_back(p); }
-
- private:
-  std::vector<std::shared_ptr<ge::Operator>> lists_;
-};
-
-bool SaveNPUModel(const void* om_model_data,
-                  const size_t om_model_size,
-                  const std::string& om_file_path);
-
-// If built from inputs and outputs, the npu offline model will also be saved.
-bool BuildNPUClient(std::vector<ge::Operator>& inputs,   // NOLINT
-                    std::vector<ge::Operator>& outputs,  // NOLINT
-                    const std::string& name);
-
-// If built from a file path, the npu offline model will not be saved again.
-bool BuildNPUClient(const std::string& om_model_file_path,
-                    const std::string& name);
-
-bool BuildNPUClient(const void* om_model_data,
-                    const size_t om_model_size,
-                    const std::string& name);
-
-}  // namespace npu
-}  // namespace lite
-}  // namespace paddle
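Two properties of the registry above, shown as an assumed-usage sketch: DeviceInfo is a process-wide singleton keyed by client name, while OpList is thread_local, so each conversion thread keeps its own list of live ge::Operator references and can drop them once a subgraph has been compiled.

// Enumerate the cached clients, then release this thread's operator refs.
auto names = paddle::lite::npu::DeviceInfo::Global().AllClientNames();
for (const auto& n : names) {
  LOG(INFO) << "[NPU] cached client: " << n;
}
paddle::lite::npu::OpList::Global().clear();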
diff --git a/lite/backends/opencl/CMakeLists.txt b/lite/backends/opencl/CMakeLists.txt
deleted file mode 100644
index 1acb983218..0000000000
--- a/lite/backends/opencl/CMakeLists.txt
+++ /dev/null
@@ -1,18 +0,0 @@
-if (NOT LITE_WITH_OPENCL)
-  return()
-endif()
-
-lite_cc_library(cl_wrapper SRCS cl_wrapper.cc)
-lite_cc_library(cl_utility SRCS cl_utility.cc DEPS cl_wrapper)
-lite_cc_library(cl_runtime SRCS cl_runtime.cc DEPS cl_utility)
-lite_cc_library(cl_context SRCS cl_context.cc DEPS cl_runtime)
-lite_cc_library(cl_image_converter SRCS cl_image_converter.cc DEPS tensor)
-lite_cc_library(cl_image SRCS cl_image.cc DEPS tensor cl_image_converter cl_runtime)
-lite_cc_library(cl_caller SRCS cl_caller.cc DEPS cl_context cl_image)
-lite_cc_library(cl_target_wrapper SRCS target_wrapper.cc DEPS cl_runtime)
-lite_cc_test(test_cl_functions SRCS cl_functions_test.cc DEPS cl_context cl_image cl_caller cl_wrapper cl_target_wrapper
-    ARGS --cl_path=${CMAKE_SOURCE_DIR}/paddle/fluid/lite/backends/opencl)
-lite_cc_test(test_cl_im2col SRCS cl_im2col_test.cc DEPS tensor cl_context cl_wrapper cl_target_wrapper
-    ARGS --cl_path=${CMAKE_SOURCE_DIR}/paddle/fluid/lite/backends/opencl)
-
-add_dependencies(cl_wrapper opencl_clhpp)
diff --git a/lite/backends/opencl/cl_caller.cc b/lite/backends/opencl/cl_caller.cc
deleted file mode 100644
index ae755b756d..0000000000
--- a/lite/backends/opencl/cl_caller.cc
+++ /dev/null
@@ -1,169 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "lite/backends/opencl/cl_caller.h"
-#include <string>
-#include "lite/backends/opencl/cl_context.h"
-#include "lite/backends/opencl/cl_image.h"
-#include "lite/backends/opencl/cl_runtime.h"
-#include "lite/backends/opencl/cl_utility.h"
-#include "lite/core/tensor.h"
-#include "lite/utils/string.h"
-
-namespace paddle {
-namespace lite {
-static void CopyImageData(CLContext* context,
-                          const CLImage& cl_image,
-                          float* out) {
-  int width = cl_image.image_dims()[0];
-  int height = cl_image.image_dims()[1];
-
-  float* image_data = new float[height * width * 4];
-  cl::Image* image = cl_image.cl_image();
-  const std::array<size_t, 3> origin{0, 0, 0};
-  const std::array<size_t, 3> region{
-      static_cast<size_t>(width), static_cast<size_t>(height), 1};
-  cl_int err = context->GetCommandQueue().enqueueReadImage(
-      *image, CL_TRUE, origin, region, 0, 0, image_data, nullptr, nullptr);
-  CL_CHECK_FATAL(err);
-
-  auto* converter = cl_image.image_converter();
-  converter->ImageToNCHW(
-      image_data, out, cl_image.image_dims(), cl_image.tensor_dims());
-
-  delete[] image_data;
-}
-
-bool InitOpenCLRuntime(std::string cl_path) {
-  auto* runtime = CLRuntime::Global();
-  runtime->set_cl_path(cl_path);
-  return runtime->IsInitSuccess();
-}
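Callers are expected to initialize the runtime with the directory holding the .cl kernel sources before using any helper in this file; /data/local/tmp/opencl is the default path used by the tests later in this patch.

// Must run once per process before AddKernel()/elementwise_add()/pool().
bool ok = paddle::lite::InitOpenCLRuntime("/data/local/tmp/opencl");
CHECK(ok) << "Failed to initialize the OpenCL runtime.";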
-
-void elementwise_add(CLContext* context,
-                     const float* in,
-                     const DDim& in_dim,
-                     const float* bias,
-                     const DDim& bias_dim,
-                     float* out,
-                     const DDim& out_dim) {
-  if (!(bias_dim.size() == 1 || bias_dim.size() == 4)) {
-    LOG(FATAL) << "Error: unsupported bias dims";
-    return;
-  }
-  auto kernel = bias_dim.size() == 1 ? context->GetKernel("channel_add")
-                                     : context->GetKernel("elementwise_add");
-  CLImage in_image;
-  in_image.set_tensor_data(in, in_dim);
-  in_image.InitNormalCLImage(context->GetContext());
-  VLOG(3) << " --- Input image: " << in_image << " --- ";
-  CLImage bias_image;
-  bias_image.set_tensor_data(bias, bias_dim);
-  bias_image.InitCLImage(context->GetContext());
-  VLOG(3) << " --- Bias image: " << bias_image << " --- ";
-  CLImage out_image;
-  out_image.InitEmptyImage(context->GetContext(), out_dim);
-  cl_int status;
-  status = kernel.setArg(0, *in_image.cl_image());
-  CL_CHECK_FATAL(status);
-  status = kernel.setArg(1, *bias_image.cl_image());
-  CL_CHECK_FATAL(status);
-  status = kernel.setArg(2, *out_image.cl_image());
-  CL_CHECK_FATAL(status);
-
-  if (bias_dim.size() == 1) {
-    int tensor_w = in_dim[3];
-    status = kernel.setArg(3, tensor_w);
-    CL_CHECK_FATAL(status);
-  }
-  size_t width = in_image.ImageWidth();
-  size_t height = in_image.ImageHeight();
-  auto global_work_size = cl::NDRange{width, height};
-  status = context->GetCommandQueue().enqueueNDRangeKernel(
-      kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr,
-      nullptr);
-  CL_CHECK_FATAL(status);
-
-  status = context->GetCommandQueue().finish();
-  CL_CHECK_FATAL(status);
-  VLOG(3) << " --- Out image: " << out_image << " --- ";
-  CopyImageData(context, out_image, out);
-}
-
-void pool(CLContext* context,
-          const std::string pooling_type,
-          const int pad_h,
-          const int pad_w,
-          const int stride_h,
-          const int stride_w,
-          const int ksize_h,
-          const int ksize_w,
-          const float* in,
-          const DDim& in_dim,
-          float* out,
-          const DDim& out_dim) {
-  auto kernel =
-      context->GetKernel(string_format("pool_%s", pooling_type.c_str()));
-  CLImage in_image;
-  in_image.set_tensor_data(in, in_dim);
-  in_image.InitNormalCLImage(context->GetContext());
-  VLOG(3) << " --- Input image: " << in_image << " --- ";
-  CLImage out_image;
-  out_image.InitEmptyImage(context->GetContext(), out_dim);
-  auto global_work_size = context->DefaultWorkSize(out_image);
-  auto* in_converter =
-      dynamic_cast<CLImageConverterNormal*>(in_image.image_converter());
-  auto* out_converter =
-      dynamic_cast<CLImageConverterNormal*>(out_image.image_converter());
-  const int in_height = in_converter->HeightOfOneBlock();
-  const int in_width = in_converter->WidthOfOneBlock();
-  const int out_height = out_converter->HeightOfOneBlock();
-  const int out_width = out_converter->WidthOfOneBlock();
-  cl_int status;
-  status = kernel.setArg(0, in_height);
-  CL_CHECK_FATAL(status);
-  status = kernel.setArg(1, in_width);
-  CL_CHECK_FATAL(status);
-  status = kernel.setArg(2, out_height);
-  CL_CHECK_FATAL(status);
-  status = kernel.setArg(3, out_width);
-  CL_CHECK_FATAL(status);
-  status = kernel.setArg(4, pad_h);
-  CL_CHECK_FATAL(status);
-  status = kernel.setArg(5, pad_w);
-  CL_CHECK_FATAL(status);
-  status = kernel.setArg(6, stride_h);
-  CL_CHECK_FATAL(status);
-  status = kernel.setArg(7, stride_w);
-  CL_CHECK_FATAL(status);
-  status = kernel.setArg(8, ksize_h);
-  CL_CHECK_FATAL(status);
-  status = kernel.setArg(9, ksize_w);
-  CL_CHECK_FATAL(status);
-  status = kernel.setArg(10, *in_image.cl_image());
-  CL_CHECK_FATAL(status);
-  status = kernel.setArg(11, *out_image.cl_image());
-  CL_CHECK_FATAL(status);
-
-  status = context->GetCommandQueue().enqueueNDRangeKernel(
-      kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr,
-      nullptr);
-  CL_CHECK_FATAL(status);
-
-  status = context->GetCommandQueue().finish();
-  CL_CHECK_FATAL(status);
-  VLOG(3) << " --- Out image: " << out_image << " --- ";
-  CopyImageData(context, out_image, out);
-}
-
-}  // namespace lite
-}  // namespace paddle
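Host-side usage sketch with illustrative buffer sizes: a 1-D bias selects the channel_add kernel, a full 4-D bias selects elementwise_add, and the kernels must first be registered on the context, exactly as the tests later in this patch do.

std::vector<float> in(1 * 16 * 8 * 8, 1.f);
std::vector<float> bias(16, 2.f);
std::vector<float> out(in.size());

paddle::lite::CLContext context;
context.AddKernel("channel_add", "image/channel_add_kernel.cl");
context.AddKernel("elementwise_add", "image/elementwise_add_kernel.cl");

paddle::lite::elementwise_add(
    &context,
    in.data(), paddle::lite::DDim(std::vector<int64_t>{1, 16, 8, 8}),
    bias.data(), paddle::lite::DDim(std::vector<int64_t>{16}),
    out.data(), paddle::lite::DDim(std::vector<int64_t>{1, 16, 8, 8}));
// Every element of `out` should now be 3.f.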
diff --git a/lite/backends/opencl/cl_caller.h b/lite/backends/opencl/cl_caller.h
deleted file mode 100644
index ed5c9153d3..0000000000
--- a/lite/backends/opencl/cl_caller.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include "lite/backends/opencl/cl_context.h"
-#include "lite/core/tensor.h"
-
-namespace paddle {
-namespace lite {
-
-bool InitOpenCLRuntime(std::string cl_path);
-
-/// An elementwise_add method that embeds the OpenCL logic; it is used as a
-/// black box so that the framework can remain simple.
-/// NOTE Currently, these methods are quite expensive; we will optimize them
-/// later.
-void elementwise_add(CLContext* context,
-                     const float* in,
-                     const DDim& in_dim,
-                     const float* bias,
-                     const DDim& bias_dim,
-                     float* out,
-                     const DDim& out_dim);
-
-void pool(CLContext* context,
-          const std::string pooling_type,
-          const int pad_h,
-          const int pad_w,
-          const int stride_h,
-          const int stride_w,
-          const int ksize_h,
-          const int ksize_w,
-          const float* in,
-          const DDim& in_dim,
-          float* out,
-          const DDim& out_dim);
-
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/opencl/cl_context.cc b/lite/backends/opencl/cl_context.cc
deleted file mode 100644
index 0fcb99486e..0000000000
--- a/lite/backends/opencl/cl_context.cc
+++ /dev/null
@@ -1,126 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
*/ - -#include "lite/backends/opencl/cl_context.h" -#include -#include -#include -#include "lite/backends/opencl/cl_runtime.h" -#include "lite/backends/opencl/cl_utility.h" -#include "lite/utils/cp_logging.h" -#include "lite/utils/replace_stl/stream.h" - -namespace paddle { -namespace lite { - -cl::CommandQueue &CLContext::GetCommandQueue() { - return CLRuntime::Global()->command_queue(); -} - -cl::Context &CLContext::GetContext() { return CLRuntime::Global()->context(); } - -cl::Program &CLContext::GetProgram(const std::string &file_name, - const std::string &options) { - STL::stringstream program_key_ss; - program_key_ss << file_name << options; - std::string program_key = program_key_ss.str(); - auto it = programs_.find(program_key); - if (it != programs_.end()) { - VLOG(3) << " --- program -> " << program_key << " has been built --- "; - return *(it->second); - } - - auto program = CLRuntime::Global()->CreateProgram( - GetContext(), CLRuntime::Global()->cl_path() + "/cl_kernel/" + file_name); - - VLOG(3) << " --- begin build program -> " << program_key << " --- "; - CLRuntime::Global()->BuildProgram(program.get(), options); - VLOG(3) << " --- end build program -> " << program_key << " --- "; - - programs_[program_key] = std::move(program); - - return *(programs_[program_key]); -} - -void CLContext::AddKernel(const std::string &kernel_name, - const std::string &file_name, - const std::string &options) { - cl_int status{CL_SUCCESS}; - VLOG(3) << " --- to get program " << file_name << " --- "; - auto program = GetProgram(file_name, options); - VLOG(3) << " --- end get program --- "; - VLOG(3) << " --- to create kernel: " << kernel_name << " --- "; - std::unique_ptr kernel( - new cl::Kernel(program, kernel_name.c_str(), &status)); - CL_CHECK_FATAL(status); - VLOG(3) << " --- end create kernel --- "; - kernels_.emplace_back(std::move(kernel)); - STL::stringstream kernel_key; - kernel_key << kernel_name << options; - kernel_offset_[kernel_key.str()] = kernels_.size() - 1; -} - -cl::Kernel &CLContext::GetKernel(const int index) { - VLOG(3) << " --- kernel count: " << kernels_.size() << " --- "; - CHECK(static_cast(index) < kernels_.size()) - << "The index must be less than the size of kernels."; - CHECK(kernels_[index] != nullptr) - << "The target kernel pointer cannot be null."; - return *(kernels_[index]); -} - -cl::Kernel &CLContext::GetKernel(const std::string &name) { - auto it = kernel_offset_.find(name); - CHECK(it != kernel_offset_.end()) << "Cannot find the kernel function: " - << name; - return GetKernel(it->second); -} - -cl::NDRange CLContext::DefaultWorkSize(const CLImage &image) { - // n c h w - auto image_dim = image.tensor_dims(); - if (image_dim.size() == 4) { - auto n = image_dim[0]; - auto h = image_dim[2]; - auto w = image_dim[3]; - auto image_width = image.ImageWidth(); - auto work_size_0 = image_width / w; - auto work_size_1 = w; - auto work_size_2 = n * h; - return cl::NDRange{static_cast(work_size_0), - static_cast(work_size_1), - static_cast(work_size_2)}; - } else if (image_dim.size() == 2) { - return cl::NDRange{static_cast(1), - static_cast(image.ImageWidth()), - static_cast(image.ImageHeight())}; - } else if (image_dim.size() == 1) { - return cl::NDRange{static_cast(1), - static_cast(image.ImageWidth()), - static_cast(1)}; - } else if (image_dim.size() == 3) { - auto c = image_dim[0]; - auto h = image_dim[1]; - auto w = image_dim[2]; - return cl::NDRange{static_cast((c + 3) / 4), - static_cast(w), - static_cast(h)}; - } else { - LOG(FATAL) << "Not support this 
dimension, need to be implemented!"; - return cl::NDRange{}; - } -} - -} // namespace lite -} // namespace paddle diff --git a/lite/backends/opencl/cl_context.h b/lite/backends/opencl/cl_context.h deleted file mode 100644 index a28f82f40e..0000000000 --- a/lite/backends/opencl/cl_context.h +++ /dev/null @@ -1,54 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include -#include -#include "lite/backends/opencl/cl_image.h" -#include "lite/backends/opencl/cl_include.h" - -namespace paddle { -namespace lite { - -class CLContext { - public: - cl::CommandQueue &GetCommandQueue(); - - cl::Context &GetContext(); - - cl::Program &GetProgram(const std::string &file_name, - const std::string &options); - - void AddKernel(const std::string &kernel_name, - const std::string &file_name, - const std::string &options = ""); - - cl::Kernel &GetKernel(const int index); - - cl::Kernel &GetKernel(const std::string &name); - - cl::NDRange DefaultWorkSize(const CLImage &image); - - private: - std::unordered_map> programs_; - std::vector> kernels_; - std::map kernel_offset_; -}; - -} // namespace lite -} // namespace paddle diff --git a/lite/backends/opencl/cl_functions_test.cc b/lite/backends/opencl/cl_functions_test.cc deleted file mode 100644 index b041952b34..0000000000 --- a/lite/backends/opencl/cl_functions_test.cc +++ /dev/null @@ -1,451 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include -#include -#include -#include -#include -#include "lite/backends/opencl/cl_caller.h" -#include "lite/backends/opencl/cl_context.h" -#include "lite/backends/opencl/cl_image.h" -#include "lite/backends/opencl/cl_runtime.h" -#include "lite/backends/opencl/target_wrapper.h" -#include "lite/core/tensor.h" -#include "lite/utils/cp_logging.h" - -DEFINE_string(cl_path, "/data/local/tmp/opencl", "The OpenCL kernels path."); - -namespace paddle { -namespace lite { - -TEST(cl_test, runtime_test) { - auto *runtime = CLRuntime::Global(); - CHECK(runtime->IsInitSuccess()); - runtime->set_cl_path(FLAGS_cl_path); - runtime->platform(); - runtime->device(); - runtime->command_queue(); - auto &context = runtime->context(); - auto program = runtime->CreateProgram( - context, - runtime->cl_path() + "/cl_kernel/" + "image/elementwise_add_kernel.cl"); - auto event = runtime->CreateEvent(context); - CHECK(runtime->BuildProgram(program.get())); -} - -TEST(cl_test, context_test) { - auto *runtime = CLRuntime::Global(); - CHECK(runtime->IsInitSuccess()); - runtime->set_cl_path(FLAGS_cl_path); - CLContext context; - context.AddKernel("pool_max", "image/pool_kernel.cl", ""); - context.AddKernel("elementwise_add", "image/elementwise_add_kernel.cl", ""); - context.AddKernel("elementwise_add", "image/elementwise_add_kernel.cl", ""); -} - -TEST(cl_test, kernel_test) { - auto *runtime = CLRuntime::Global(); - CHECK(runtime->IsInitSuccess()); - runtime->set_cl_path(FLAGS_cl_path); - std::unique_ptr context(new CLContext); - context->AddKernel("elementwise_add", "image/elementwise_add_kernel.cl"); - context->AddKernel("pool_max", "image/pool_kernel.cl"); - context->AddKernel("elementwise_add", "image/elementwise_add_kernel.cl"); - auto kernel = context->GetKernel(2); - - std::unique_ptr in_data(new float[4 * 3 * 256 * 512]); - for (int i = 0; i < 4 * 3 * 256 * 512; i++) { - in_data[i] = 1.f; - } - const DDim in_dim = DDim(std::vector{4, 3, 256, 512}); - CLImage in_image; - in_image.set_tensor_data(in_data.get(), in_dim); - in_image.InitNormalCLImage(context->GetContext()); - LOG(INFO) << in_image; - - std::unique_ptr bias_data(new float[4 * 3 * 256 * 512]); - for (int i = 0; i < 4 * 3 * 256 * 512; i++) { - bias_data[i] = 2.f; - } - const DDim bias_dim = DDim(std::vector{4, 3, 256, 512}); - CLImage bias_image; - bias_image.set_tensor_data(bias_data.get(), bias_dim); - bias_image.InitNormalCLImage(context->GetContext()); - LOG(INFO) << bias_image; - - CLImage out_image; - const DDim out_dim = DDim(std::vector{4, 3, 256, 512}); - out_image.InitEmptyImage(context->GetContext(), out_dim); - LOG(INFO) << out_image; - - cl_int status; - status = kernel.setArg(0, *in_image.cl_image()); - CL_CHECK_FATAL(status); - status = kernel.setArg(1, *bias_image.cl_image()); - CL_CHECK_FATAL(status); - status = kernel.setArg(2, *out_image.cl_image()); - CL_CHECK_FATAL(status); - - size_t width = in_image.ImageWidth(); - size_t height = in_image.ImageHeight(); - auto global_work_size = cl::NDRange{width, height}; - cl::Event event; - status = context->GetCommandQueue().enqueueNDRangeKernel( - kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, &event); - CL_CHECK_FATAL(status); - status = context->GetCommandQueue().finish(); - CL_CHECK_FATAL(status); - double start_nanos = event.getProfilingInfo(); - double stop_nanos = event.getProfilingInfo(); - double elapsed_micros = (stop_nanos - start_nanos) / 1000.0; - LOG(INFO) << "Kernel Run Cost Time: " << elapsed_micros << " us."; - LOG(INFO) << 
out_image; -} - -TEST(cl_test, channel_add_test) { - std::default_random_engine engine; - std::uniform_real_distribution dist(-5, 5); - - const DDim in_dim = DDim(std::vector{4, 16, 256, 512}); - std::unique_ptr in_data(new float[4 * 16 * 256 * 512]); - for (int i = 0; i < 4 * 16 * 256 * 512; i++) { - in_data[i] = dist(engine); - } - - const DDim bias_dim = DDim(std::vector{16}); - std::unique_ptr bias_data(new float[16]); - for (int i = 0; i < 16; i++) { - bias_data[i] = dist(engine); - } - - std::unique_ptr out_ref(new float[4 * 16 * 256 * 512]); - for (int i = 0; i < 4; i++) { - for (int j = 0; j < 16; j++) { - float b = bias_data[j]; - for (int k = 0; k < 256 * 512; k++) { - int index = (i * 16 + j) * 256 * 512 + k; - out_ref[index] = in_data[index] + b; - } - } - } - - const DDim out_dim = DDim(std::vector{4, 16, 256, 512}); - std::unique_ptr out(new float[4 * 16 * 256 * 512]); - - bool status = InitOpenCLRuntime(FLAGS_cl_path); - CHECK(status) << "Fail to initialize OpenCL runtime."; - std::unique_ptr context(new CLContext); - context->AddKernel("elementwise_add", "image/elementwise_add_kernel.cl"); - context->AddKernel("channel_add", "image/channel_add_kernel.cl"); - elementwise_add(context.get(), - in_data.get(), - in_dim, - bias_data.get(), - bias_dim, - out.get(), - out_dim); - - int stride = 4 * 16 * 256 * 512 / 20; - for (int i = 0; i < 4 * 16 * 256 * 512; i += stride) { - std::cout << out[i] << " "; - } - std::cout << std::endl; - - for (int i = 0; i < 4 * 16 * 256 * 512; i++) { - EXPECT_NEAR(out[i], out_ref[i], 1e-6); - } -} - -TEST(cl_test, elementwise_add_test) { - std::default_random_engine engine; - std::uniform_real_distribution dist(-5, 5); - - const DDim in_dim = DDim(std::vector{4, 16, 256, 512}); - std::unique_ptr in_data(new float[4 * 16 * 256 * 512]); - for (int i = 0; i < 4 * 16 * 256 * 512; i++) { - in_data[i] = dist(engine); - } - - const DDim bias_dim = DDim(std::vector{4, 16, 256, 512}); - std::unique_ptr bias_data(new float[4 * 16 * 256 * 512]); - for (int i = 0; i < 4 * 16 * 256 * 512; i++) { - bias_data[i] = dist(engine); - } - - std::unique_ptr out_ref(new float[4 * 16 * 256 * 512]); - for (int i = 0; i < 4 * 16 * 256 * 512; i++) { - out_ref[i] = in_data[i] + bias_data[i]; - } - - const DDim out_dim = DDim(std::vector{4, 16, 256, 512}); - std::unique_ptr out(new float[4 * 16 * 256 * 512]); - - bool status = InitOpenCLRuntime(FLAGS_cl_path); - CHECK(status) << "Fail to initialize OpenCL runtime."; - std::unique_ptr context(new CLContext); - context->AddKernel("elementwise_add", "image/elementwise_add_kernel.cl"); - context->AddKernel("channel_add", "image/channel_add_kernel.cl"); - elementwise_add(context.get(), - in_data.get(), - in_dim, - bias_data.get(), - bias_dim, - out.get(), - out_dim); - - int stride = 4 * 16 * 256 * 512 / 20; - for (int i = 0; i < 4 * 16 * 256 * 512; i += stride) { - std::cout << out[i] << " "; - } - std::cout << std::endl; - - for (int i = 0; i < 4 * 16 * 256 * 512; i++) { - EXPECT_NEAR(out[i], out_ref[i], 1e-6); - } -} - -void pool_avg(const int padding_height, - const int padding_width, - const int stride_height, - const int stride_width, - const int ksize_height, - const int ksize_width, - const float *input_data, - const DDim &in_dim, - float *output_data, - const DDim &out_dim) { - const int batch_size = in_dim[0]; - const int input_height = in_dim[2]; - const int input_width = in_dim[3]; - const int output_channels = out_dim[1]; - const int output_height = out_dim[2]; - const int output_width = out_dim[3]; - - const size_t 
input_spatial_size = input_height * input_width; - const size_t output_spatial_size = output_height * output_width; - - for (int i = 0; i < batch_size; i++) { - for (int c = 0; c < output_channels; ++c) { - int channel = i * output_channels + c; - const float *input_ptr = input_data + channel * input_spatial_size; - float *output_ptr = output_data + channel * output_spatial_size; - - for (int ph = 0; ph < output_height; ++ph) { - int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); - for (int pw = 0; pw < output_width; ++pw) { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); - wstart = std::max(wstart, 0); - - float val = 0.f; - int count = 0; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - val += input_ptr[h * input_width + w]; - ++count; - } - } - output_ptr[ph * output_width + pw] = - (count > 0) ? val * (1.f / count) : 0.f; - } - } - } - } -} - -TEST(cl_test, pool_test) { - std::default_random_engine engine; - std::uniform_real_distribution dist(-5, 5); - - const DDim in_dim = DDim(std::vector{4, 1024, 7, 7}); - std::unique_ptr in_data(new float[4 * 1024 * 7 * 7]); - for (int i = 0; i < 4 * 1024 * 7 * 7; i++) { - in_data[i] = dist(engine); - } - - const DDim out_dim = DDim(std::vector{4, 1024, 1, 1}); - std::unique_ptr out(new float[4 * 1024 * 1 * 1]); - std::unique_ptr out_ref(new float[4 * 1024 * 1 * 1]); - - bool status = InitOpenCLRuntime(FLAGS_cl_path); - CHECK(status) << "Fail to initialize OpenCL runtime."; - std::unique_ptr context(new CLContext); - context->AddKernel("pool_max", "image/pool_kernel.cl"); - context->AddKernel("pool_avg", "image/pool_kernel.cl"); - pool(context.get(), - "avg", - 0, - 0, - 1, - 1, - 7, - 7, - in_data.get(), - in_dim, - out.get(), - out_dim); - pool_avg(0, 0, 1, 1, 7, 7, in_data.get(), in_dim, out_ref.get(), out_dim); - - for (int i = 0; i < 4 * 1024 * 1 * 1; i++) { - EXPECT_NEAR(out[i], out_ref[i], 1e-6); - } -} - -TEST(cl_test, target_wrapper_buffer_test) { - bool inited = InitOpenCLRuntime(FLAGS_cl_path); - CHECK(inited) << "Fail to initialize OpenCL runtime."; - std::unique_ptr context(new CLContext); - std::string kernel_name = "elementwise_add"; - std::string build_options = "-DCL_DTYPE=float"; - context->AddKernel( - kernel_name, "buffer/elementwise_add_kernel.cl", build_options); - std::vector h_a; - std::vector h_b; - std::vector h_out; - std::vector h_ref; - for (int i = 0; i < 10; i++) { - h_a.push_back(3.14f * i); - h_b.push_back(6.28f * i); - h_out.push_back(0); - h_ref.push_back((3.14f + 6.28f) * i); - } - auto *d_a = static_cast( - TargetWrapperCL::Malloc(sizeof(float) * h_a.size())); - auto *d_b = static_cast( - TargetWrapperCL::Malloc(sizeof(float) * h_b.size())); - auto *d_out = - static_cast(TargetWrapperCL::Malloc(sizeof(float) * 10)); - auto *d_copy = - static_cast(TargetWrapperCL::Malloc(sizeof(float) * 10)); - TargetWrapperCL::MemcpySync( - d_a, h_a.data(), sizeof(float) * h_a.size(), IoDirection::HtoD); - TargetWrapperCL::MemcpySync( - d_b, h_b.data(), sizeof(float) * h_b.size(), IoDirection::HtoD); - // x + y: x[n=1, c=10, h=1, w=1], y[c=10] - auto kernel = context->GetKernel(kernel_name + build_options); - cl_int status = kernel.setArg(0, *d_a); - CL_CHECK_FATAL(status); - status = kernel.setArg(1, *d_b); - CL_CHECK_FATAL(status); - status = kernel.setArg(2, *d_out); - CL_CHECK_FATAL(status); - status = kernel.setArg(3, 1); - 
CL_CHECK_FATAL(status); - status = kernel.setArg(4, 10); - CL_CHECK_FATAL(status); - status = kernel.setArg(5, 1); - CL_CHECK_FATAL(status); - auto global_work_size = cl::NDRange{10, 1}; - status = context->GetCommandQueue().enqueueNDRangeKernel( - kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, nullptr); - CL_CHECK_FATAL(status); - status = context->GetCommandQueue().finish(); - CL_CHECK_FATAL(status); - TargetWrapperCL::MemcpySync( - h_out.data(), d_out, sizeof(float) * 10, IoDirection::DtoH); - - for (int i = 0; i < 10; i++) { - std::cout << h_out[i] << " "; - } - std::cout << std::endl; - - for (int i = 0; i < 10; i++) { - EXPECT_NEAR(h_out[i], h_ref[i], 1e-5); - } - - TargetWrapperCL::MemcpySync( - d_copy, d_out, sizeof(float) * 10, IoDirection::DtoD); - std::fill(h_out.begin(), h_out.end(), 0); - for (int i = 0; i < 10; i++) { - EXPECT_NEAR(h_out[i], 0, 1e-5); - } - TargetWrapperCL::MemcpySync( - h_out.data(), d_copy, sizeof(float) * 10, IoDirection::DtoH); - for (int i = 0; i < 10; i++) { - EXPECT_NEAR(h_out[i], h_ref[i], 1e-5); - } - - auto *mapped_ptr = - static_cast(TargetWrapperCL::Map(d_copy, 0, sizeof(float) * 10)); - for (int i = 0; i < 10; i++) { - EXPECT_NEAR(mapped_ptr[i], h_ref[i], 1e-5); - } - TargetWrapperCL::Unmap(d_copy, mapped_ptr); - - TargetWrapperCL::Free(d_copy); - TargetWrapperCL::Free(d_out); - TargetWrapperCL::Free(d_b); - TargetWrapperCL::Free(d_a); -} - -TEST(cl_test, target_wrapper_image_test) { - const std::array image_shape{28, 32}; - auto *d_image = static_cast( - TargetWrapperCL::MallocImage(image_shape, PRECISION(kFloat))); - std::array image_pitch; - // Map/Unmap test - auto *h_image = static_cast( - TargetWrapperCL::MapImage(d_image, image_shape, &image_pitch)); - // row_pitch = 448 = 28 * 4 (RGBA: 4 floats) * 4 (float in bytes) - // slice_pitch = 0 - size_t row_pitch = image_pitch[0]; - size_t slice_pitch = image_pitch[1]; - CHECK_EQ(row_pitch, 448); - CHECK_EQ(slice_pitch, 0); - LOG(INFO) << "row_pitch = " << row_pitch << ", slice_pitch " << slice_pitch; - - for (int i = 0; i < 10; i++) { - h_image[i] = 3.14f * i; - } - TargetWrapperCL::Unmap(d_image, h_image); - - auto *h_ptr = static_cast( - TargetWrapperCL::MapImage(d_image, image_shape, &image_pitch)); - for (int i = 0; i < 10; i++) { - EXPECT_NEAR(h_ptr[i], 3.14f * i, 1e-6); - } - TargetWrapperCL::Unmap(d_image, h_ptr); - - // Imagecpy test - std::vector h_image_cpy(28 * 4 * 32); - for (int i = 0; i < 28 * 4 * 32; i++) { - h_image_cpy[i] = 3.14f; - } - TargetWrapperCL::ImgcpySync( - d_image, h_image_cpy.data(), image_shape, image_pitch, IoDirection::HtoD); - auto *d_image_cpy = static_cast( - TargetWrapperCL::MallocImage(image_shape, PRECISION(kFloat))); - TargetWrapperCL::ImgcpySync( - d_image_cpy, d_image, image_shape, image_pitch, IoDirection::DtoD); - std::fill(h_image_cpy.begin(), h_image_cpy.end(), 0); - TargetWrapperCL::ImgcpySync(h_image_cpy.data(), - d_image_cpy, - image_shape, - image_pitch, - IoDirection::DtoH); - for (int i = 0; i < 28 * 4 * 32; i++) { - EXPECT_NEAR(h_image_cpy[i], 3.14f, 1e-6); - } - - TargetWrapperCL::FreeImage(d_image_cpy); - TargetWrapperCL::FreeImage(d_image); -} - -} // namespace lite -} // namespace paddle diff --git a/lite/backends/opencl/cl_im2col_test.cc b/lite/backends/opencl/cl_im2col_test.cc deleted file mode 100644 index a0770d34ee..0000000000 --- a/lite/backends/opencl/cl_im2col_test.cc +++ /dev/null @@ -1,330 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include -#include -#include -#include "lite/backends/opencl/cl_context.h" -#include "lite/backends/opencl/cl_runtime.h" -#include "lite/backends/opencl/target_wrapper.h" -#include "lite/core/tensor.h" -#include "lite/utils/cp_logging.h" - -DEFINE_string(cl_path, "/data/local/tmp/opencl", "The OpenCL kernels path."); - -namespace paddle { -namespace lite { - -template -void PrintData(std::string name, Dtype *a, const int rows, const int cols) { - std::cout << "==== " << name << " ====" << std::endl; - for (int r = 0; r < rows; ++r) { - for (int c = 0; c < cols; ++c) { - std::cout << " " << a[r * cols + c]; - } - std::cout << std::endl; - } -} - -inline bool is_a_ge_zero_and_a_lt_b(int a, int b) { - return static_cast(a) < static_cast(b); -} - -template -void im2col(const Dtype *data_im, - const int channels, - const int height, - const int width, - const int kernel_h, - const int kernel_w, - const int pad_h, - const int pad_w, - const int stride_h, - const int stride_w, - const int dilation_h, - const int dilation_w, - Dtype *data_col) { - const int output_h = - (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; - const int output_w = - (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; - const int channel_size = height * width; - - for (int channel = 0; channel++ < channels; data_im += channel_size) { - for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) { - for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) { - int input_row = -pad_h + kernel_row * dilation_h; - for (int output_rows = 0; output_rows < output_h; ++output_rows) { - if (!is_a_ge_zero_and_a_lt_b(input_row, height)) { - for (int output_cols = 0; output_cols < output_w; ++output_cols) { - *(data_col++) = 0; - } - } else { - int input_col = -pad_w + kernel_col * dilation_w; - for (int output_col = 0; output_col < output_w; ++output_col) { - *(data_col++) = (is_a_ge_zero_and_a_lt_b(input_col, width)) - ? 
data_im[input_row * width + input_col] - : 0; - input_col += stride_w; - } - } - input_row += stride_h; - } - } - } - } -} - -// #define CHECK_ERROR -// #define PRINT_RESULT -// #define LOOP_TEST -TEST(cl_test, im2col_test) { - using T = float; - std::string kernel_func_name = "im2col"; - std::string kernel_func_path = "buffer/im2col_kernel.cl"; - -#ifdef LOOP_TEST - for (int n : {1}) { - for (int c : {32}) { - for (int h : {224}) { - for (int w : {224}) { - for (int kernel_h : {3}) { - for (int kernel_w : {3}) { - for (int pad_h : {1}) { - for (int pad_w : {1}) { - for (int stride_h : {2}) { - for (int stride_w : {2}) { - for (int dilation_h : {1}) { - for (int dilation_w : {1}) { -// TODO(yuanshuai): support group for im2col -#else - int n = 8; - int c = 32; - int h = 224; - int w = 224; - int kernel_h = 3; - int kernel_w = 3; - int pad_h = 1; - int pad_w = 1; - int stride_h = 2; - int stride_w = 2; - int dilation_h = 1; - int dilation_w = 1; -#endif - - int img_offset = 0; - int col_offset = 0; - - std::vector input_shape{n, c, h, w}; - int channels = input_shape[1]; - int height = input_shape[2]; - int width = input_shape[3]; - - int height_col = (height + 2 * pad_h - - (dilation_h * (kernel_h - 1) + 1)) / - stride_h + - 1; - int width_col = (width + 2 * pad_w - - (dilation_w * (kernel_w - 1) + 1)) / - stride_w + - 1; - int col_chw = channels * kernel_h * kernel_w * - height_col * width_col; - if (col_chw <= 0 || height_col <= 0 || - width_col <= 0 || channels <= 0) { - VLOG(4) << "col_chw <= 0, skipped"; -#ifdef LOOP_TEST - continue; -#else - return; -#endif - } - - VLOG(4) << "kernel_func_name:" << kernel_func_name - << " kernel_func_path:" << kernel_func_path; - VLOG(4) << "input_shape:" << input_shape[0] << ", " - << input_shape[1] << ", " << input_shape[2] - << ", " << input_shape[3]; - VLOG(4) << "kernel_h:" << kernel_h - << " kernel_w:" << kernel_w - << " pad_h:" << pad_h << " pad_w:" << pad_w - << " stride_h:" << stride_h - << " stride_w:" << stride_w - << " dilation_h:" << dilation_h - << " dilation_w:" << dilation_w; - VLOG(4) << "height_col:" << height_col - << " width_col:" << width_col - << " img_offset:" << img_offset - << " col_offset:" << col_offset - << " col_chw:" << col_chw; - - const DDim input_dim = DDim(input_shape); - const int input_elem_num = input_dim.production(); - T *in_data = static_cast( - calloc(sizeof(T), input_elem_num)); - T *out_data = - static_cast(calloc(sizeof(T), n * col_chw)); - T *out_ref_data = - static_cast(calloc(sizeof(T), n * col_chw)); - for (int i = 0; i < input_elem_num; ++i) { - in_data[i] = i; - } - - // CPU im2col - for (int b = 0; b < n; b++) { - im2col(in_data + b * channels * height * width, - channels, - height, - width, - kernel_h, - kernel_w, - pad_h, - pad_w, - stride_h, - stride_w, - dilation_h, - dilation_w, - out_ref_data + b * col_chw); - } - - // OpenCL im2col - auto *runtime = CLRuntime::Global(); - CHECK(runtime->IsInitSuccess()) - << "Fail to initialize OpenCL runtime."; - runtime->set_cl_path(FLAGS_cl_path); - - std::unique_ptr context(new CLContext); - context->AddKernel(kernel_func_name, - kernel_func_path); - auto kernel = context->GetKernel(kernel_func_name); - - auto *d_in = - static_cast(TargetWrapperCL::Malloc( - sizeof(T) * input_elem_num)); - auto *d_out = static_cast( - TargetWrapperCL::Malloc(sizeof(T) * n * col_chw)); - TargetWrapperCL::MemcpySync( - d_in, - in_data, - sizeof(T) * input_elem_num, - IoDirection::HtoD); - - int n_threads = channels * height_col * width_col; - cl_int status; - int arg_idx = 
0; - for (int b = 0; b < n; b++) { - img_offset = b * channels * height * width; - col_offset = b * col_chw; - arg_idx = 0; - status = kernel.setArg(arg_idx, *d_in); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, img_offset); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, n_threads); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, kernel_h); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, kernel_w); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, pad_h); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, pad_w); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, stride_h); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, stride_w); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, dilation_h); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, dilation_w); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, height_col); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, width_col); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *d_out); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, col_offset); - CL_CHECK_FATAL(status); - - auto global_work_size = - cl::NDRange{static_cast(col_chw)}; - status = - context->GetCommandQueue().enqueueNDRangeKernel( - kernel, - cl::NullRange, - global_work_size, - cl::NullRange, - nullptr, - nullptr); - CL_CHECK_FATAL(status); - } - - status = context->GetCommandQueue().finish(); - CL_CHECK_FATAL(status); - - TargetWrapperCL::MemcpySync(out_data, - d_out, - sizeof(T) * n * col_chw, - IoDirection::DtoH); - -#ifdef PRINT_RESULT - PrintData("in", in_data, height, width); - PrintData("out_ref", out_ref_data, height, width); - PrintData("out", out_data, height, width); -#endif - - for (int i = 0; i < n * col_chw; ++i) { - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); -#ifdef CHECK_ERROR - if (abs(out_data[i] - out_ref_data[i]) > 1e-5) { - std::cout << "i:" << i << std::endl; - PrintData("in", in_data, height, width); - PrintData("out_ref", out_ref_data, height, width); - PrintData("out", out_data, height, width); - exit(0); - } -#endif - } - - free(in_data); - free(out_data); - free(out_ref_data); - TargetWrapperCL::Free(d_in); - TargetWrapperCL::Free(d_out); - -#ifdef LOOP_TEST - } // dilation_w - } // dilation_h - } // stride_w - } // stride_h - } // pad_w - } // pad_h - } // kernel_w - } // kernel_h - } // w - } // h - } // c - } // n -#endif -} - -} // namespace lite -} // namespace paddle diff --git a/lite/backends/opencl/cl_image.cc b/lite/backends/opencl/cl_image.cc deleted file mode 100644 index f6dcd4bbef..0000000000 --- a/lite/backends/opencl/cl_image.cc +++ /dev/null @@ -1,160 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "lite/backends/opencl/cl_image.h" -#include -#include "lite/backends/opencl/cl_runtime.h" -#include "lite/backends/opencl/cl_utility.h" -#include "lite/utils/cp_logging.h" - -namespace paddle { -namespace lite { - -std::ostream& operator<<(std::ostream& os, const CLImage& cl_image) { - int width = cl_image.image_dims_[0]; - int height = cl_image.image_dims_[1]; - - float* image_data = new float[height * width * 4]; - cl::Image* image = cl_image.cl_image(); - const std::array origin{0, 0, 0}; - const std::array region{ - static_cast(width), static_cast(height), 1}; - cl_int err = CLRuntime::Global()->command_queue().enqueueReadImage( - *image, CL_TRUE, origin, region, 0, 0, image_data, nullptr, nullptr); - CL_CHECK_FATAL(err); - - float* tensor_data = new float[cl_image.numel()]; - auto* converter = cl_image.image_converter(); - converter->ImageToNCHW( - image_data, tensor_data, cl_image.image_dims_, cl_image.tensor_dims_); - int stride = cl_image.numel() / 20; - stride = stride > 0 ? stride : 1; - - os << " dims: " << cl_image.tensor_dims_ << "\n"; - for (int i = 0; i < cl_image.numel(); i += stride) { - os << tensor_data[i] << " "; - } - - delete[] tensor_data; - delete[] image_data; - - return os; -} - -void CLImage::set_tensor_data(const float* tensor_data, const DDim& dim) { - auto numel = dim.production(); - tensor_data_.reset(new float[numel]); - memcpy(tensor_data_.get(), tensor_data, numel * sizeof(float)); - tensor_dims_ = dim; -} - -void CLImage::InitCLImage(const cl::Context& context) { - CHECK(tensor_data_ != nullptr) << " Please call set_tensor_data first!"; - image_converter_.reset(new CLImageConverterFolder); - InitCLImage(context, image_converter_.get()); -} - -void CLImage::InitNormalCLImage(const cl::Context& context) { - CHECK(tensor_data_ != nullptr) << " Please call set_tensor_data first!"; - image_converter_.reset(new CLImageConverterNormal); - InitCLImage(context, image_converter_.get()); -} - -void CLImage::InitNImage(const cl::Context& context) { - CHECK(tensor_data_ != nullptr) << " Please call set_tensor_data first!"; - CHECK(tensor_dims_.size() == 4) << " Tensor dim is not 4."; - image_converter_.reset(new CLImageConverterNWBlock); - InitCLImage(context, image_converter_.get()); -} - -void CLImage::InitDWImage(const cl::Context& context) { - CHECK(tensor_data_ != nullptr) << " Please call set_tensor_data first!"; - CHECK(tensor_dims_.size() == 4) << " Tensor dim is not 4."; - image_converter_.reset(new CLImageConverterDWBlock); - InitCLImage(context, image_converter_.get()); -} - -void CLImage::InitEmptyImage(const cl::Context& context, const DDim& dim) { - CHECK(tensor_data_ == nullptr) - << " Empty image tensor data shouldn't have value"; - - tensor_dims_ = dim; - image_converter_.reset(new CLImageConverterNormal); - - VLOG(3) << " to get image dims "; - image_dims_ = image_converter_->InitImageDimInfoWith(tensor_dims_); - VLOG(3) << " end get image dims " << image_dims_; - - InitCLImage(context, image_dims_[0], image_dims_[1], nullptr); - - cl_event_ = CLRuntime::Global()->CreateEvent(context); - initialized_ = true; - VLOG(3) << " end init cl image "; -} - -void CLImage::InitEmptyWithImageDim(const cl::Context& context, - const DDim& image_dims) { - VLOG(3) << " to get image dims "; - image_dims_ = image_dims; - VLOG(3) << " end get image dims " << image_dims_; - - InitCLImage(context, image_dims_[0], image_dims_[1], nullptr); - - cl_event_ = CLRuntime::Global()->CreateEvent(context); - initialized_ = true; - VLOG(3) << " end init cl image"; 
-} - -void CLImage::InitCLImage(const cl::Context& context, - CLImageConverterBase* converter) { - CHECK(tensor_data_ != nullptr) << " Please call set_tensor_data first!"; - - VLOG(3) << " begin init cl image "; - image_dims_ = converter->InitImageDimInfoWith(tensor_dims_); - - float* image_data = new float[image_dims_.production() * 4]; - - VLOG(3) << " convert to image "; - converter->NCHWToImage(tensor_data_.get(), image_data, tensor_dims_); - VLOG(3) << " end convert to image "; - - InitCLImage(context, image_dims_[0], image_dims_[1], image_data); - - delete[] image_data; - tensor_data_ = nullptr; - cl_event_ = CLRuntime::Global()->CreateEvent(context); - initialized_ = true; - VLOG(3) << " end init cl image "; -} - -void CLImage::InitCLImage(const cl::Context& context, - int width, - int height, - void* data) { - cl::ImageFormat img_format(CL_RGBA, CL_FLOAT); - cl_int err; - cl_image_.reset( - new cl::Image2D(context, - CL_MEM_READ_WRITE | (data ? CL_MEM_COPY_HOST_PTR : 0), - img_format, - width, - height, - 0, - data, - &err)); - CL_CHECK_FATAL(err); -} - -} // namespace lite -} // namespace paddle diff --git a/lite/backends/opencl/cl_image.h b/lite/backends/opencl/cl_image.h deleted file mode 100644 index f3a5f6361f..0000000000 --- a/lite/backends/opencl/cl_image.h +++ /dev/null @@ -1,114 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include "lite/backends/opencl/cl_image_converter.h" -#include "lite/backends/opencl/cl_include.h" -#include "lite/core/tensor.h" - -namespace paddle { -namespace lite { - -class CLImage { - // For debug - friend std::ostream& operator<<(std::ostream& os, const CLImage& image); - - public: - CLImage() = default; - /* - * Will not hold input tensor data, memcpy in this method. - * */ - void set_tensor_data(const float* tensor_data, const DDim& dim); - - bool IsInit() { return initialized_; } - /* - * Need call set_tensor_data first. - * Folder when one dim or two dim. - * */ - void InitCLImage(const cl::Context& context); - - void InitNormalCLImage(const cl::Context& context); - - void InitNImage(const cl::Context& context); - - void InitDWImage(const cl::Context& context); - - void InitEmptyImage(const cl::Context& context, const DDim& dim); - - void InitEmptyWithImageDim(const cl::Context& context, - const DDim& image_dims); - - cl::Image* cl_image() const { return cl_image_.get(); } - - const DDim& image_dims() const { return image_dims_; } - - inline size_t ImageWidth() const { return image_dims_[0]; } - - inline size_t ImageHeight() const { return image_dims_[1]; } - - const DDim& tensor_dims() const { return tensor_dims_; } - - /*with_da - * Resize original tensor dim. 
- * */ - inline CLImage& Resize(const DDim& dims) { - tensor_dims_ = dims; - return *this; - } - - template - T* data() const { - CHECK(!initialized_) << "CL image has initialized, tensor data has been " - "deleted, can't use tensor data!"; - return reinterpret_cast(tensor_data_); - } - - /* - * Numel of tensor dim - * */ - inline int64_t numel() const { return tensor_dims_.production(); } - - /* - * Original tensor dim - * */ - - cl::UserEvent& cl_event() const { return *cl_event_; } - - CLImageConverterBase* image_converter() const { - return image_converter_.get(); - } - - private: - void InitCLImage(const cl::Context& context, CLImageConverterBase* converter); - - void InitCLImage(const cl::Context& context, - int width, - int height, - void* data); - - bool initialized_ = false; - std::unique_ptr cl_image_{nullptr}; - std::unique_ptr cl_event_{nullptr}; - DDim tensor_dims_; - DDim image_dims_; - std::unique_ptr tensor_data_{nullptr}; - std::unique_ptr image_converter_{nullptr}; -}; - -} // namespace lite -} // namespace paddle diff --git a/lite/backends/opencl/cl_image_converter.cc b/lite/backends/opencl/cl_image_converter.cc deleted file mode 100644 index 402f710d7a..0000000000 --- a/lite/backends/opencl/cl_image_converter.cc +++ /dev/null @@ -1,461 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
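Editor's note: the private `InitCLImage(context, width, height, data)` above ultimately allocates a `CL_RGBA`/`CL_FLOAT` 2D image. A minimal sketch of the same allocation using the standard OpenCL C++ wrapper, detached from Lite (the function name is ours, and the wrapper header path depends on the SDK):

```cpp
#define CL_HPP_TARGET_OPENCL_VERSION 200
#define CL_HPP_MINIMUM_OPENCL_VERSION 110
#include <CL/cl2.hpp>  // header name/location varies by OpenCL SDK

// Each texel of a CL_RGBA/CL_FLOAT image packs four floats, which is why the
// converters in this patch group tensor channels into blocks of 4.
cl::Image2D MakeRGBAFloatImage(const cl::Context& ctx,
                               size_t width, size_t height,
                               const float* host_data /* may be nullptr */) {
  cl::ImageFormat format(CL_RGBA, CL_FLOAT);
  cl_int err = CL_SUCCESS;
  cl::Image2D image(ctx,
                    CL_MEM_READ_WRITE | (host_data ? CL_MEM_COPY_HOST_PTR : 0),
                    format, width, height, /*row_pitch=*/0,
                    const_cast<float*>(host_data), &err);
  // The deleted code checks err with CL_CHECK_FATAL; assert or throw here.
  return image;
}
```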
*/ - -#include "lite/backends/opencl/cl_image_converter.h" -#include -#include "lite/utils/cp_logging.h" - -namespace paddle { -namespace lite { - -DDim CLImageConverterDefault::InitImageDimInfoWith(const DDim &tensor_dim) { - size_t new_dims[] = {1, 1, 1, 1}; - for (size_t j = 0; j < tensor_dim.size(); ++j) { - new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; - } - size_t N, C, H, W; - N = new_dims[0]; - C = new_dims[1]; - H = new_dims[2]; - W = new_dims[3]; - size_t width = W * ((C + 3) / 4); - size_t height = H * N; - return DDim( - std::vector({static_cast(width), - static_cast(height)})); -} - -void CLImageConverterDefault::NCHWToImage(float *nchw, - float *image, - const DDim &tensor_dim) { - size_t new_dims[] = {1, 1, 1, 1}; - for (size_t j = 0; j < tensor_dim.size(); ++j) { - new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; - } - - size_t N, C, H, W; - N = new_dims[0]; - C = new_dims[1]; - H = new_dims[2]; - W = new_dims[3]; - - DDim in_image_dim = InitImageDimInfoWith(tensor_dim); - - VLOG(3) << " tensor dim: " << tensor_dim; - VLOG(3) << " image dim: " << in_image_dim; - - size_t width = in_image_dim[0]; - size_t w_block = width / W; - - float *p = nchw; - size_t i0 = 0; - for (size_t n = 0; n < N; n++) { - for (size_t c = 0; c < w_block * 4; c++) { - size_t i1 = i0 + (c / 4) * W; - for (size_t h = 0; h < H; h++) { - size_t i2 = (i1 << 2) + c % 4; - for (size_t w = 0; w < W; w++) { - if (c < C) { - // size_t x = (n * width * H + h * width + (c / 4) * W + w) * 4 + - // (c % 4); - image[i2] = *p; - i2 += 4; - p++; - } else { - image[i2] = 0.0; - i2 += 4; - } - } - i1 += width; - } - } - i0 += width * H; - } -} - -void CLImageConverterDefault::ImageToNCHW(float *image, - float *tensor, - const DDim &image_dim, - const DDim &tensor_dim) { - size_t new_dims[] = {1, 1, 1, 1}; - for (size_t j = 0; j < tensor_dim.size(); ++j) { - new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; - } - - size_t N, C, H, W; - N = new_dims[0]; - C = new_dims[1]; - H = new_dims[2]; - W = new_dims[3]; - - size_t width = image_dim[0]; - float *p = tensor; - - size_t i0 = 0; - for (size_t n = 0; n < N; n++) { - for (size_t c = 0; c < C; c++) { - size_t i1 = i0 + (c / 4) * W; - for (size_t h = 0; h < H; h++) { - size_t i2 = (i1 << 2) + c % 4; - for (size_t w = 0; w < W; w++) { - *p = image[i2]; - i2 += 4; - p++; - } - i1 += width; - } - } - i0 += width * H; - } -} - -DDim CLImageConverterFolder::InitImageDimInfoWith(const DDim &tensor_dim) { - if (tensor_dim.size() <= 2) { - size_t tdim[2] = {1, 1}; - if (tensor_dim.size() == 1) { - tdim[1] = tensor_dim[0]; - } else { - tdim[0] = tensor_dim[0]; - tdim[1] = tensor_dim[1]; - } - size_t width = (tdim[1] + 3) / 4; - size_t height = tdim[0]; - - width_of_one_block_ = width; - height_of_one_block_ = height; - c_block_ = 1; - - return DDim( - std::vector({static_cast(width), - static_cast(height)})); - - } else { - size_t new_dims[] = {1, 1, 1, 1}; - for (size_t j = 0; j < tensor_dim.size(); ++j) { - new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; - } - size_t N, C, H, W; - N = new_dims[0]; - C = new_dims[1]; - H = new_dims[2]; - W = new_dims[3]; - size_t width = W * ((C + 3) / 4); - size_t height = H * N; - - width_of_one_block_ = W; - height_of_one_block_ = H; - c_block_ = width / W; - - return DDim( - std::vector({static_cast(width), - static_cast(height)})); - } -} - -void CLImageConverterFolder::NCHWToImage(float *tensor, - float *image, - const DDim &tensor_dim) { - CHECK(tensor_dim.size() <= 4 && tensor_dim.size() > 0) - << " Tensor dim is not 
support!"; - - if (tensor_dim.size() > 2) { - CLImageConverterDefault default_converter; - default_converter.NCHWToImage(tensor, image, tensor_dim); - - } else { - size_t tdim[2] = {1, 1}; - if (tensor_dim.size() == 1) { - tdim[1] = tensor_dim[0]; - } else { - tdim[0] = tensor_dim[0]; - tdim[1] = tensor_dim[1]; - } - - DDim image_dim = InitImageDimInfoWith(tensor_dim); - size_t width = image_dim[0]; - - for (size_t h = 0; h < tdim[0]; h++) { - for (size_t w = 0; w < tdim[1]; w++) { - image[(h * width + w / 4) * 4 + (w % 4)] = tensor[h * tdim[1] + w]; - } - } - } -} - -void CLImageConverterFolder::ImageToNCHW(float *image, - float *tensor, - const DDim &image_dim, - const DDim &tensor_dim) { - if (tensor_dim.size() > 2) { - CLImageConverterDefault default_converter; - default_converter.ImageToNCHW(image, tensor, image_dim, tensor_dim); - - } else { - size_t width = image_dim[0]; - size_t H = 1, W = 1; - - if (tensor_dim.size() == 2) { - H = tensor_dim[0]; - W = tensor_dim[1]; - } else if (tensor_dim.size() == 1) { - W = tensor_dim[0]; - } - - float *p = tensor; - - for (size_t h = 0; h < H; h++) { - for (size_t w = 0; w < W; w++) { - p[h * W + w] = image[(h * width + w / 4) * 4 + (w % 4)]; - } - } - } -} - -DDim CLImageConverterNWBlock::InitImageDimInfoWith(const DDim &tensor_dim) { - CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4."; - size_t N, C, H, W; - N = tensor_dim[0]; - C = tensor_dim[1]; - H = tensor_dim[2]; - W = tensor_dim[3]; - size_t width = W * ((N + 3) / 4); - size_t height = C * H; - return DDim( - std::vector({static_cast(width), - static_cast(height)})); -} - -void CLImageConverterNWBlock::NCHWToImage(float *tensor, - float *image, - const DDim &tensor_dim) { - CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4."; - auto image_dim = InitImageDimInfoWith(tensor_dim); - float *p = tensor; - size_t N = tensor_dim[0]; - size_t C = tensor_dim[1]; - size_t H = tensor_dim[2]; - size_t W = tensor_dim[3]; - size_t width = image_dim[0]; - size_t height = image_dim[1]; - size_t block = image_dim[0] / tensor_dim[3]; - - for (size_t n = 0; n < block * 4; n++) { - for (size_t c = 0; c < C; c++) { - for (size_t h = 0; h < H; ++h) { - for (size_t w = 0; w < W; ++w) { - size_t index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) + - w * 4 + n % 4; - if (n < N) { - image[index] = *p; - p++; - } else { - image[index] = 0.0; - } - if (index >= (width * height * 4)) { - LOG(INFO) << " index out of range "; - } - } - } - } - } - VLOG(3) << " init done"; -} - -void CLImageConverterNWBlock::ImageToNCHW(float *image, - float *tensor, - const DDim &image_dim, - const DDim &tensor_dim) { - CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4."; - float *p = tensor; - size_t N = tensor_dim[0]; - size_t C = tensor_dim[1]; - size_t H = tensor_dim[2]; - size_t W = tensor_dim[3]; - size_t width = image_dim[0]; - size_t height = image_dim[1]; - - for (size_t n = 0; n < N; n++) { - for (size_t c = 0; c < C; c++) { - for (size_t h = 0; h < H; ++h) { - for (size_t w = 0; w < W; ++w) { - size_t index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) + - w * 4 + n % 4; - *p = image[index]; - p++; - if (index >= (width * height * 4)) { - LOG(INFO) << " index out of range "; - } - } - } - } - } - VLOG(3) << " init done"; -} - -DDim CLImageConverterDWBlock::InitImageDimInfoWith(const DDim &tensor_dim) { - CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4."; - size_t N, C, H, W; - N = tensor_dim[0]; - C = tensor_dim[1]; - H = tensor_dim[2]; - W = tensor_dim[3]; - size_t width = W * 
((N + 3) / 4); - size_t height = C * H; - return DDim( - std::vector({static_cast(width), - static_cast(height)})); -} - -void CLImageConverterDWBlock::NCHWToImage(float *tensor, - float *image, - const DDim &tensor_dim) { - size_t new_dims[] = {1, 1, 1, 1}; - for (size_t j = 0; j < tensor_dim.size(); ++j) { - new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; - } - - size_t N, C, H, W; - N = new_dims[1]; - C = new_dims[0]; - H = new_dims[2]; - W = new_dims[3]; - - DDim in_image_dim = InitImageDimInfoWith(tensor_dim); - - VLOG(3) << " tensor dim: " << tensor_dim; - VLOG(3) << " image dim: " << in_image_dim; - - size_t width = in_image_dim[0]; - size_t w_block = width / W; - - float *p = tensor; - size_t i0 = 0; - for (size_t n = 0; n < N; n++) { - for (size_t c = 0; c < w_block * 4; c++) { - size_t i1 = i0 + (c / 4) * W; - for (size_t h = 0; h < H; h++) { - size_t i2 = (i1 << 2) + c % 4; - for (size_t w = 0; w < W; w++) { - if (c < C) { - // size_t x = (n * width * H + h * width + (c / 4) * W + w) * 4 + - // (c % 4); - image[i2] = *p; - i2 += 4; - p++; - } else { - image[i2] = 0.0; - i2 += 4; - } - } - i1 += width; - } - } - i0 += width * H; - } -} - -void CLImageConverterDWBlock::ImageToNCHW(float *image, - float *tensor, - const DDim &image_dim, - const DDim &tensor_dim) { - CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4."; - float *p = tensor; - size_t N = tensor_dim[1]; - size_t C = tensor_dim[0]; - size_t H = tensor_dim[2]; - size_t W = tensor_dim[3]; - size_t width = image_dim[0]; - - size_t i0 = 0; - for (size_t n = 0; n < N; n++) { - for (size_t c = 0; c < C; c++) { - size_t i1 = i0 + (c / 4) * W; - for (size_t h = 0; h < H; h++) { - size_t i2 = (i1 << 2) + c % 4; - for (size_t w = 0; w < W; w++) { - *p = image[i2]; - i2 += 4; - p++; - } - i1 += width; - } - } - i0 += width * H; - } -} - -DDim CLImageConverterNormal::InitImageDimInfoWith(const DDim &tensor_dim) { - size_t new_dims[] = {1, 1, 1, 1}; - for (size_t j = 0; j < tensor_dim.size(); ++j) { - new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; - } - size_t N, C, H, W; - N = new_dims[0]; - C = new_dims[1]; - H = new_dims[2]; - W = new_dims[3]; - size_t width = W * ((C + 3) / 4); - size_t height = H * N; - - width_of_one_block_ = W; - height_of_one_block_ = H; - c_block_ = width / W; - - return DDim( - std::vector({static_cast(width), - static_cast(height)})); -} - -void CLImageConverterNormal::NCHWToImage(float *tensor, - float *image, - const DDim &tensor_dim) { - CHECK(tensor_dim.size() <= 4 && tensor_dim.size() > 0) - << " Tensor dim is not support!"; - - CLImageConverterDefault default_converter; - default_converter.NCHWToImage(tensor, image, tensor_dim); -} - -void CLImageConverterNormal::ImageToNCHW(float *image, - float *tensor, - const DDim &image_dim, - const DDim &tensor_dim) { - CLImageConverterDefault default_converter; - default_converter.ImageToNCHW(image, tensor, image_dim, tensor_dim); -} - -DDim CLImageConverterWinoTransWeight::InitImageDimInfoWith( - const DDim &tensor_dim) { - CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4."; - size_t N, C; - N = tensor_dim[0]; - C = tensor_dim[1]; - size_t width = (C + 3) / 4; - size_t height = N * 16; // N * (wino_blk_size + 2) * (wino_blk_size + 2) - return DDim( - std::vector({static_cast(width), - static_cast(height)})); -} - -void CLImageConverterWinoTransWeight::NCHWToImage(float *tensor, - float *image, - const DDim &tensor_dim) {} - -void CLImageConverterWinoTransWeight::ImageToNCHW(float *image, - float *tensor, - const DDim &image_dim, - const 
DDim &tensor_dim) {} - -} // namespace lite -} // namespace paddle diff --git a/lite/backends/opencl/cl_image_converter.h b/lite/backends/opencl/cl_image_converter.h deleted file mode 100644 index 6faa804557..0000000000 --- a/lite/backends/opencl/cl_image_converter.h +++ /dev/null @@ -1,139 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "lite/core/tensor.h" - -namespace paddle { -namespace lite { - -class CLImageConverterBase { - public: - virtual ~CLImageConverterBase() {} - - virtual void NCHWToImage(float *nchw, - float *image, - const DDim &tensor_dim) = 0; - - virtual void ImageToNCHW(float *image, - float *nchw, - const DDim &image_dim, - const DDim &tensor_dim) = 0; - virtual DDim InitImageDimInfoWith(const DDim &tensor_dim) = 0; -}; - -class CLImageConverterDefault : public CLImageConverterBase { - public: - DDim InitImageDimInfoWith(const DDim &tensor_dim) override; - void NCHWToImage(float *nchw, float *image, const DDim &tensor_dim) override; - void ImageToNCHW(float *image, - float *tensor, - const DDim &image_dim, - const DDim &tensor_dim) override; -}; - -class CLImageConverterFolder : public CLImageConverterBase { - public: - DDim InitImageDimInfoWith(const DDim &tensor_dim) override; - void NCHWToImage(float *tensor, - float *image, - const DDim &tensor_dim) override; - void ImageToNCHW(float *image, - float *tensor, - const DDim &image_dim, - const DDim &tensor_dim) override; - - /* - * width of original tensor - * */ - inline size_t WidthOfOneBlock() const { return width_of_one_block_; } - - /* - * height of original tensor - * */ - inline size_t HeightOfOneBlock() const { return height_of_one_block_; } - - int GetCBlock() const { return c_block_; } - - private: - int c_block_; - int width_of_one_block_; - int height_of_one_block_; -}; - -class CLImageConverterNormal : public CLImageConverterBase { - public: - DDim InitImageDimInfoWith(const DDim &tensor_dim) override; - void NCHWToImage(float *tensor, - float *image, - const DDim &tensor_dim) override; - void ImageToNCHW(float *image, - float *tensor, - const DDim &image_dim, - const DDim &tensor_dim) override; - - /* - * width of original tensor - * */ - inline size_t WidthOfOneBlock() const { return width_of_one_block_; } - - /* - * height of original tensor - * */ - inline size_t HeightOfOneBlock() const { return height_of_one_block_; } - - int GetCBlock() const { return c_block_; } - - private: - int c_block_; - int width_of_one_block_; - int height_of_one_block_; -}; - -class CLImageConverterNWBlock : public CLImageConverterBase { - DDim InitImageDimInfoWith(const DDim &tensor_dim) override; - void NCHWToImage(float *tensor, - float *image, - const DDim &tensor_dim) override; - void ImageToNCHW(float *image, - float *tensor, - const DDim &image_dim, - const DDim &tensor_dim) override; -}; -class CLImageConverterDWBlock : public CLImageConverterBase { - DDim InitImageDimInfoWith(const DDim &tensor_dim) override; - void NCHWToImage(float 
*tensor, - float *image, - const DDim &tensor_dim) override; - void ImageToNCHW(float *image, - float *tensor, - const DDim &image_dim, - const DDim &tensor_dim) override; -}; - -class CLImageConverterWinoTransWeight : public CLImageConverterBase { - public: - DDim InitImageDimInfoWith(const DDim &tensor_dim) override; - void NCHWToImage(float *tensor, - float *image, - const DDim &tensor_dim) override; - void ImageToNCHW(float *image, - float *tensor, - const DDim &image_dim, - const DDim &tensor_dim) override; -}; - -} // namespace lite -} // namespace paddle diff --git a/lite/backends/opencl/cl_include.h b/lite/backends/opencl/cl_include.h deleted file mode 100644 index 254782d629..0000000000 --- a/lite/backends/opencl/cl_include.h +++ /dev/null @@ -1,21 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#define CL_TARGET_OPENCL_VERSION 200 -#define CL_HPP_TARGET_OPENCL_VERSION 200 -#define CL_HPP_MINIMUM_OPENCL_VERSION 110 - -#include diff --git a/lite/backends/opencl/cl_kernel/buffer/depthwise_conv2d_kernel.cl b/lite/backends/opencl/cl_kernel/buffer/depthwise_conv2d_kernel.cl deleted file mode 100644 index ab575ba9b3..0000000000 --- a/lite/backends/opencl/cl_kernel/buffer/depthwise_conv2d_kernel.cl +++ /dev/null @@ -1,70 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
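Editor's note: a worked example of the "default" image layout implemented by the converters above. An NCHW tensor maps to a `W * ceil(C/4)` by `H * N` RGBA image, with four consecutive channels packed into one texel; this standalone sketch reproduces the index bookkeeping of `CLImageConverterDefault::NCHWToImage`:

```cpp
#include <cstdio>

int main() {
  const size_t N = 1, C = 3, H = 224, W = 224;
  const size_t width = W * ((C + 3) / 4);  // 224 texels per row
  const size_t height = H * N;             // 224 rows
  std::printf("image dims: %zu x %zu\n", width, height);

  // Flat float index of tensor element (n, c, h, w) in the image buffer,
  // matching the commented-out formula in CLImageConverterDefault:
  // (n*width*H + h*width + (c/4)*W + w) * 4 + (c % 4)
  const size_t n = 0, c = 2, h = 10, w = 5;
  const size_t idx =
      ((n * width * H + h * width + (c / 4) * W + w) << 2) + c % 4;
  std::printf("flat float index of (n=%zu,c=%zu,h=%zu,w=%zu): %zu\n",
              n, c, h, w, idx);
  return 0;
}
```

The channels beyond `C` within the last block of 4 are zero-filled, which keeps kernel-side `float4` loads well defined.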
*/
-
-#include <cl_common.h>
-
-__kernel void depthwise_conv2d(const int numel, // num of elements
-                               __global CL_DTYPE* input_data,
-                               const int height,
-                               const int width,
-                               const int conved_channel,
-                               const int conved_height,
-                               const int conved_width,
-                               const int kernel_h,
-                               const int kernel_w,
-                               const int stride_h,
-                               const int stride_w,
-                               const int pad_h,
-                               const int pad_w,
-                               __global CL_DTYPE* output_data,
-                               __global CL_DTYPE* weight_data,
-                               __global CL_DTYPE* bias_data) {
-  int index = get_global_id(0);
-  int tmp = get_global_size(0);
-  for(index; index < numel; index += tmp) {
-    const int pw = index % conved_width;
-    const int ph = (index / conved_width) % conved_height;
-    const int c = (index / conved_width / conved_height) % conved_channel;
-    const int n = index / conved_width / conved_height / conved_channel;
-    int hstart = ph * stride_h - pad_h;
-    int wstart = pw * stride_w - pad_w;
-    int hend = min(hstart + kernel_h, height + pad_h);
-    int wend = min(wstart + kernel_w, width + pad_w);
-    hstart = max(hstart, 0);
-    wstart = max(wstart, 0);
-    hend = min(hend, height);
-    wend = min(wend, width);
-    CL_DTYPE v = 0;
-    __global CL_DTYPE* input_slice =
-        input_data + (n * conved_channel + c) * height * width;
-    __global CL_DTYPE* weight_slice =
-        weight_data + c * kernel_h * kernel_w;
-    int khstart = hend < kernel_h ? kernel_h - hend : 0;
-    int kwstart = wend < kernel_w? kernel_w - wend : 0;
-    for (int h = hstart; h < hend; ++h) {
-      for (int w = wstart; w < wend; ++w) {
-        v += input_slice[h * width + w]
-             * weight_slice[(khstart + h - hstart) * kernel_w + (kwstart + w - wstart)];
-      }
-    }
-    if(bias_data != NULL){
-      v += bias_data[c];
-    }
-#ifdef RELU
-    output_data[index] = activation(v);
-#else
-    output_data[index] = v;
-#endif
-  }
-}
diff --git a/lite/backends/opencl/cl_kernel/buffer/elementwise_add_kernel.cl b/lite/backends/opencl/cl_kernel/buffer/elementwise_add_kernel.cl
deleted file mode 100644
index bb6faea629..0000000000
--- a/lite/backends/opencl/cl_kernel/buffer/elementwise_add_kernel.cl
+++ /dev/null
@@ -1,45 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
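Editor's note: each work-item of the depthwise kernel above computes one output element. A scalar C++ reference that mirrors its clamp/offset arithmetic one-to-one (a sketch for checking, not code from this patch; parameter names follow the kernel):

```cpp
#include <algorithm>
#include <vector>

float DepthwiseConvAt(const std::vector<float>& input,
                      const std::vector<float>& weight,
                      const float* bias,  // one value per channel, or nullptr
                      int conved_channel, int height, int width,
                      int kernel_h, int kernel_w,
                      int stride_h, int stride_w, int pad_h, int pad_w,
                      int n, int c, int ph, int pw) {
  int hstart = ph * stride_h - pad_h;
  int wstart = pw * stride_w - pad_w;
  int hend = std::min(hstart + kernel_h, height + pad_h);
  int wend = std::min(wstart + kernel_w, width + pad_w);
  hstart = std::max(hstart, 0);
  wstart = std::max(wstart, 0);
  hend = std::min(hend, height);
  wend = std::min(wend, width);
  // When the window is clipped at the top/left border, the filter taps are
  // offset by the clipped amount (khstart/kwstart in the kernel):
  int khstart = hend < kernel_h ? kernel_h - hend : 0;
  int kwstart = wend < kernel_w ? kernel_w - wend : 0;
  const float* in_slice =
      input.data() + (n * conved_channel + c) * height * width;
  const float* w_slice = weight.data() + c * kernel_h * kernel_w;
  float v = 0.f;
  for (int h = hstart; h < hend; ++h) {
    for (int w = wstart; w < wend; ++w) {
      v += in_slice[h * width + w] *
           w_slice[(khstart + h - hstart) * kernel_w + (kwstart + w - wstart)];
    }
  }
  if (bias) v += bias[c];
  return v;
}
```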
*/ - -#include - -__kernel void elementwise_add(__global const CL_DTYPE* x_data, - __global const CL_DTYPE* y_data, - __global CL_DTYPE* out_data, - const int batch, - const int channels, - const int num) { - - const int c = get_global_id(0); // c: [0, channels) - const int b = get_global_id(1); // b: [0, batch) - - if ((c >= channels) || (b >= batch)) { - return; - } - - const int offset = (b * channels + c) * num; - - __global const CL_DTYPE* din_ptr = x_data + offset; - const CL_DTYPE diny_data = y_data[c]; - __global CL_DTYPE* dout_ptr = out_data + offset; - - for (int n = 0; n < num; ++n) { // n: [0, h*w) - *dout_ptr = *din_ptr + diny_data; -#ifdef RELU - *dout_ptr = activation(*dout_ptr); -#endif - ++dout_ptr; - ++din_ptr; - } -} diff --git a/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl b/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl deleted file mode 100644 index b8dbf62c06..0000000000 --- a/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl +++ /dev/null @@ -1,424 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - - -#define SRC(i, j) src[i * src_width + j] -#define DST(i, j) dst[i * src_height + j] -__kernel -void mat_transpose(__global const CL_DTYPE* src, - __global CL_DTYPE* dst, - const int src_height, const int src_width) { - const int col = get_global_id(0); // [0, src_width) columns of src - const int row = get_global_id(1); // [0, src_height) rows of src - DST(col, row) = SRC(row, col); -} - - -// fc_gemm_naive: keep for check -// a: x_d -// b: filter_d -// c: output_d -__kernel -void fc_gemm_naive(__global const CL_DTYPE* a, - __global const CL_DTYPE* b, - __global const CL_DTYPE* bias, - __global CL_DTYPE* c, - const int M, const int N, const int K) { - const int row = get_global_id(0); // [0, M) height of out == m - const int col = get_global_id(1); // [0, N) width of out == n - - if ((col >= N) || (row >= M)) { - return; - } - - CL_DTYPE a0, b0, - c0 = (bias && col < N) ? bias[col] : 0; - - for (int p = 0; p < K; ++p) { - a0 = *(a + row * K + p); - b0 = *(b + p * N + col); - c0 += a0 * b0; - } - -#ifdef RELU - c[row * N + col] = activation(c0); -#else - c[row * N + col] = c0; -#endif -} - - -// gemm_batch_naive: used for conv1x1, gemm of im2col_gemm -// a: filter_d -// b: x_d -// c: output_d -__kernel -void gemm_batch_naive(__global const CL_DTYPE* a, - __global const CL_DTYPE* b, - __global const CL_DTYPE* bias, - __global CL_DTYPE* c, - const int M, const int N, const int K, const int batch_size) { - const int row = get_global_id(0); // [0, M) height of out == m - const int col = get_global_id(1); // [0, N) width of out == n - const int bidx = get_global_id(2); // [0, batch_size) - - const __global CL_DTYPE* cur_b = b + K * N * bidx; - __global CL_DTYPE* cur_c = c + M * N * bidx; - - if ((col >= N) || (row >= M) || (bidx >= batch_size)) { - return; - } - - CL_DTYPE a0, b0, - c0 = (bias && col < N) ? 
bias[row] : 0; - - for (int p = 0; p < K; ++p) { - a0 = *(a + row * K + p); - b0 = *(cur_b + p * N + col); - c0 += a0 * b0; - } - -#ifdef RELU - cur_c[row * N + col] = activation(c0); -#else - cur_c[row * N + col] = c0; -#endif -} - - -// gemm_batch_8x4_buf_buf_N_N: used for conv1x1, gemm of im2col_gemm -// a: filter_d -// b: x_d -// c: output_d - -//#define PRINT_KERNEL -__kernel -void gemm_batch(__global const CL_DTYPE* Aptr, - __global const CL_DTYPE* Bptr, - __global const CL_DTYPE* bias, - __global CL_DTYPE* Cptr, - const int M, const int N, const int K, const int batch_size) { - - int row = get_global_id(0) << 3; // [0, M >> 3) height of out == m - int col = get_global_id(1) << 2; // [0, N >> 2) width of out == n - const int bidx = get_global_id(2); // [0, batch_size) - - // update B(input), C(output) with batch_size - Aptr += mul24(row, K); // A += row * K - Bptr += mad24(mul24(K, N), bidx, col); // B += K * N * bidx + col - Cptr += mad24(mul24(M, N), bidx, mul24(row, N)); // C += M * N * bidx + row * N - - CL_DTYPE4 a8x4[8]; - CL_DTYPE4 b4x4[4] = {0.f, 0.f, 0.f, 0.f}; - CL_DTYPE4 c8x4[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; - - if (bias) { - c8x4[0] = bias[row]; - c8x4[1] = bias[row + 1]; - c8x4[2] = bias[row + 2]; - c8x4[3] = bias[row + 3]; - c8x4[4] = bias[row + 4]; - c8x4[5] = bias[row + 5]; - c8x4[6] = bias[row + 6]; - c8x4[7] = bias[row + 7]; - } - - // main loop of K - short pos = 0; - for (; pos < K - 3; pos += 4) { - b4x4[0] = vload4(0, Bptr + mul24(pos, N)); - b4x4[1] = vload4(0, Bptr + mul24(pos+1, N)); - b4x4[2] = vload4(0, Bptr + mul24(pos+2, N)); - b4x4[3] = vload4(0, Bptr + mul24(pos+3, N)); - - // main compute of main loop K: pos + 3 < K - #pragma unroll(8) - for (int i = 0; i < 8 && i < M; ++i) { // M direction - a8x4[i] = vload4(0, Aptr + mad24(i, K, pos)); - - c8x4[i] += a8x4[i].x * b4x4[0]; - c8x4[i] += a8x4[i].y * b4x4[1]; - c8x4[i] += a8x4[i].z * b4x4[2]; - c8x4[i] += a8x4[i].w * b4x4[3]; - } - } - - // compute left K - if (pos < K) { - b4x4[0] = 0.0f; - b4x4[1] = 0.0f; - b4x4[2] = 0.0f; - // b4x4[3] = 0.0f; // impossible used - switch (K - pos) { - case 3: - b4x4[2] = vload4(0, Bptr + mul24(pos+2, N)); - - case 2: - b4x4[1] = vload4(0, Bptr + mul24(pos+1, N)); - - case 1: - b4x4[0] = vload4(0, Bptr + mul24(pos, N)); - } - - #pragma unroll(8) - for (int i = 0; i < 8; i++) { - a8x4[i] = vload4(0, Aptr + mad24(i, K, pos)); - - c8x4[i] += a8x4[i].x * b4x4[0] + - a8x4[i].y * b4x4[1] + - a8x4[i].z * b4x4[2]; - } - } - -#ifdef RELU - #pragma unroll(8) - for (int i = 0; i < 8; ++i) { - c8x4[i] = fmax(c8x4[i], (CL_DTYPE4)0.f); - } -#endif - - // store c - if (row + 7 < M && col + 3 < N) { - #pragma unroll(8) - for (int i = 0; i < 8; i++) { // M direction - vstore4(c8x4[i], 0, Cptr + mad24(i, N, col)); - } - } else { - for (int i = 0; i < 8 && i + row < M; ++i) { // M direction - if (col + 3 < N) { - vstore4(c8x4[i], 0, Cptr + mad24(i, N, col)); - } else { - switch (N - col) { - case 3: - *(Cptr + mad24(i, N, col + 2)) = c8x4[i].s2; - case 2: - *(Cptr + mad24(i, N, col + 1)) = c8x4[i].s1; - case 1: - *(Cptr + mad24(i, N, col)) = c8x4[i].s0; - } - } - } - } -} - - -// fc_gemv_naive: keep for check -// used for fc with M = 1 -// a: param.input {M, K} -// b: param.w {K, N} -// c: param.output {M, N} -__kernel -void fc_gemv_naive(__global const CL_DTYPE* a, - __global const CL_DTYPE* b, - __global const CL_DTYPE* bias, - __global CL_DTYPE* c, - const int M, const int N, const int K) { - const int col = get_global_id(0); // gws[0]: [0, N) width of B == N - - if (col 
>= N) { - return; - } - CL_DTYPE c0 = bias ? bias[col] : 0; - for (int p = 0; p < K; ++p) { - CL_DTYPE a0 = *(a + p); - CL_DTYPE b0 = *(b + p * N + col); - c0 += a0 * b0; - } - -#ifdef RELU - c[col] = activation(c0); -#else - c[col] = c0; -#endif -} - - -// fc_gemv_1x4: for fc with M = 1 -// a: param.input {M, K} -// b: param.w {K, N} -// c: param.output {M, N} -__kernel -void fc_gemv_1x4(__global const CL_DTYPE* a, - __global const CL_DTYPE* b, - __global const CL_DTYPE* bias, - __global CL_DTYPE* c, - const int M, const int N, const int K) { - const int col = get_global_id(0) << 2; // gws[0]: [0, N >> 2) height of B == N - - if (col + 3 < N) { - CL_DTYPE4 c0 = 0.0f; - if (bias) { - c0.x = bias[col]; - c0.y = bias[col+1]; - c0.z = bias[col+2]; - c0.w = bias[col+3]; - } - - // main loop of K - int p = 0; - for (; p < K - 3; p += 4) { - CL_DTYPE4 a0 = vload4(0, a + p); - CL_DTYPE4 b0 = vload4(0, b + p * N + col); - CL_DTYPE4 b1 = vload4(0, b + (p+1) * N + col); - CL_DTYPE4 b2 = vload4(0, b + (p+2) * N + col); - CL_DTYPE4 b3 = vload4(0, b + (p+3) * N + col); - - c0 += a0.x * b0; - c0 += a0.y * b1; - c0 += a0.z * b2; - c0 += a0.w * b3; - } - - // compute left K - CL_DTYPE4 b2 = 0.0f, - b1 = 0.0f, - b0 = 0.0f, - a0 = 0.0f; - switch (K - p) { - case 3: { - b2 = vload4(0, b + (p+2) * N + col); - a0.z = a[p + 2]; - } - case 2: { - b1 = vload4(0, b + (p+1) * N + col); - a0.y = a[p + 1]; - } - case 1: { - b0 = vload4(0, b + (p) * N + col); - a0.x = a[p]; - } - } - c0 += a0.x * b0; - c0 += a0.y * b1; - c0 += a0.z * b2; - - // store res -#ifdef RELU - if (col % 4 == 0) { - vstore4(fmax(c0, (CL_DTYPE4)0.f), 0, c + col); - } else { - switch (col % 4) { - case 3: - c[col + 2] = activation(c0.z); - case 2: - c[col + 1] = activation(c0.y); - case 1: - c[col] = activation(c0.x); - } - } -#else - if (col % 4 == 0) { - vstore4(c0, 0, c + col); - } else { - switch (col % 4) { - case 3: - c[col + 2] = c0.z; - case 2: - c[col + 1] = c0.y; - case 1: - c[col] = c0.x; - } - } -#endif - } else { - const int left_col = N - col; - for (int col_offset = 0; col_offset < left_col; ++col_offset) { - CL_DTYPE c0 = bias ? bias[col] : 0; - for (int p = 0; p < K; ++p) { - CL_DTYPE b0 = *(b + p * N + col + col_offset); - CL_DTYPE a0 = *(a + p); - c0 += a0 * b0; - } -#ifdef RELU - c[col + col_offset] = activation(c0); -#else - c[col + col_offset] = c0; -#endif - } - } -} - - -// fc_gemm_4x4: for fc with M = 1 -// a: param.input {M, K} -// b: param.w {K, N} -// c: param.output {M, N} -__kernel -void fc_gemm_4x4(__global const CL_DTYPE* a, - __global const CL_DTYPE* b, - __global const CL_DTYPE* bias, - __global CL_DTYPE* c, - const int M, const int N, const int K) { - const int row = get_global_id(0) << 2; // id: [0, M>>2) height of out == M - const int col = get_global_id(1) << 2; // id: [0, N>>2) width of out == N - - if (row+3 < M && col+3 < N) { - CL_DTYPE bias0 = bias ? bias[col] : 0, - bias1 = bias ? bias[col+1] : 0, - bias2 = bias ? bias[col+2] : 0, - bias3 = bias ? 
bias[col+3] : 0; - - CL_DTYPE c00 = bias0, c01 = bias1, c02 = bias2, c03 = bias3, - c10 = bias0, c11 = bias1, c12 = bias2, c13 = bias3, - c20 = bias0, c21 = bias1, c22 = bias2, c23 = bias3, - c30 = bias0, c31 = bias1, c32 = bias2, c33 = bias3; - - for (int p = 0; p < K; ++p) { - CL_DTYPE - a00 = *(a + row * K + p), - a10 = *(a + (row + 1) * K + p), - a20 = *(a + (row + 2) * K + p), - a30 = *(a + (row + 3) * K + p), - - b00 = *(b + p * N + col), - b01 = *(b + p * N + (col + 1)), - b02 = *(b + p * N + (col + 2)), - b03 = *(b + p * N + (col + 3)); - - c00 += a00 * b00; c01 += a00 * b01; c02 += a00 * b02; c03 += a00 * b03; - c10 += a10 * b00; c11 += a10 * b01; c12 += a10 * b02; c13 += a10 * b03; - c20 += a20 * b00; c21 += a20 * b01; c22 += a20 * b02; c23 += a20 * b03; - c30 += a30 * b00; c31 += a30 * b01; c32 += a30 * b02; c33 += a30 * b03; - } -#if defined(RELU) - c[row*N+col] = fmax(c00, 0); c[row*N+(col+1)] = fmax(c01, 0); c[row*N+(col+2)] = fmax(c02, 0); c[row*N+(col+3)] = fmax(c03, 0); - c[(row+1)*N+col] = fmax(c10, 0); c[(row+1)*N+(col+1)] = fmax(c11, 0); c[(row+1)*N+(col+2)] = fmax(c12, 0); c[(row+1)*N+(col+3)] = fmax(c13, 0); - c[(row+2)*N+col] = fmax(c20, 0); c[(row+2)*N+(col+1)] = fmax(c21, 0); c[(row+2)*N+(col+2)] = fmax(c22, 0); c[(row+2)*N+(col+3)] = fmax(c23, 0); - c[(row+3)*N+col] = fmax(c30, 0); c[(row+3)*N+(col+1)] = fmax(c31, 0); c[(row+3)*N+(col+2)] = fmax(c32, 0); c[(row+3)*N+(col+3)] = fmax(c33, 0); -#else - c[row*N+col] = c00; c[row*N+(col+1)] = c01; c[row*N+(col+2)] = c02; c[row*N+(col+3)] = c03; - c[(row+1)*N+col] = c10; c[(row+1)*N+(col+1)] = c11; c[(row+1)*N+(col+2)] = c12; c[(row+1)*N+(col+3)] = c13; - c[(row+2)*N+col] = c20; c[(row+2)*N+(col+1)] = c21; c[(row+2)*N+(col+2)] = c22; c[(row+2)*N+(col+3)] = c23; - c[(row+3)*N+col] = c30; c[(row+3)*N+(col+1)] = c31; c[(row+3)*N+(col+2)] = c32; c[(row+3)*N+(col+3)] = c33; -#endif - } else { - for (int cidx = col; cidx < N; ++cidx) { - for (int ridx = row; ridx < M; ++ridx) { - CL_DTYPE a0, b0, c0 = bias ? bias[cidx] : 0; - for (int p = 0; p < K; ++p) { - a0 = *(a + ridx * K + p); - b0 = *(b + p * N + cidx), - c0 += a0 * b0; - } -#if defined(RELU) - c[ridx * N + cidx] = fmax(c0, 0); -#else - c[ridx * N + cidx] = c0; -#endif - } - } - } -} diff --git a/lite/backends/opencl/cl_kernel/buffer/im2col_kernel.cl b/lite/backends/opencl/cl_kernel/buffer/im2col_kernel.cl deleted file mode 100644 index fe71f4c6ff..0000000000 --- a/lite/backends/opencl/cl_kernel/buffer/im2col_kernel.cl +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
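Editor's note: `fc_gemm_4x4` above covers a 4x4 output tile per work-item (`row = gid(0) << 2`, `col = gid(1) << 2`) and `gemm_batch` an 8x4 tile per work-item per batch. One plausible host-side NDRange sizing under that tiling (an assumption for illustration, not host code from this patch; the kernels' scalar tail branches absorb any remainder):

```cpp
#include <cstdio>

int main() {
  const int M = 17, N = 30, batch = 3;  // example problem sizes
  std::printf("fc_gemm_4x4 gws = {%d, %d}\n", (M + 3) / 4, (N + 3) / 4);
  std::printf("gemm_batch  gws = {%d, %d, %d}\n",
              (M + 7) / 8, (N + 3) / 4, batch);
  return 0;
}
```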
*/ - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable -#define CL_DTYPE float - -__kernel -void im2col(__global const CL_DTYPE* data_im, const int img_offset, - const int col_chw, - const int height, const int width, - const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, - const int stride_h, const int stride_w, - const int dilation_h, const int dilation_w, - const int height_col, const int width_col, - __global CL_DTYPE* col_data, const int col_offset) { - int index = get_global_id(0); // [0, col_chw) - - data_im = data_im + img_offset; - col_data = col_data + col_offset; - - if(index < col_chw) { - int w_out = index % width_col; - int h_index = index / width_col; - int h_out = h_index % height_col; - int channel_in = h_index / height_col; - - int channel_out = channel_in * kernel_h * kernel_w; - int h_in = h_out * stride_h - pad_h; - int w_in = w_out * stride_w - pad_w; - - __global CL_DTYPE* col_data_ptr = col_data; - col_data_ptr += (channel_out * height_col + h_out) * width_col + w_out; - __global const CL_DTYPE* data_im_ptr = data_im; - data_im_ptr += (channel_in * height + h_in) * width + w_in; - - int dh = 0; - for (int i = 0; i < kernel_h; ++i) { - int dw = 0; - for (int j = 0; j < kernel_w; ++j) { - int h = h_in + dh; - int w = w_in + dw; - *col_data_ptr = (h >= 0 && w >= 0 && h < height && w < width) - ? data_im_ptr[dh * width + dw] - : 0; - col_data_ptr += height_col * width_col; - dw += dilation_w; - } - dh += dilation_h; - } - } -} - diff --git a/lite/backends/opencl/cl_kernel/buffer/mat_mul_kernel.cl b/lite/backends/opencl/cl_kernel/buffer/mat_mul_kernel.cl deleted file mode 100644 index f6c88c9430..0000000000 --- a/lite/backends/opencl/cl_kernel/buffer/mat_mul_kernel.cl +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
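Editor's note: a standalone decode of a single im2col work-item index, mirroring the kernel above. Each of the `channels * height_col * width_col` items owns one (channel, output-pixel) pair and writes one `kernel_h * kernel_w` column of the patch matrix:

```cpp
#include <cstdio>

int main() {
  const int height_col = 112, width_col = 112;
  const int stride_h = 2, stride_w = 2, pad_h = 1, pad_w = 1;
  const int index = 54321;  // arbitrary global id, same decode as the kernel
  const int w_out = index % width_col;
  const int h_index = index / width_col;
  const int h_out = h_index % height_col;
  const int channel_in = h_index / height_col;
  const int h_in = h_out * stride_h - pad_h;  // patch origin; may be negative
  const int w_in = w_out * stride_w - pad_w;  // inside the zero padding
  std::printf("channel=%d out=(%d,%d) patch origin=(%d,%d)\n",
              channel_in, h_out, w_out, h_in, w_in);
  return 0;
}
```

Out-of-bounds taps are written as 0 by the kernel's bounds check, which is what makes the negative patch origins safe.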
*/ - -#include - -#if 0 -// naive gemm: keep for check -__kernel -void mat_mul(__global const CL_DTYPE* x, - __global const CL_DTYPE* y, - __global CL_DTYPE* out, - const int M, const int N, const int K) { - const int row = get_global_id(0); // [0, M) columns of out == m - const int col = get_global_id(1); // [0, N) rows of out == n - - if ((col >= N) || (row >= M)) { - return; - } - - CL_DTYPE x0, y0, - out0 = 0; - - for (int p = 0; p < K; ++p) { - x0 = *(x + row * K + p); - y0 = *(y + p * N + col); - out0 += x0 * y0; - } - - out[row * N + col] = out0; -} -#endif // naive gemm - -__kernel -void mat_mul(__global const CL_DTYPE* a, - __global const CL_DTYPE* b, - __global CL_DTYPE* c, - const int M, const int N, const int K) { - const int row = get_global_id(0) << 2; // id: [0, M>>2) height of out == M - const int col = get_global_id(1) << 2; // id: [0, N>>2) width of out == N - - if (row+3 < M && col+3 < N) { - CL_DTYPE c00 = 0, c01 = 0, c02 = 0, c03 = 0, - c10 = 0, c11 = 0, c12 = 0, c13 = 0, - c20 = 0, c21 = 0, c22 = 0, c23 = 0, - c30 = 0, c31 = 0, c32 = 0, c33 = 0; - - for (int p = 0; p < K; p++) { - - CL_DTYPE a00 = *(a + row * K + p), - a10 = *(a + (row + 1) * K + p), - a20 = *(a + (row + 2) * K + p), - a30 = *(a + (row + 3) * K + p), - - b00 = *(b + p * N + col), - b01 = *(b + p * N + (col+1)), - b02 = *(b + p * N + (col+2)), - b03 = *(b + p * N + (col+3)); - - c00 += a00 * b00; c01 += a00 * b01; c02 += a00 * b02; c03 += a00 * b03; - c10 += a10 * b00; c11 += a10 * b01; c12 += a10 * b02; c13 += a10 * b03; - c20 += a20 * b00; c21 += a20 * b01; c22 += a20 * b02; c23 += a20 * b03; - c30 += a30 * b00; c31 += a30 * b01; c32 += a30 * b02; c33 += a30 * b03; - } - c[row*N+col] = c00; c[row*N+(col+1)] = c01; c[row*N+(col+2)] = c02; c[row*N+(col+3)] = c03; - c[(row+1)*N+col] = c10; c[(row+1)*N+(col+1)] = c11; c[(row+1)*N+(col+2)] = c12; c[(row+1)*N+(col+3)] = c13; - c[(row+2)*N+col] = c20; c[(row+2)*N+(col+1)] = c21; c[(row+2)*N+(col+2)] = c22; c[(row+2)*N+(col+3)] = c23; - c[(row+3)*N+col] = c30; c[(row+3)*N+(col+1)] = c31; c[(row+3)*N+(col+2)] = c32; c[(row+3)*N+(col+3)] = c33; - } else { - for(int cidx = col; cidx < N; ++cidx) { - for (int ridx = row; ridx < M; ++ridx) { - CL_DTYPE a0, b0, c0 = 0; - for (int p = 0; p < K; ++p) { - a0 = *(a + ridx * K + p); - b0 = *(b + p * N + cidx), - c0 += a0 * b0; - } - c[ridx * N + cidx] = c0; - } - } - } -} - diff --git a/lite/backends/opencl/cl_kernel/buffer/pool_kernel.cl b/lite/backends/opencl/cl_kernel/buffer/pool_kernel.cl deleted file mode 100644 index edf8f119eb..0000000000 --- a/lite/backends/opencl/cl_kernel/buffer/pool_kernel.cl +++ /dev/null @@ -1,112 +0,0 @@ -/************************************************************************************* - * Copyright (c) 2015, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, - * are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation and/or - * other materials provided with the distribution. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, - * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, - * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - **************************************************************************************/ - -#include - -#define MIN_VALUE -FLT_MAX - -__kernel void pool_max(const int numel, // num of elements - __global CL_DTYPE* input_data, - const int channels, - const int height, - const int width, - const int pooled_height, - const int pooled_width, - const int kernel_h, - const int kernel_w, - const int stride_h, - const int stride_w, - const int pad_h, - const int pad_w, - __global CL_DTYPE* output_data) { - int index = get_global_id(0); - int tmp = get_global_size(0); - for(index; index < numel; index += tmp) { - int pw = index % pooled_width; - int ph = (index / pooled_width) % pooled_height; - int c = (index / pooled_width / pooled_height) % channels; - int n = index / pooled_width / pooled_height / channels; - int hstart = ph * stride_h - pad_h; - int wstart = pw * stride_w - pad_w; - const int hend = min(hstart + kernel_h, height); - const int wend = min(wstart + kernel_w, width); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - CL_DTYPE maxval = MIN_VALUE; - int maxidx = -1; - input_data = - input_data + (n * channels + c) * height * width; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - if (input_data[h * width + w] > maxval) { - maxidx = h * width + w; - maxval = input_data[maxidx]; - } - } - } - output_data[index] = maxval; - } -} - -__kernel void pool_avg(const int numel, - __global CL_DTYPE* input_data, - const int channels, - const int height, - const int width, - const int pooled_height, - const int pooled_width, - const int kernel_h, - const int kernel_w, - const int stride_h, - const int stride_w, - const int pad_h, - const int pad_w, - __global CL_DTYPE* output_data) { - int index = get_global_id(0); - int tmp = get_global_size(0); - for(index; index < numel; index+=tmp) { - int pw = index % pooled_width; - int ph = (index / pooled_width) % pooled_height; - int c = (index / pooled_width / pooled_height) % channels; - int n = index / pooled_width / pooled_height / channels; int hstart = ph * stride_h - pad_h; int wstart = pw * stride_w - pad_w; - int hend = min(hstart + kernel_h, height + pad_h); - int wend = min(wstart + kernel_w, width + pad_w); - const int pool_size = (hend - hstart) * (wend - wstart); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - hend = min(hend, height); - wend = min(wend, width); - CL_DTYPE aveval = 0; - input_data = - input_data + (n * channels + c) * height * width; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - aveval += input_data[h * width + w]; - } - } - output_data[index] = aveval / pool_size; - } -} diff --git a/lite/backends/opencl/cl_kernel/buffer/relu_kernel.cl 
b/lite/backends/opencl/cl_kernel/buffer/relu_kernel.cl
deleted file mode 100644
index b07dc8132f..0000000000
--- a/lite/backends/opencl/cl_kernel/buffer/relu_kernel.cl
+++ /dev/null
@@ -1,22 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <cl_common.h>
-
-__kernel void relu(__global const CL_DTYPE* x_data, const int count, __global CL_DTYPE* out_data) {
-  const int index = get_global_id(0);
-  if (index < count) {
-    out_data[index] = activation(x_data[index]);
-  }
-}
diff --git a/lite/backends/opencl/cl_kernel/cl_common.h b/lite/backends/opencl/cl_kernel/cl_common.h
deleted file mode 100644
index ec67aa676d..0000000000
--- a/lite/backends/opencl/cl_kernel/cl_common.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-#define GET_VEC_TYPE(type__, size__) type__##size__
-#define VECTORIZED_TYPE(type__, size__) GET_VEC_TYPE(type__, size__)
-#define CL_DTYPE4 VECTORIZED_TYPE(CL_DTYPE, 4)
-
-inline CL_DTYPE activation(CL_DTYPE in
-#ifdef PRELU
-                           ,
-                           CL_DTYPE prelu_alpha
-#endif
-                           ) {
-  CL_DTYPE output;
-#ifdef PRELU
-  output = select(prelu_alpha * in, in, in >= (CL_DTYPE)0);
-#endif
-
-#ifdef RELU
-  output = fmax(in, (CL_DTYPE)0);
-#endif
-  return output;
-}
diff --git a/lite/backends/opencl/cl_kernel/image/channel_add_kernel.cl b/lite/backends/opencl/cl_kernel/image/channel_add_kernel.cl
deleted file mode 100644
index c106377830..0000000000
--- a/lite/backends/opencl/cl_kernel/image/channel_add_kernel.cl
+++ /dev/null
@@ -1,29 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
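Editor's note: the `RELU`/`PRELU` branches in `cl_common.h` above are not runtime switches; they are selected when the host compiles the kernel source with `-D` macros (cf. `CLRuntime::BuildProgram` later in this patch, which already appends `-cl-fast-relaxed-math` and the `cl_kernel` include path). A sketch of how such options might be assembled; the function name and exact option strings are illustrative:

```cpp
#include <iostream>
#include <string>

// Options passed to cl::Program::build(devices, opts.c_str()); CL_DTYPE and
// RELU specialize the kernel source at compile time.
std::string KernelBuildOptions(bool fuse_relu) {
  std::string opts = "-DCL_DTYPE=float";
  if (fuse_relu) opts += " -DRELU";
  return opts;
}

int main() {
  std::cout << KernelBuildOptions(true) << "\n";  // "-DCL_DTYPE=float -DRELU"
  return 0;
}
```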
*/ - -__kernel void channel_add(__read_only image2d_t input, __read_only image2d_t bias, __write_only image2d_t outputImage, __private const int w) { - int x = get_global_id(0); - int y = get_global_id(1); - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - int2 coords; - coords.x = x; - coords.y = y; - int2 coords_bias; - coords_bias.x = x/w; - coords_bias.y = 0; - float4 in = read_imagef(input, sampler, coords); - float4 biase = read_imagef(bias, sampler, coords_bias); - float4 output = in + biase; - write_imagef(outputImage, coords, output); - } diff --git a/lite/backends/opencl/cl_kernel/image/elementwise_add_kernel.cl b/lite/backends/opencl/cl_kernel/image/elementwise_add_kernel.cl deleted file mode 100644 index ecf719ae93..0000000000 --- a/lite/backends/opencl/cl_kernel/image/elementwise_add_kernel.cl +++ /dev/null @@ -1,26 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -__kernel void elementwise_add(__read_only image2d_t input, __read_only image2d_t bias, __write_only image2d_t outputImage) { - int x = get_global_id(0); - int y = get_global_id(1); - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - int2 coords; - coords.x = x; - coords.y = y; - float4 in = read_imagef(input, sampler, coords); - float4 biase = read_imagef(bias, sampler, coords); - float4 output = in + biase; - write_imagef(outputImage,coords,output); - } diff --git a/lite/backends/opencl/cl_kernel/image/pool_kernel.cl b/lite/backends/opencl/cl_kernel/image/pool_kernel.cl deleted file mode 100644 index 0ca3b9141d..0000000000 --- a/lite/backends/opencl/cl_kernel/image/pool_kernel.cl +++ /dev/null @@ -1,90 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
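Editor's note: the coordinate math behind `channel_add` above assumes the input image stores an NCHW tensor as `(ceil(C/4) * W)` by `(N * H)` texels, so a work-item's channel block is `x / w` and the per-channel bias image is a single row indexed by that block (`coords_bias = (x / w, 0)`). A standalone check:

```cpp
#include <cstdio>

int main() {
  const int w = 224;           // tensor width, the kernel's `w` argument
  const int x = 1350, y = 17;  // example work-item coordinates
  std::printf("channel block=%d, w within block=%d, bias texel=(%d, 0), row=%d\n",
              x / w, x % w, x / w, y);
  return 0;
}
```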
*/ - -#define MIN_VALUE -FLT_MAX - -__kernel void pool_max( - __private const int in_height, __private const int in_width, - __private const int out_height, __private const int out_width, - __private const int pad_top, __private const int pad_left, - __private const int stride_h, __private const int stride_w, - __private const int ksize_h, __private const int ksize_w, - __read_only image2d_t input, __write_only image2d_t output) { - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - const int out_n = out_nh / out_height; - const int out_h = out_nh % out_height; - - const sampler_t sampler = - CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - - int start_h = out_h * stride_h - pad_top; - int end_h = min(start_h + ksize_h, in_height); - start_h = max(start_h,0); - - int start_w = out_w * stride_w - pad_left; - int end_w = min(start_w + ksize_w, in_width); - start_w = max(start_w,0); - - const int pos_in_x = out_c * in_width; - const int pos_in_y = out_n * in_height; - float4 max_value = (float4)(MIN_VALUE); - for (int y = start_h; y < end_h; ++y) { - for (int x = start_w; x < end_w; ++x) { - float4 tmp = read_imagef(input, sampler, (int2)(pos_in_x + x, pos_in_y + y)); - max_value = max(max_value, tmp); - } - } - - const int pos_out_x = mad24(out_c, out_width, out_w); - write_imagef(output, (int2)(pos_out_x, out_nh), max_value); -} - -__kernel void pool_avg( - __private const int in_height, __private const int in_width, - __private const int out_height, __private const int out_width, - __private const int pad_top, __private const int pad_left, - __private const int stride_h, __private const int stride_w, - __private const int ksize_h, __private const int ksize_w, - __read_only image2d_t input, __write_only image2d_t output) { - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - const int out_n = out_nh / out_height; - const int out_h = out_nh % out_height; - - const sampler_t sampler = - CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - - int start_h = max(out_h * stride_h - pad_top, 0); - int end_h = min(start_h + ksize_h, in_height); - - int start_w = max(out_w * stride_w - pad_left, 0); - int end_w = min(start_w + ksize_w, in_width); - - const int pos_in_x = out_c * in_width; - const int pos_in_y = out_n * in_height; - float4 sum = (float4)(0.0f); - int num = 0; - for (int y = start_h; y < end_h; ++y) { - for (int x = start_w; x < end_w; ++x) { - sum += read_imagef(input, sampler, (int2)(pos_in_x + x, pos_in_y + y)); - num++; - } - } - float4 avg = sum / num; - const int pos_out_x = mad24(out_c, out_width, out_w); - write_imagef(output, (int2)(pos_out_x, out_nh), avg); -} diff --git a/lite/backends/opencl/cl_runtime.cc b/lite/backends/opencl/cl_runtime.cc deleted file mode 100644 index c2504ab611..0000000000 --- a/lite/backends/opencl/cl_runtime.cc +++ /dev/null @@ -1,170 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
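Editor's note: the window arithmetic of the image-based `pool_avg` above, checked standalone. Its divisor is the number of elements actually inside the input, so padding is excluded from the average:

```cpp
#include <algorithm>
#include <cstdio>

int main() {
  const int in_h = 7, ksize = 3, stride = 2, pad = 1;
  for (int out_h = 0; out_h < 4; ++out_h) {
    const int start = std::max(out_h * stride - pad, 0);  // clamped first
    const int end = std::min(start + ksize, in_h);        // then extended
    std::printf("out_h=%d window=[%d,%d) count=%d\n",
                out_h, start, end, end - start);
  }
  return 0;
}
```

Note the buffer-path `pool_avg` earlier in this patch computes `pool_size` from the padded window before clamping, so the two variants divide by different counts at the borders; the image variant also extends the window from the already-clamped start. Anyone unifying the two paths should pick one convention deliberately.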
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "lite/backends/opencl/cl_runtime.h"
-#include <fstream>
-#include <string>
-#include <utility>
-#include "lite/utils/cp_logging.h"
-
-namespace paddle {
-namespace lite {
-
-CLRuntime* CLRuntime::Global() {
-  static CLRuntime cl_runtime_;
-  cl_runtime_.Init();
-  return &cl_runtime_;
-}
-
-CLRuntime::~CLRuntime() {
-  if (command_queue_ != nullptr) {
-    command_queue_->finish();
-  }
-  // For controlling the destruction order:
-  command_queue_.reset();
-  context_.reset();
-  device_.reset();
-  platform_.reset();
-}
-
-bool CLRuntime::Init() {
-  if (initialized_) {
-    return true;
-  }
-  bool is_platform_init = InitializePlatform();
-  bool is_device_init = InitializeDevice();
-  is_init_success_ = is_platform_init && is_device_init;
-  initialized_ = true;
-  return initialized_;
-}
-
-cl::Platform& CLRuntime::platform() {
-  CHECK(platform_ != nullptr) << "platform_ is not initialized!";
-  return *platform_;
-}
-
-cl::Context& CLRuntime::context() {
-  if (context_ == nullptr) {
-    context_ = CreateContext();
-  }
-  return *context_;
-}
-
-cl::Device& CLRuntime::device() {
-  CHECK(device_ != nullptr) << "device_ is not initialized!";
-  return *device_;
-}
-
-cl::CommandQueue& CLRuntime::command_queue() {
-  if (command_queue_ == nullptr) {
-    command_queue_ = CreateCommandQueue(context());
-  }
-  return *command_queue_;
-}
-
-std::unique_ptr<cl::Program> CLRuntime::CreateProgram(
-    const cl::Context& context, std::string file_name) {
-  std::ifstream file{file_name, std::ios::binary | std::ios::ate};
-  CHECK(file.is_open()) << "Can't open file from " << file_name;
-  auto size = file.tellg();
-  CHECK(size > 0) << "size is too small.";
-  std::string content(size, '\0');
-  file.seekg(0);
-  file.read(&content[0], size);
-  cl::Program::Sources sources;
-  sources.push_back(content);
-  auto prog =
-      std::unique_ptr<cl::Program>(new cl::Program(context, sources, &status_));
-  VLOG(4) << "OpenCL kernel file name: " << file_name;
-  VLOG(4) << "Program source size: " << content.size();
-  CL_CHECK_FATAL(status_);
-  return std::move(prog);
-}
-
-std::unique_ptr<cl::UserEvent> CLRuntime::CreateEvent(
-    const cl::Context& context) {
-  auto event =
-      std::unique_ptr<cl::UserEvent>(new cl::UserEvent(context, &status_));
-  CL_CHECK_FATAL(status_);
-  return std::move(event);
-}
-
-bool CLRuntime::BuildProgram(cl::Program* program, const std::string& options) {
-  std::string build_option = options + " -cl-fast-relaxed-math -I " +
-                             CLRuntime::Global()->cl_path() + "/cl_kernel";
-  status_ = program->build({*device_}, build_option.c_str());
-  CL_CHECK_ERROR(status_);
-
-  if (status_ != CL_SUCCESS) {
-    if (program->getBuildInfo<CL_PROGRAM_BUILD_STATUS>(device()) ==
-        CL_BUILD_ERROR) {
-      std::string log = program->getBuildInfo<CL_PROGRAM_BUILD_LOG>(device());
-      LOG(FATAL) << "Program build error: " << log;
-    }
-    return false;
-  }
-
-  return true;
-}
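The runtime deleted above is a lazy singleton: Global() constructs it once and runs Init(), while context() and command_queue() are created on first use. For reference, a minimal sketch of how kernel code drove this API; the cl_path value and the -DCL_DTYPE_float build define are placeholders, while the CLRuntime calls and the elementwise_add kernel name come from the files in this patch:

// Illustrative sketch only, not part of the original sources.
#include "lite/backends/opencl/cl_runtime.h"

void BuildExampleKernel() {
  auto* runtime = paddle::lite::CLRuntime::Global();  // lazily runs Init()
  runtime->set_cl_path("/data/local/tmp/opencl");     // placeholder path
  auto program = runtime->CreateProgram(
      runtime->context(),
      runtime->cl_path() + "/cl_kernel/image/elementwise_add_kernel.cl");
  // BuildProgram appends "-cl-fast-relaxed-math -I <cl_path>/cl_kernel".
  if (runtime->BuildProgram(program.get(), "-DCL_DTYPE_float")) {
    cl::Kernel kernel(*program, "elementwise_add");   // kernel shown earlier
    (void)kernel;
  }
}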
device: " << device_name; - auto image_support = device_->getInfo(); - if (image_support) { - LOG(INFO) << "The chosen device supports image processing."; - } else { - LOG(INFO) << "The chosen device doesn't support image processing!"; - return false; - } - auto ext_data = device_->getInfo(); - VLOG(4) << "The extensions supported by this device: " << ext_data; - if (ext_data.find("cl_khr_fp16") != std::string::npos) { - LOG(INFO) << "The chosen device supports the half data type."; - } else { - LOG(INFO) << "The chosen device doesn't support the half data type!"; - } - auto max_units = device_->getInfo(); - LOG(INFO) << "The chosen device has " << max_units << " compute units."; - auto local_mem = device_->getInfo(); - LOG(INFO) << "The local memory size of the chosen device is " - << static_cast(local_mem) / 1024 << " KB."; - return true; -} - -} // namespace lite -} // namespace paddle diff --git a/lite/backends/opencl/cl_runtime.h b/lite/backends/opencl/cl_runtime.h deleted file mode 100644 index 0859780c69..0000000000 --- a/lite/backends/opencl/cl_runtime.h +++ /dev/null @@ -1,101 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include -#include "lite/backends/opencl/cl_include.h" -#include "lite/backends/opencl/cl_utility.h" - -namespace paddle { -namespace lite { - -class CLRuntime { - public: - static CLRuntime* Global(); - - bool Init(); - - cl::Platform& platform(); - - cl::Context& context(); - - cl::Device& device(); - - cl::CommandQueue& command_queue(); - - std::unique_ptr CreateProgram(const cl::Context& context, - std::string file_name); - - std::unique_ptr CreateEvent(const cl::Context& context); - - bool BuildProgram(cl::Program* program, const std::string& options = ""); - - bool IsInitSuccess() { return is_init_success_; } - - std::string cl_path() { return cl_path_; } - - void set_cl_path(std::string cl_path) { cl_path_ = cl_path; } - - private: - CLRuntime() = default; - - ~CLRuntime(); - - bool InitializePlatform(); - - bool InitializeDevice(); - - std::shared_ptr CreateContext() { - auto context = std::make_shared( - std::vector{device()}, nullptr, nullptr, nullptr, &status_); - CL_CHECK_FATAL(status_); - return context; - } - - std::shared_ptr CreateCommandQueue( - const cl::Context& context) { - cl_command_queue_properties properties = 0; - -#ifdef LITE_WITH_PROFILE - properties |= CL_QUEUE_PROFILING_ENABLE; -#endif // LITE_WITH_PROFILE - auto queue = std::make_shared( - context, device(), properties, &status_); - CL_CHECK_FATAL(status_); - return queue; - } - - std::string cl_path_; - - std::shared_ptr platform_{nullptr}; - - std::shared_ptr context_{nullptr}; - - std::shared_ptr device_{nullptr}; - - std::shared_ptr command_queue_{nullptr}; - - cl_int status_{CL_SUCCESS}; - - bool initialized_{false}; - - bool is_init_success_{false}; -}; - -} // namespace lite -} // namespace paddle diff --git a/lite/backends/opencl/cl_utility.cc 
b/lite/backends/opencl/cl_utility.cc deleted file mode 100644 index 7c8cca414f..0000000000 --- a/lite/backends/opencl/cl_utility.cc +++ /dev/null @@ -1,84 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "lite/backends/opencl/cl_utility.h" - -namespace paddle { -namespace lite { - -const char *opencl_error_to_str(cl_int error) { -#define CASE_CL_CONSTANT(NAME) \ - case NAME: \ - return #NAME; - // Suppose that no combinations are possible. - switch (error) { - CASE_CL_CONSTANT(CL_SUCCESS) - CASE_CL_CONSTANT(CL_DEVICE_NOT_FOUND) - CASE_CL_CONSTANT(CL_DEVICE_NOT_AVAILABLE) - CASE_CL_CONSTANT(CL_COMPILER_NOT_AVAILABLE) - CASE_CL_CONSTANT(CL_MEM_OBJECT_ALLOCATION_FAILURE) - CASE_CL_CONSTANT(CL_OUT_OF_RESOURCES) - CASE_CL_CONSTANT(CL_OUT_OF_HOST_MEMORY) - CASE_CL_CONSTANT(CL_PROFILING_INFO_NOT_AVAILABLE) - CASE_CL_CONSTANT(CL_MEM_COPY_OVERLAP) - CASE_CL_CONSTANT(CL_IMAGE_FORMAT_MISMATCH) - CASE_CL_CONSTANT(CL_IMAGE_FORMAT_NOT_SUPPORTED) - CASE_CL_CONSTANT(CL_BUILD_PROGRAM_FAILURE) - CASE_CL_CONSTANT(CL_MAP_FAILURE) - CASE_CL_CONSTANT(CL_MISALIGNED_SUB_BUFFER_OFFSET) - CASE_CL_CONSTANT(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST) - CASE_CL_CONSTANT(CL_INVALID_VALUE) - CASE_CL_CONSTANT(CL_INVALID_DEVICE_TYPE) - CASE_CL_CONSTANT(CL_INVALID_PLATFORM) - CASE_CL_CONSTANT(CL_INVALID_DEVICE) - CASE_CL_CONSTANT(CL_INVALID_CONTEXT) - CASE_CL_CONSTANT(CL_INVALID_QUEUE_PROPERTIES) - CASE_CL_CONSTANT(CL_INVALID_COMMAND_QUEUE) - CASE_CL_CONSTANT(CL_INVALID_HOST_PTR) - CASE_CL_CONSTANT(CL_INVALID_MEM_OBJECT) - CASE_CL_CONSTANT(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR) - CASE_CL_CONSTANT(CL_INVALID_IMAGE_SIZE) - CASE_CL_CONSTANT(CL_INVALID_SAMPLER) - CASE_CL_CONSTANT(CL_INVALID_BINARY) - CASE_CL_CONSTANT(CL_INVALID_BUILD_OPTIONS) - CASE_CL_CONSTANT(CL_INVALID_PROGRAM) - CASE_CL_CONSTANT(CL_INVALID_PROGRAM_EXECUTABLE) - CASE_CL_CONSTANT(CL_INVALID_KERNEL_NAME) - CASE_CL_CONSTANT(CL_INVALID_KERNEL_DEFINITION) - CASE_CL_CONSTANT(CL_INVALID_KERNEL) - CASE_CL_CONSTANT(CL_INVALID_ARG_INDEX) - CASE_CL_CONSTANT(CL_INVALID_ARG_VALUE) - CASE_CL_CONSTANT(CL_INVALID_ARG_SIZE) - CASE_CL_CONSTANT(CL_INVALID_KERNEL_ARGS) - CASE_CL_CONSTANT(CL_INVALID_WORK_DIMENSION) - CASE_CL_CONSTANT(CL_INVALID_WORK_GROUP_SIZE) - CASE_CL_CONSTANT(CL_INVALID_WORK_ITEM_SIZE) - CASE_CL_CONSTANT(CL_INVALID_GLOBAL_OFFSET) - CASE_CL_CONSTANT(CL_INVALID_EVENT_WAIT_LIST) - CASE_CL_CONSTANT(CL_INVALID_EVENT) - CASE_CL_CONSTANT(CL_INVALID_OPERATION) - CASE_CL_CONSTANT(CL_INVALID_GL_OBJECT) - CASE_CL_CONSTANT(CL_INVALID_BUFFER_SIZE) - CASE_CL_CONSTANT(CL_INVALID_MIP_LEVEL) - CASE_CL_CONSTANT(CL_INVALID_GLOBAL_WORK_SIZE) - CASE_CL_CONSTANT(CL_INVALID_PROPERTY) - - default: - return "UNKNOWN ERROR CODE"; - } -#undef CASE_CL_CONSTANT -} - -} // namespace lite -} // namespace paddle diff --git a/lite/backends/opencl/cl_utility.h b/lite/backends/opencl/cl_utility.h deleted file mode 100644 index b7f14c15e6..0000000000 --- a/lite/backends/opencl/cl_utility.h +++ /dev/null @@ -1,46 +0,0 @@ -/* 
Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "lite/backends/opencl/cl_include.h" -#include "lite/utils/cp_logging.h" -#include "lite/utils/string.h" - -namespace paddle { -namespace lite { - -const char* opencl_error_to_str(cl_int error); - -#define CL_CHECK_ERROR(err_code__) \ - if (err_code__ != CL_SUCCESS) { \ - LOG(ERROR) << string_format( \ - "OpenCL error with code %s happened in file %s at line %d. " \ - "Exiting.\n", \ - opencl_error_to_str(err_code__), \ - __FILE__, \ - __LINE__); \ - } - -#define CL_CHECK_FATAL(err_code__) \ - if (err_code__ != CL_SUCCESS) { \ - LOG(FATAL) << string_format( \ - "OpenCL error with code %s happened in file %s at line %d. " \ - "Exiting.\n", \ - opencl_error_to_str(err_code__), \ - __FILE__, \ - __LINE__); \ - } -} // namespace lite -} // namespace paddle diff --git a/lite/backends/opencl/cl_wrapper.cc b/lite/backends/opencl/cl_wrapper.cc deleted file mode 100644 index 357ac8c2d6..0000000000 --- a/lite/backends/opencl/cl_wrapper.cc +++ /dev/null @@ -1,732 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/
-
-#include "lite/backends/opencl/cl_wrapper.h"
-#include <dlfcn.h>
-#include <string>
-#include <vector>
-
-namespace paddle {
-namespace lite {
-CLWrapper *CLWrapper::Global() {
-  static CLWrapper wrapper;
-  return &wrapper;
-}
-
-CLWrapper::CLWrapper() {
-  CHECK(InitHandle()) << "Fail to initialize the OpenCL library!";
-  InitFunctions();
-}
-
-bool CLWrapper::InitHandle() {
-  const std::vector<std::string> paths = {
-      "libOpenCL.so",
-#if defined(__aarch64__)
-      // Qualcomm Adreno with Android
-      "/system/vendor/lib64/libOpenCL.so",
-      "/system/lib64/libOpenCL.so",
-      // Arm Mali with Android
-      "/system/vendor/lib64/egl/libGLES_mali.so",
-      "/system/lib64/egl/libGLES_mali.so",
-      // Arm Linux
-      "/usr/lib/aarch64-linux-gnu/libOpenCL.so",
-#else
-      // Qualcomm Adreno with Android
-      "/system/vendor/lib/libOpenCL.so",
-      "/system/lib/libOpenCL.so",
-      // Arm Mali with Android
-      "/system/vendor/lib/egl/libGLES_mali.so",
-      "/system/lib/egl/libGLES_mali.so",
-      // Arm Linux
-      "/usr/lib/arm-linux-gnueabihf/libOpenCL.so",
-#endif
-  };
-  std::string target_lib = "Unknown";
-  for (auto path : paths) {
-    handle_ = dlopen(path.c_str(), RTLD_LAZY);
-    if (handle_ != nullptr) {
-      target_lib = path;
-      break;
-    }
-  }
-  VLOG(4) << "Load the OpenCL library from " << target_lib;
-  if (handle_ != nullptr) {
-    return true;
-  } else {
-    return false;
-  }
-}
-
-void CLWrapper::InitFunctions() {
-  CHECK(handle_ != nullptr) << "The library handle can't be null!";
-
-#define PADDLE_DLSYM(cl_func)                                        \
-  do {                                                               \
-    cl_func##_ = (cl_func##Type)dlsym(handle_, #cl_func);            \
-    if (cl_func##_ == nullptr) {                                     \
-      LOG(ERROR) << "Cannot find the " << #cl_func                   \
-                 << " symbol in libOpenCL.so!";                      \
-      break;                                                         \
-    }                                                                \
-    VLOG(4) << "Loaded the " << #cl_func << " symbol successfully."; \
-  } while (false)
-
-  PADDLE_DLSYM(clGetPlatformIDs);
-  PADDLE_DLSYM(clGetPlatformInfo);
-  PADDLE_DLSYM(clBuildProgram);
-  PADDLE_DLSYM(clEnqueueNDRangeKernel);
-  PADDLE_DLSYM(clSetKernelArg);
-  PADDLE_DLSYM(clRetainMemObject);
-  PADDLE_DLSYM(clReleaseMemObject);
-  PADDLE_DLSYM(clEnqueueUnmapMemObject);
-  PADDLE_DLSYM(clRetainCommandQueue);
-  PADDLE_DLSYM(clCreateContext);
-  PADDLE_DLSYM(clCreateContextFromType);
-  PADDLE_DLSYM(clReleaseContext);
-  PADDLE_DLSYM(clWaitForEvents);
-  PADDLE_DLSYM(clReleaseEvent);
-  PADDLE_DLSYM(clEnqueueWriteBuffer);
-  PADDLE_DLSYM(clEnqueueReadBuffer);
-  PADDLE_DLSYM(clEnqueueReadImage);
-  PADDLE_DLSYM(clGetProgramBuildInfo);
-  PADDLE_DLSYM(clRetainProgram);
-  PADDLE_DLSYM(clEnqueueMapBuffer);
-  PADDLE_DLSYM(clEnqueueMapImage);
-  PADDLE_DLSYM(clCreateCommandQueue);
-  PADDLE_DLSYM(clCreateCommandQueueWithProperties);
-  PADDLE_DLSYM(clReleaseCommandQueue);
-  PADDLE_DLSYM(clCreateProgramWithBinary);
-  PADDLE_DLSYM(clRetainContext);
-  PADDLE_DLSYM(clGetContextInfo);
-  PADDLE_DLSYM(clReleaseProgram);
-  PADDLE_DLSYM(clFlush);
-  PADDLE_DLSYM(clFinish);
-  PADDLE_DLSYM(clGetProgramInfo);
-  PADDLE_DLSYM(clCreateKernel);
-  PADDLE_DLSYM(clRetainKernel);
-  PADDLE_DLSYM(clCreateBuffer);
-  PADDLE_DLSYM(clCreateImage2D);
-  PADDLE_DLSYM(clCreateImage);
-  PADDLE_DLSYM(clCreateUserEvent);
-  PADDLE_DLSYM(clCreateProgramWithSource);
-  PADDLE_DLSYM(clReleaseKernel);
-  PADDLE_DLSYM(clGetDeviceInfo);
-  PADDLE_DLSYM(clGetDeviceIDs);
-  PADDLE_DLSYM(clRetainDevice);
-  PADDLE_DLSYM(clReleaseDevice);
-  PADDLE_DLSYM(clRetainEvent);
-  PADDLE_DLSYM(clGetKernelWorkGroupInfo);
-  PADDLE_DLSYM(clGetEventInfo);
-  PADDLE_DLSYM(clGetEventProfilingInfo);
-  PADDLE_DLSYM(clGetImageInfo);
-  PADDLE_DLSYM(clEnqueueCopyBuffer);
-  PADDLE_DLSYM(clEnqueueWriteImage);
-  PADDLE_DLSYM(clEnqueueCopyImage);
-
-#undef
PADDLE_DLSYM -} - -} // namespace lite -} // namespace paddle - -CL_API_ENTRY cl_int CL_API_CALL clGetPlatformIDs(cl_uint num_entries, - cl_platform_id *platforms, - cl_uint *num_platforms) - CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clGetPlatformIDs()( - num_entries, platforms, num_platforms); -} - -CL_API_ENTRY cl_int CL_API_CALL clGetPlatformInfo(cl_platform_id platform, - cl_platform_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) - CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clGetPlatformInfo()( - platform, - param_name, - param_value_size, - param_value, - param_value_size_ret); -} - -CL_API_ENTRY cl_int CL_API_CALL clBuildProgram( - cl_program program, - cl_uint num_devices, - const cl_device_id *device_list, - const char *options, - void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data), - void *user_data) CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clBuildProgram()( - program, num_devices, device_list, options, pfn_notify, user_data); -} - -CL_API_ENTRY cl_int CL_API_CALL -clEnqueueNDRangeKernel(cl_command_queue command_queue, - cl_kernel kernel, - cl_uint work_dim, - const size_t *global_work_offset, - const size_t *global_work_size, - const size_t *local_work_size, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clEnqueueNDRangeKernel()( - command_queue, - kernel, - work_dim, - global_work_offset, - global_work_size, - local_work_size, - num_events_in_wait_list, - event_wait_list, - event); -} - -CL_API_ENTRY cl_int CL_API_CALL clSetKernelArg(cl_kernel kernel, - cl_uint arg_index, - size_t arg_size, - const void *arg_value) - CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clSetKernelArg()( - kernel, arg_index, arg_size, arg_value); -} - -CL_API_ENTRY cl_int CL_API_CALL clRetainMemObject(cl_mem memobj) - CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clRetainMemObject()(memobj); -} - -CL_API_ENTRY cl_int CL_API_CALL clReleaseMemObject(cl_mem memobj) - CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clReleaseMemObject()(memobj); -} - -CL_API_ENTRY cl_int CL_API_CALL -clEnqueueUnmapMemObject(cl_command_queue command_queue, - cl_mem memobj, - void *mapped_ptr, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clEnqueueUnmapMemObject()( - command_queue, - memobj, - mapped_ptr, - num_events_in_wait_list, - event_wait_list, - event); -} - -CL_API_ENTRY cl_int CL_API_CALL clRetainCommandQueue( - cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clRetainCommandQueue()( - command_queue); -} - -CL_API_ENTRY cl_context CL_API_CALL -clCreateContext(const cl_context_properties *properties, - cl_uint num_devices, - const cl_device_id *devices, - void(CL_CALLBACK *pfn_notify)(const char *errinfo, - const void *private_info, - size_t cb, - void *user_data), - void *user_data, - cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clCreateContext()( - properties, num_devices, devices, pfn_notify, user_data, errcode_ret); -} - -CL_API_ENTRY cl_context CL_API_CALL -clCreateContextFromType(const cl_context_properties *properties, - cl_device_type device_type, - 
void(CL_CALLBACK *pfn_notify)(const char *errinfo, - const void *private_info, - size_t cb, - void *user_data), - void *user_data, - cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clCreateContextFromType()( - properties, device_type, pfn_notify, user_data, errcode_ret); -} - -CL_API_ENTRY cl_int CL_API_CALL clReleaseContext(cl_context context) - CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clReleaseContext()(context); -} - -CL_API_ENTRY cl_int CL_API_CALL clWaitForEvents( - cl_uint num_events, const cl_event *event_list) CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clWaitForEvents()(num_events, - event_list); -} - -CL_API_ENTRY cl_int CL_API_CALL clReleaseEvent(cl_event event) - CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clReleaseEvent()(event); -} - -CL_API_ENTRY cl_int CL_API_CALL -clEnqueueWriteBuffer(cl_command_queue command_queue, - cl_mem buffer, - cl_bool blocking_write, - size_t offset, - size_t size, - const void *ptr, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clEnqueueWriteBuffer()( - command_queue, - buffer, - blocking_write, - offset, - size, - ptr, - num_events_in_wait_list, - event_wait_list, - event); -} - -CL_API_ENTRY cl_int CL_API_CALL -clEnqueueReadBuffer(cl_command_queue command_queue, - cl_mem buffer, - cl_bool blocking_read, - size_t offset, - size_t size, - void *ptr, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clEnqueueReadBuffer()( - command_queue, - buffer, - blocking_read, - offset, - size, - ptr, - num_events_in_wait_list, - event_wait_list, - event); -} - -CL_API_ENTRY cl_int CL_API_CALL -clEnqueueReadImage(cl_command_queue command_queue, - cl_mem image, - cl_bool blocking_read, - const size_t *origin, - const size_t *region, - size_t row_pitch, - size_t slice_pitch, - void *ptr, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clEnqueueReadImage()( - command_queue, - image, - blocking_read, - origin, - region, - row_pitch, - slice_pitch, - ptr, - num_events_in_wait_list, - event_wait_list, - event); -} - -CL_API_ENTRY cl_int CL_API_CALL -clGetProgramBuildInfo(cl_program program, - cl_device_id device, - cl_program_build_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clGetProgramBuildInfo()( - program, - device, - param_name, - param_value_size, - param_value, - param_value_size_ret); -} - -CL_API_ENTRY cl_int CL_API_CALL clRetainProgram(cl_program program) - CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clRetainProgram()(program); -} - -CL_API_ENTRY void *CL_API_CALL -clEnqueueMapBuffer(cl_command_queue command_queue, - cl_mem buffer, - cl_bool blocking_map, - cl_map_flags map_flags, - size_t offset, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event, - cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clEnqueueMapBuffer()( - command_queue, - buffer, - blocking_map, - map_flags, - offset, - size, - num_events_in_wait_list, - event_wait_list, - 
event, - errcode_ret); -} - -CL_API_ENTRY void *CL_API_CALL -clEnqueueMapImage(cl_command_queue command_queue, - cl_mem image, - cl_bool blocking_map, - cl_map_flags map_flags, - const size_t *origin, - const size_t *region, - size_t *image_row_pitch, - size_t *image_slice_pitch, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event, - cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clEnqueueMapImage()( - command_queue, - image, - blocking_map, - map_flags, - origin, - region, - image_row_pitch, - image_slice_pitch, - num_events_in_wait_list, - event_wait_list, - event, - errcode_ret); -} - -CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_command_queue CL_API_CALL -clCreateCommandQueue(cl_context context, - cl_device_id device, - cl_command_queue_properties properties, - cl_int *errcode_ret) - CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED { - return paddle::lite::CLWrapper::Global()->clCreateCommandQueue()( - context, device, properties, errcode_ret); -} - -CL_API_ENTRY cl_command_queue CL_API_CALL clCreateCommandQueueWithProperties( - cl_context context, - cl_device_id device, - const cl_queue_properties *properties, - cl_int *errcode_ret) CL_API_SUFFIX__VERSION_2_0 { - return paddle::lite::CLWrapper::Global() - ->clCreateCommandQueueWithProperties()( - context, device, properties, errcode_ret); -} - -CL_API_ENTRY cl_int CL_API_CALL clReleaseCommandQueue( - cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clReleaseCommandQueue()( - command_queue); -} - -CL_API_ENTRY cl_program CL_API_CALL -clCreateProgramWithBinary(cl_context context, - cl_uint num_devices, - const cl_device_id *device_list, - const size_t *lengths, - const unsigned char **binaries, - cl_int *binary_status, - cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clCreateProgramWithBinary()( - context, - num_devices, - device_list, - lengths, - binaries, - binary_status, - errcode_ret); -} - -CL_API_ENTRY cl_int CL_API_CALL clRetainContext(cl_context context) - CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clRetainContext()(context); -} - -CL_API_ENTRY cl_int CL_API_CALL clGetContextInfo(cl_context context, - cl_context_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) - CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clGetContextInfo()( - context, param_name, param_value_size, param_value, param_value_size_ret); -} - -CL_API_ENTRY cl_int CL_API_CALL clReleaseProgram(cl_program program) - CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clReleaseProgram()(program); -} - -CL_API_ENTRY cl_int CL_API_CALL clFlush(cl_command_queue command_queue) - CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clFlush()(command_queue); -} - -CL_API_ENTRY cl_int CL_API_CALL clFinish(cl_command_queue command_queue) - CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clFinish()(command_queue); -} - -CL_API_ENTRY cl_int CL_API_CALL clGetProgramInfo(cl_program program, - cl_program_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) - CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clGetProgramInfo()( - program, param_name, param_value_size, param_value, param_value_size_ret); -} - -CL_API_ENTRY cl_kernel CL_API_CALL 
clCreateKernel(cl_program program, - const char *kernel_name, - cl_int *errcode_ret) - CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clCreateKernel()( - program, kernel_name, errcode_ret); -} - -CL_API_ENTRY cl_int CL_API_CALL clRetainKernel(cl_kernel kernel) - CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clRetainKernel()(kernel); -} - -CL_API_ENTRY cl_mem CL_API_CALL clCreateBuffer(cl_context context, - cl_mem_flags flags, - size_t size, - void *host_ptr, - cl_int *errcode_ret) - CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clCreateBuffer()( - context, flags, size, host_ptr, errcode_ret); -} - -CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL -clCreateImage2D(cl_context context, - cl_mem_flags flags, - const cl_image_format *image_format, - size_t image_width, - size_t image_height, - size_t image_row_pitch, - void *host_ptr, - cl_int *errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED { - return paddle::lite::CLWrapper::Global()->clCreateImage2D()(context, - flags, - image_format, - image_width, - image_height, - image_row_pitch, - host_ptr, - errcode_ret); -} - -CL_API_ENTRY cl_mem CL_API_CALL -clCreateImage(cl_context context, - cl_mem_flags flags, - const cl_image_format *image_format, - const cl_image_desc *image_desc, - void *host_ptr, - cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2 { - return paddle::lite::CLWrapper::Global()->clCreateImage()( - context, flags, image_format, image_desc, host_ptr, errcode_ret); -} - -CL_API_ENTRY cl_event CL_API_CALL clCreateUserEvent( - cl_context context, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_1 { - return paddle::lite::CLWrapper::Global()->clCreateUserEvent()(context, - errcode_ret); -} - -CL_API_ENTRY cl_program CL_API_CALL -clCreateProgramWithSource(cl_context context, - cl_uint count, - const char **strings, - const size_t *lengths, - cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clCreateProgramWithSource()( - context, count, strings, lengths, errcode_ret); -} - -CL_API_ENTRY cl_int CL_API_CALL clReleaseKernel(cl_kernel kernel) - CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clReleaseKernel()(kernel); -} - -CL_API_ENTRY cl_int CL_API_CALL clGetDeviceInfo(cl_device_id device, - cl_device_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) - CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clGetDeviceInfo()( - device, param_name, param_value_size, param_value, param_value_size_ret); -} - -CL_API_ENTRY cl_int CL_API_CALL clGetDeviceIDs(cl_platform_id platform, - cl_device_type device_type, - cl_uint num_entries, - cl_device_id *devices, - cl_uint *num_devices) - CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clGetDeviceIDs()( - platform, device_type, num_entries, devices, num_devices); -} - -CL_API_ENTRY cl_int CL_API_CALL clRetainDevice(cl_device_id device) - CL_API_SUFFIX__VERSION_1_2 { - return paddle::lite::CLWrapper::Global()->clRetainDevice()(device); -} - -CL_API_ENTRY cl_int CL_API_CALL clReleaseDevice(cl_device_id device) - CL_API_SUFFIX__VERSION_1_2 { - return paddle::lite::CLWrapper::Global()->clReleaseDevice()(device); -} - -CL_API_ENTRY cl_int CL_API_CALL clRetainEvent(cl_event event) - CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clRetainEvent()(event); -} - -CL_API_ENTRY cl_int CL_API_CALL clGetKernelWorkGroupInfo( - cl_kernel 
kernel, - cl_device_id device, - cl_kernel_work_group_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clGetKernelWorkGroupInfo()( - kernel, - device, - param_name, - param_value_size, - param_value, - param_value_size_ret); -} - -CL_API_ENTRY cl_int CL_API_CALL clGetEventInfo(cl_event event, - cl_event_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) - CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clGetEventInfo()( - event, param_name, param_value_size, param_value, param_value_size_ret); -} - -CL_API_ENTRY cl_int CL_API_CALL clGetEventProfilingInfo( - cl_event event, - cl_profiling_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clGetEventProfilingInfo()( - event, param_name, param_value_size, param_value, param_value_size_ret); -} - -CL_API_ENTRY cl_int CL_API_CALL clGetImageInfo(cl_mem image, - cl_image_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) - CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clGetImageInfo()( - image, param_name, param_value_size, param_value, param_value_size_ret); -} - -CL_API_ENTRY cl_int CL_API_CALL -clEnqueueCopyBuffer(cl_command_queue command_queue, - cl_mem src_buffer, - cl_mem dst_buffer, - size_t src_offset, - size_t dst_offset, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clEnqueueCopyBuffer()( - command_queue, - src_buffer, - dst_buffer, - src_offset, - dst_offset, - size, - num_events_in_wait_list, - event_wait_list, - event); -} - -CL_API_ENTRY cl_int CL_API_CALL -clEnqueueWriteImage(cl_command_queue command_queue, - cl_mem image, - cl_bool blocking_write, - const size_t *origin, - const size_t *region, - size_t input_row_pitch, - size_t input_slice_pitch, - const void *ptr, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clEnqueueWriteImage()( - command_queue, - image, - blocking_write, - origin, - region, - input_row_pitch, - input_slice_pitch, - ptr, - num_events_in_wait_list, - event_wait_list, - event); -} - -CL_API_ENTRY cl_int CL_API_CALL -clEnqueueCopyImage(cl_command_queue command_queue, - cl_mem src_image, - cl_mem dst_image, - const size_t *src_origin, - const size_t *dst_origin, - const size_t *region, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) CL_API_SUFFIX__VERSION_1_0 { - return paddle::lite::CLWrapper::Global()->clEnqueueCopyImage()( - command_queue, - src_image, - dst_image, - src_origin, - dst_origin, - region, - num_events_in_wait_list, - event_wait_list, - event); -} diff --git a/lite/backends/opencl/cl_wrapper.h b/lite/backends/opencl/cl_wrapper.h deleted file mode 100644 index 35ef33e5a2..0000000000 --- a/lite/backends/opencl/cl_wrapper.h +++ /dev/null @@ -1,572 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "lite/backends/opencl/cl_include.h" -#include "lite/utils/cp_logging.h" - -#if CL_HPP_TARGET_OPENCL_VERSION < 200 -#define CL_API_SUFFIX__VERSION_2_0 -#endif - -namespace paddle { -namespace lite { - -class CLWrapper final { - public: - static CLWrapper *Global(); - // Platform APIs - using clGetPlatformIDsType = cl_int (*)(cl_uint, cl_platform_id *, cl_uint *); - using clGetPlatformInfoType = - cl_int (*)(cl_platform_id, cl_platform_info, size_t, void *, size_t *); - using clBuildProgramType = cl_int (*)(cl_program, - cl_uint, - const cl_device_id *, - const char *, - void (*pfn_notify)(cl_program, void *), - void *); - using clEnqueueNDRangeKernelType = cl_int (*)(cl_command_queue, - cl_kernel, - cl_uint, - const size_t *, - const size_t *, - const size_t *, - cl_uint, - const cl_event *, - cl_event *); - using clSetKernelArgType = cl_int (*)(cl_kernel, - cl_uint, - size_t, - const void *); - using clRetainMemObjectType = cl_int (*)(cl_mem); - using clReleaseMemObjectType = cl_int (*)(cl_mem); - using clEnqueueUnmapMemObjectType = cl_int (*)( - cl_command_queue, cl_mem, void *, cl_uint, const cl_event *, cl_event *); - using clRetainCommandQueueType = cl_int (*)(cl_command_queue command_queue); - using clCreateContextType = cl_context (*)(const cl_context_properties *, - cl_uint, - const cl_device_id *, - void(CL_CALLBACK *)( // NOLINT - const char *, - const void *, - size_t, - void *), - void *, - cl_int *); - using clCreateContextFromTypeType = - cl_context (*)(const cl_context_properties *, - cl_device_type, - void(CL_CALLBACK *)( // NOLINT - const char *, - const void *, - size_t, - void *), - void *, - cl_int *); - using clReleaseContextType = cl_int (*)(cl_context); - using clWaitForEventsType = cl_int (*)(cl_uint, const cl_event *); - using clReleaseEventType = cl_int (*)(cl_event); - using clEnqueueWriteBufferType = cl_int (*)(cl_command_queue, - cl_mem, - cl_bool, - size_t, - size_t, - const void *, - cl_uint, - const cl_event *, - cl_event *); - using clEnqueueReadBufferType = cl_int (*)(cl_command_queue, - cl_mem, - cl_bool, - size_t, - size_t, - void *, - cl_uint, - const cl_event *, - cl_event *); - using clEnqueueReadImageType = cl_int (*)(cl_command_queue, - cl_mem, - cl_bool, - const size_t *, - const size_t *, - size_t, - size_t, - void *, - cl_uint, - const cl_event *, - cl_event *); - using clGetProgramBuildInfoType = cl_int (*)(cl_program, - cl_device_id, - cl_program_build_info, - size_t, - void *, - size_t *); - using clRetainProgramType = cl_int (*)(cl_program program); - using clEnqueueMapBufferType = void *(*)(cl_command_queue, - cl_mem, - cl_bool, - cl_map_flags, - size_t, - size_t, - cl_uint, - const cl_event *, - cl_event *, - cl_int *); - using clEnqueueMapImageType = void *(*)(cl_command_queue, - cl_mem, - cl_bool, - cl_map_flags, - const size_t *, - const size_t *, - size_t *, - size_t *, - cl_uint, - const cl_event *, - cl_event *, - cl_int *); - using clCreateCommandQueueType = cl_command_queue(CL_API_CALL *)( // NOLINT - cl_context, - cl_device_id, - cl_command_queue_properties, - cl_int *); - using 
clCreateCommandQueueWithPropertiesType = cl_command_queue (*)( - cl_context, cl_device_id, const cl_queue_properties *, cl_int *); - using clReleaseCommandQueueType = cl_int (*)(cl_command_queue); - using clCreateProgramWithBinaryType = cl_program (*)(cl_context, - cl_uint, - const cl_device_id *, - const size_t *, - const unsigned char **, - cl_int *, - cl_int *); - using clRetainContextType = cl_int (*)(cl_context context); - using clGetContextInfoType = - cl_int (*)(cl_context, cl_context_info, size_t, void *, size_t *); - using clReleaseProgramType = cl_int (*)(cl_program program); - using clFlushType = cl_int (*)(cl_command_queue command_queue); - using clFinishType = cl_int (*)(cl_command_queue command_queue); - using clGetProgramInfoType = - cl_int (*)(cl_program, cl_program_info, size_t, void *, size_t *); - using clCreateKernelType = cl_kernel (*)(cl_program, const char *, cl_int *); - using clRetainKernelType = cl_int (*)(cl_kernel kernel); - using clCreateBufferType = - cl_mem (*)(cl_context, cl_mem_flags, size_t, void *, cl_int *); - using clCreateImage2DType = cl_mem(CL_API_CALL *)(cl_context, // NOLINT - cl_mem_flags, - const cl_image_format *, - size_t, - size_t, - size_t, - void *, - cl_int *); - using clCreateImageType = cl_mem (*)(cl_context, - cl_mem_flags, - const cl_image_format *, - const cl_image_desc *, - void *, - cl_int *); - using clCreateUserEventType = cl_event (*)(cl_context, cl_int *); - using clCreateProgramWithSourceType = cl_program (*)( - cl_context, cl_uint, const char **, const size_t *, cl_int *); - using clReleaseKernelType = cl_int (*)(cl_kernel kernel); - using clGetDeviceInfoType = - cl_int (*)(cl_device_id, cl_device_info, size_t, void *, size_t *); - using clGetDeviceIDsType = cl_int (*)( - cl_platform_id, cl_device_type, cl_uint, cl_device_id *, cl_uint *); - using clRetainDeviceType = cl_int (*)(cl_device_id); - using clReleaseDeviceType = cl_int (*)(cl_device_id); - using clRetainEventType = cl_int (*)(cl_event); - using clGetKernelWorkGroupInfoType = cl_int (*)(cl_kernel, - cl_device_id, - cl_kernel_work_group_info, - size_t, - void *, - size_t *); - using clGetEventInfoType = cl_int (*)(cl_event event, - cl_event_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret); - using clGetEventProfilingInfoType = cl_int (*)(cl_event event, - cl_profiling_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret); - using clGetImageInfoType = - cl_int (*)(cl_mem, cl_image_info, size_t, void *, size_t *); - - using clEnqueueCopyBufferType = cl_int (*)(cl_command_queue, - cl_mem, - cl_mem, - size_t, - size_t, - size_t, - cl_uint, - const cl_event *, - cl_event *); - using clEnqueueWriteImageType = cl_int (*)(cl_command_queue, - cl_mem, - cl_bool, - const size_t *, - const size_t *, - size_t, - size_t, - const void *, - cl_uint, - const cl_event *, - cl_event *); - using clEnqueueCopyImageType = cl_int (*)(cl_command_queue, - cl_mem, - cl_mem, - const size_t *, - const size_t *, - const size_t *, - cl_uint, - const cl_event *, - cl_event *); - - clGetPlatformIDsType clGetPlatformIDs() { - CHECK(clGetPlatformIDs_ != nullptr) << "Cannot load clGetPlatformIDs!"; - return clGetPlatformIDs_; - } - - clGetPlatformInfoType clGetPlatformInfo() { - CHECK(clGetPlatformInfo_ != nullptr) << "Cannot load clGetPlatformInfo!"; - return clGetPlatformInfo_; - } - - clBuildProgramType clBuildProgram() { - CHECK(clBuildProgram_ != nullptr) << "Cannot load clBuildProgram!"; - return 
clBuildProgram_; - } - - clEnqueueNDRangeKernelType clEnqueueNDRangeKernel() { - CHECK(clEnqueueNDRangeKernel_ != nullptr) - << "Cannot load clEnqueueNDRangeKernel!"; - return clEnqueueNDRangeKernel_; - } - - clSetKernelArgType clSetKernelArg() { - CHECK(clSetKernelArg_ != nullptr) << "Cannot load clSetKernelArg!"; - return clSetKernelArg_; - } - - clRetainMemObjectType clRetainMemObject() { - CHECK(clRetainMemObject_ != nullptr) << "Cannot load clRetainMemObject!"; - return clRetainMemObject_; - } - - clReleaseMemObjectType clReleaseMemObject() { - CHECK(clReleaseMemObject_ != nullptr) << "Cannot load clReleaseMemObject!"; - return clReleaseMemObject_; - } - - clEnqueueUnmapMemObjectType clEnqueueUnmapMemObject() { - CHECK(clEnqueueUnmapMemObject_ != nullptr) - << "Cannot load clEnqueueUnmapMemObject!"; - return clEnqueueUnmapMemObject_; - } - - clRetainCommandQueueType clRetainCommandQueue() { - CHECK(clRetainCommandQueue_ != nullptr) - << "Cannot load clRetainCommandQueue!"; - return clRetainCommandQueue_; - } - - clCreateContextType clCreateContext() { - CHECK(clCreateContext_ != nullptr) << "Cannot load clCreateContext!"; - return clCreateContext_; - } - - clCreateContextFromTypeType clCreateContextFromType() { - CHECK(clCreateContextFromType_ != nullptr) - << "Cannot load clCreateContextFromType!"; - return clCreateContextFromType_; - } - - clReleaseContextType clReleaseContext() { - CHECK(clReleaseContext_ != nullptr) << "Cannot load clReleaseContext!"; - return clReleaseContext_; - } - - clWaitForEventsType clWaitForEvents() { - CHECK(clWaitForEvents_ != nullptr) << "Cannot load clWaitForEvents!"; - return clWaitForEvents_; - } - - clReleaseEventType clReleaseEvent() { - CHECK(clReleaseEvent_ != nullptr) << "Cannot load clReleaseEvent!"; - return clReleaseEvent_; - } - - clEnqueueWriteBufferType clEnqueueWriteBuffer() { - CHECK(clEnqueueWriteBuffer_ != nullptr) - << "Cannot loadcl clEnqueueWriteBuffer!"; - return clEnqueueWriteBuffer_; - } - - clEnqueueReadBufferType clEnqueueReadBuffer() { - CHECK(clEnqueueReadBuffer_ != nullptr) - << "Cannot load clEnqueueReadBuffer!"; - return clEnqueueReadBuffer_; - } - - clEnqueueReadImageType clEnqueueReadImage() { - CHECK(clEnqueueReadImage_ != nullptr) << "Cannot load clEnqueueReadImage!"; - return clEnqueueReadImage_; - } - - clGetProgramBuildInfoType clGetProgramBuildInfo() { - CHECK(clGetProgramBuildInfo_ != nullptr) - << "Cannot load clGetProgramBuildInfo!"; - return clGetProgramBuildInfo_; - } - - clRetainProgramType clRetainProgram() { - CHECK(clRetainProgram_ != nullptr) << "Cannot load clRetainProgram!"; - return clRetainProgram_; - } - - clEnqueueMapBufferType clEnqueueMapBuffer() { - CHECK(clEnqueueMapBuffer_ != nullptr) << "Cannot load clEnqueueMapBuffer!"; - return clEnqueueMapBuffer_; - } - - clEnqueueMapImageType clEnqueueMapImage() { - CHECK(clEnqueueMapImage_ != nullptr) << "Cannot load clEnqueueMapImage!"; - return clEnqueueMapImage_; - } - - clCreateCommandQueueType clCreateCommandQueue() { - CHECK(clCreateCommandQueue_ != nullptr) - << "Cannot load clCreateCommandQueue!"; - return clCreateCommandQueue_; - } - - clCreateCommandQueueWithPropertiesType clCreateCommandQueueWithProperties() { - CHECK(clCreateCommandQueueWithProperties_ != nullptr) - << "Cannot load clCreateCommandQueueWithProperties!"; - return clCreateCommandQueueWithProperties_; - } - - clReleaseCommandQueueType clReleaseCommandQueue() { - CHECK(clReleaseCommandQueue_ != nullptr) - << "Cannot load clReleaseCommandQueue!"; - return clReleaseCommandQueue_; - } 
- - clCreateProgramWithBinaryType clCreateProgramWithBinary() { - CHECK(clCreateProgramWithBinary_ != nullptr) - << "Cannot load clCreateProgramWithBinary!"; - return clCreateProgramWithBinary_; - } - - clRetainContextType clRetainContext() { - CHECK(clRetainContext_ != nullptr) << "Cannot load clRetainContext!"; - return clRetainContext_; - } - - clGetContextInfoType clGetContextInfo() { - CHECK(clGetContextInfo_ != nullptr) << "Cannot load clGetContextInfo!"; - return clGetContextInfo_; - } - - clReleaseProgramType clReleaseProgram() { - CHECK(clReleaseProgram_ != nullptr) << "Cannot load clReleaseProgram!"; - return clReleaseProgram_; - } - - clFlushType clFlush() { - CHECK(clFlush_ != nullptr) << "Cannot load clFlush!"; - return clFlush_; - } - - clFinishType clFinish() { - CHECK(clFinish_ != nullptr) << "Cannot load clFinish!"; - return clFinish_; - } - - clGetProgramInfoType clGetProgramInfo() { - CHECK(clGetProgramInfo_ != nullptr) << "Cannot load clGetProgramInfo!"; - return clGetProgramInfo_; - } - - clCreateKernelType clCreateKernel() { - CHECK(clCreateKernel_ != nullptr) << "Cannot load clCreateKernel!"; - return clCreateKernel_; - } - - clRetainKernelType clRetainKernel() { - CHECK(clRetainKernel_ != nullptr) << "Cannot load clRetainKernel!"; - return clRetainKernel_; - } - - clCreateBufferType clCreateBuffer() { - CHECK(clCreateBuffer_ != nullptr) << "Cannot load clCreateBuffer!"; - return clCreateBuffer_; - } - - clCreateImage2DType clCreateImage2D() { - CHECK(clCreateImage2D_ != nullptr) << "Cannot load clCreateImage2D!"; - return clCreateImage2D_; - } - - clCreateImageType clCreateImage() { - CHECK(clCreateImage_ != nullptr) << "Cannot load clCreateImage!"; - return clCreateImage_; - } - - clCreateUserEventType clCreateUserEvent() { - CHECK(clCreateUserEvent_ != nullptr) << "Cannot load clCreateUserEvent!"; - return clCreateUserEvent_; - } - - clCreateProgramWithSourceType clCreateProgramWithSource() { - CHECK(clCreateProgramWithSource_ != nullptr) - << "Cannot load clCreateProgramWithSource!"; - return clCreateProgramWithSource_; - } - - clReleaseKernelType clReleaseKernel() { - CHECK(clReleaseKernel_ != nullptr) << "Cannot load clReleaseKernel!"; - return clReleaseKernel_; - } - - clGetDeviceInfoType clGetDeviceInfo() { - CHECK(clGetDeviceInfo_ != nullptr) << "Cannot load clGetDeviceInfo!"; - return clGetDeviceInfo_; - } - - clGetDeviceIDsType clGetDeviceIDs() { - CHECK(clGetDeviceIDs_ != nullptr) << "Cannot load clGetDeviceIDs!"; - return clGetDeviceIDs_; - } - - clRetainDeviceType clRetainDevice() { - CHECK(clRetainDevice_ != nullptr) << "Cannot load clRetainDevice!"; - return clRetainDevice_; - } - - clReleaseDeviceType clReleaseDevice() { - CHECK(clReleaseDevice_ != nullptr) << "Cannot load clReleaseDevice!"; - return clReleaseDevice_; - } - - clRetainEventType clRetainEvent() { - CHECK(clRetainEvent_ != nullptr) << "Cannot load clRetainEvent!"; - return clRetainEvent_; - } - - clGetKernelWorkGroupInfoType clGetKernelWorkGroupInfo() { - CHECK(clGetKernelWorkGroupInfo_ != nullptr) - << "Cannot load clGetKernelWorkGroupInfo!"; - return clGetKernelWorkGroupInfo_; - } - - clGetEventInfoType clGetEventInfo() { - CHECK(clGetEventInfo_ != nullptr) << "Cannot load clGetEventInfo!"; - return clGetEventInfo_; - } - - clGetEventProfilingInfoType clGetEventProfilingInfo() { - CHECK(clGetEventProfilingInfo_ != nullptr) - << "Cannot load clGetEventProfilingInfo!"; - return clGetEventProfilingInfo_; - } - - clGetImageInfoType clGetImageInfo() { - CHECK(clGetImageInfo_ != nullptr) << 
"Cannot load clGetImageInfo!"; - return clGetImageInfo_; - } - - clEnqueueCopyBufferType clEnqueueCopyBuffer() { - CHECK(clEnqueueCopyBuffer_ != nullptr) - << "Cannot load clEnqueueCopyBuffer!"; - return clEnqueueCopyBuffer_; - } - - clEnqueueWriteImageType clEnqueueWriteImage() { - CHECK(clEnqueueWriteImage_ != nullptr) - << "Cannot load clEnqueueWriteImage!"; - return clEnqueueWriteImage_; - } - - clEnqueueCopyImageType clEnqueueCopyImage() { - CHECK(clEnqueueCopyImage_ != nullptr) << "Cannot load clEnqueueCopyImage!"; - return clEnqueueCopyImage_; - } - - private: - CLWrapper(); - CLWrapper(const CLWrapper &) = delete; - CLWrapper &operator=(const CLWrapper &) = delete; - bool InitHandle(); - void InitFunctions(); - void *handle_{nullptr}; - clGetPlatformIDsType clGetPlatformIDs_{nullptr}; - clGetPlatformInfoType clGetPlatformInfo_{nullptr}; - clBuildProgramType clBuildProgram_{nullptr}; - clEnqueueNDRangeKernelType clEnqueueNDRangeKernel_{nullptr}; - clSetKernelArgType clSetKernelArg_{nullptr}; - clRetainMemObjectType clRetainMemObject_{nullptr}; - clReleaseMemObjectType clReleaseMemObject_{nullptr}; - clEnqueueUnmapMemObjectType clEnqueueUnmapMemObject_{nullptr}; - clRetainCommandQueueType clRetainCommandQueue_{nullptr}; - clCreateContextType clCreateContext_{nullptr}; - clCreateContextFromTypeType clCreateContextFromType_{nullptr}; - clReleaseContextType clReleaseContext_{nullptr}; - clWaitForEventsType clWaitForEvents_{nullptr}; - clReleaseEventType clReleaseEvent_{nullptr}; - clEnqueueWriteBufferType clEnqueueWriteBuffer_{nullptr}; - clEnqueueReadBufferType clEnqueueReadBuffer_{nullptr}; - clEnqueueReadImageType clEnqueueReadImage_{nullptr}; - clGetProgramBuildInfoType clGetProgramBuildInfo_{nullptr}; - clRetainProgramType clRetainProgram_{nullptr}; - clEnqueueMapBufferType clEnqueueMapBuffer_{nullptr}; - clEnqueueMapImageType clEnqueueMapImage_{nullptr}; - clCreateCommandQueueType clCreateCommandQueue_{nullptr}; - clCreateCommandQueueWithPropertiesType clCreateCommandQueueWithProperties_{ - nullptr}; - clReleaseCommandQueueType clReleaseCommandQueue_{nullptr}; - clCreateProgramWithBinaryType clCreateProgramWithBinary_{nullptr}; - clRetainContextType clRetainContext_{nullptr}; - clGetContextInfoType clGetContextInfo_{nullptr}; - clReleaseProgramType clReleaseProgram_{nullptr}; - clFlushType clFlush_{nullptr}; - clFinishType clFinish_{nullptr}; - clGetProgramInfoType clGetProgramInfo_{nullptr}; - clCreateKernelType clCreateKernel_{nullptr}; - clRetainKernelType clRetainKernel_{nullptr}; - clCreateBufferType clCreateBuffer_{nullptr}; - clCreateImage2DType clCreateImage2D_{nullptr}; - clCreateImageType clCreateImage_{nullptr}; - clCreateUserEventType clCreateUserEvent_{nullptr}; - clCreateProgramWithSourceType clCreateProgramWithSource_{nullptr}; - clReleaseKernelType clReleaseKernel_{nullptr}; - clGetDeviceInfoType clGetDeviceInfo_{nullptr}; - clGetDeviceIDsType clGetDeviceIDs_{nullptr}; - clRetainDeviceType clRetainDevice_{nullptr}; - clReleaseDeviceType clReleaseDevice_{nullptr}; - clRetainEventType clRetainEvent_{nullptr}; - clGetKernelWorkGroupInfoType clGetKernelWorkGroupInfo_{nullptr}; - clGetEventInfoType clGetEventInfo_{nullptr}; - clGetEventProfilingInfoType clGetEventProfilingInfo_{nullptr}; - clGetImageInfoType clGetImageInfo_{nullptr}; - clEnqueueCopyBufferType clEnqueueCopyBuffer_{nullptr}; - clEnqueueWriteImageType clEnqueueWriteImage_{nullptr}; - clEnqueueCopyImageType clEnqueueCopyImage_{nullptr}; -}; -} // namespace lite -} // namespace paddle diff --git 
a/lite/backends/opencl/target_wrapper.cc b/lite/backends/opencl/target_wrapper.cc
deleted file mode 100644
index eb324fcb0f..0000000000
--- a/lite/backends/opencl/target_wrapper.cc
+++ /dev/null
@@ -1,341 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/backends/opencl/target_wrapper.h"
-#include <algorithm>
-#include <array>
-#include "lite/backends/opencl/cl_include.h"
-#include "lite/backends/opencl/cl_runtime.h"
-#include "lite/backends/opencl/cl_utility.h"
-
-namespace paddle {
-namespace lite {
-
-static cl_channel_type GetCLChannelType(const PrecisionType type) {
-  switch (type) {
-    case PRECISION(kFloat):
-      return CL_FLOAT;
-    case PRECISION(kInt32):
-      return CL_SIGNED_INT32;
-    case PRECISION(kInt8):
-      return CL_SIGNED_INT8;
-    default:
-      LOG(FATAL) << "Unsupported image channel type: " << PrecisionToStr(type);
-      return 0;
-  }
-}
-
-void *TargetWrapperCL::Malloc(size_t size) {
-  cl_int status;
-  cl::Buffer *buffer = new cl::Buffer(CLRuntime::Global()->context(),
-                                      CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
-                                      size,
-                                      nullptr,
-                                      &status);
-  if (status != CL_SUCCESS) {
-    delete buffer;
-    buffer = nullptr;
-  }
-  CL_CHECK_FATAL(status);
-  return buffer;
-}
-
-void TargetWrapperCL::Free(void *ptr) {
-  if (ptr != nullptr) {
-    cl::Buffer *cl_buffer = static_cast<cl::Buffer *>(ptr);
-    delete cl_buffer;
-  }
-}
-
-void *TargetWrapperCL::MallocImage(const std::array<size_t, 2> &image_shape,
-                                   PrecisionType data_type) {
-  cl::ImageFormat img_format(CL_RGBA, GetCLChannelType(data_type));
-  cl_int status;
-  size_t width = image_shape[0];
-  size_t height = image_shape[1];
-  cl::Image2D *cl_image =
-      new cl::Image2D(CLRuntime::Global()->context(),
-                      CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
-                      img_format,
-                      width,
-                      height,
-                      0,
-                      nullptr,
-                      &status);
-  if (status != CL_SUCCESS) {
-    delete cl_image;
-    cl_image = nullptr;
-  }
-  CL_CHECK_FATAL(status);
-  return cl_image;
-}
-
-void TargetWrapperCL::FreeImage(void *image) {
-  if (image != nullptr) {
-    cl::Image2D *cl_image = static_cast<cl::Image2D *>(image);
-    delete cl_image;
-  }
-}
-
-void *TargetWrapperCL::Map(void *buffer, size_t offset, size_t size) {
-  cl::Buffer *cl_buffer = static_cast<cl::Buffer *>(buffer);
-  cl_int status;
-  void *mapped_ptr = CLRuntime::Global()->command_queue().enqueueMapBuffer(
-      *cl_buffer,
-      CL_TRUE,
-      CL_MAP_READ | CL_MAP_WRITE,
-      offset,
-      size,
-      nullptr,
-      nullptr,
-      &status);
-  if (status != CL_SUCCESS) {
-    mapped_ptr = nullptr;
-  }
-  CL_CHECK_FATAL(status);
-  return mapped_ptr;
-}
-
-void *TargetWrapperCL::MapImage(void *image,
-                                const std::array<size_t, 2> &image_shape,
-                                std::array<size_t, 2> *image_pitch) {
-  cl::Image2D *cl_image = static_cast<cl::Image2D *>(image);
-  size_t width = image_shape[0];
-  size_t height = image_shape[1];
-  size_t *row_pitch = image_pitch->data();
-  size_t *slice_pitch = image_pitch->data() + 1;
-  std::array<size_t, 3> origin{{0, 0, 0}};
-  std::array<size_t, 3> region{{width, height, 1}};
-  cl_int status;
-  void *mapped_ptr = CLRuntime::Global()->command_queue().enqueueMapImage(
-      *cl_image,
-      CL_TRUE,
-      CL_MAP_READ | CL_MAP_WRITE,
-      origin,
-      region,
-      row_pitch,
-      slice_pitch,
-      nullptr,
-      nullptr,
-      &status);
-  if (status != CL_SUCCESS) {
-    mapped_ptr = nullptr;
-  }
-  CL_CHECK_FATAL(status);
-  return mapped_ptr;
-}
-
-void TargetWrapperCL::Unmap(void *cl_obj, void *mapped_ptr) {
-  cl::Memory *mem_obj = static_cast<cl::Memory *>(cl_obj);
-  cl_int status = CLRuntime::Global()->command_queue().enqueueUnmapMemObject(
-      *mem_obj, mapped_ptr, nullptr, nullptr);
-  CL_CHECK_FATAL(status);
-}
-
-void TargetWrapperCL::MemcpySync(void *dst,
-                                 const void *src,
-                                 size_t size,
-                                 IoDirection dir) {
-  cl_int status;
-  cl::Event event;
-  auto stream = CLRuntime::Global()->command_queue();
-  switch (dir) {
-    case IoDirection::DtoD:
-      status = stream.enqueueCopyBuffer(*static_cast<const cl::Buffer *>(src),
-                                        *static_cast<cl::Buffer *>(dst),
-                                        0,
-                                        0,
-                                        size,
-                                        nullptr,
-                                        &event);
-      CL_CHECK_FATAL(status);
-      event.wait();
-      break;
-    case IoDirection::HtoD:
-      status = stream.enqueueWriteBuffer(*static_cast<cl::Buffer *>(dst),
-                                         CL_TRUE,
-                                         0,
-                                         size,
-                                         src,
-                                         nullptr,
-                                         nullptr);
-      CL_CHECK_FATAL(status);
-      break;
-    case IoDirection::DtoH:
-      status = stream.enqueueReadBuffer(*static_cast<const cl::Buffer *>(src),
-                                        CL_TRUE,
-                                        0,
-                                        size,
-                                        dst,
-                                        nullptr,
-                                        nullptr);
-      CL_CHECK_FATAL(status);
-      break;
-    default:
-      LOG(FATAL) << "Unsupported IoDirection " << static_cast<int>(dir);
-  }
-}
-
-void TargetWrapperCL::MemcpyAsync(void *dst,
-                                  const void *src,
-                                  size_t size,
-                                  IoDirection dir,
-                                  const stream_t &stream) {
-  cl_int status;
-  switch (dir) {
-    case IoDirection::DtoD:
-      status = stream.enqueueCopyBuffer(*static_cast<const cl::Buffer *>(src),
-                                        *static_cast<cl::Buffer *>(dst),
-                                        0,
-                                        0,
-                                        size,
-                                        nullptr,
-                                        nullptr);
-      CL_CHECK_FATAL(status);
-      break;
-    case IoDirection::HtoD:
-      status = stream.enqueueWriteBuffer(*static_cast<cl::Buffer *>(dst),
-                                         CL_FALSE,
-                                         0,
-                                         size,
-                                         src,
-                                         nullptr,
-                                         nullptr);
-      CL_CHECK_FATAL(status);
-      break;
-    case IoDirection::DtoH:
-      status = stream.enqueueReadBuffer(*static_cast<const cl::Buffer *>(src),
-                                        CL_FALSE,
-                                        0,
-                                        size,
-                                        dst,
-                                        nullptr,
-                                        nullptr);
-      CL_CHECK_FATAL(status);
-      break;
-    default:
-      LOG(FATAL) << "Unsupported IoDirection " << static_cast<int>(dir);
  }
-}
-
-void TargetWrapperCL::ImgcpySync(void *dst,
-                                 const void *src,
-                                 const std::array<size_t, 2> &image_shape,
-                                 const std::array<size_t, 2> &image_pitch,
-                                 IoDirection dir) {
-  size_t width = image_shape[0];
-  size_t height = image_shape[1];
-  size_t row_pitch = image_pitch[0];
-  size_t slice_pitch = image_pitch[1];
-  std::array<size_t, 3> origin{{0, 0, 0}};
-  std::array<size_t, 3> region{{width, height, 1}};
-  cl_int status;
-  cl::Event event;
-  auto stream = CLRuntime::Global()->command_queue();
-  switch (dir) {
-    case IoDirection::DtoD:
-      status = stream.enqueueCopyImage(*static_cast<const cl::Image2D *>(src),
-                                       *static_cast<cl::Image2D *>(dst),
-                                       origin,
-                                       origin,
-                                       region,
-                                       nullptr,
-                                       &event);
-      CL_CHECK_FATAL(status);
-      event.wait();
-      break;
-    case IoDirection::HtoD:
-      status = stream.enqueueWriteImage(*static_cast<cl::Image2D *>(dst),
-                                        CL_TRUE,
-                                        origin,
-                                        region,
-                                        row_pitch,
-                                        slice_pitch,
-                                        src,
-                                        nullptr,
-                                        nullptr);
-      CL_CHECK_FATAL(status);
-      break;
-    case IoDirection::DtoH:
-      status = stream.enqueueReadImage(*static_cast<const cl::Image2D *>(src),
-                                       CL_TRUE,
-                                       origin,
-                                       region,
-                                       row_pitch,
-                                       slice_pitch,
-                                       dst,
-                                       nullptr,
-                                       nullptr);
-      CL_CHECK_FATAL(status);
-      break;
-    default:
-      LOG(FATAL) << "Unsupported IoDirection " << static_cast<int>(dir);
-  }
-}
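The synchronous helpers above block either through CL_TRUE blocking flags or an explicit event.wait(); the Async variants pass CL_FALSE and the caller's queue instead. For reference, a blocking host-to-device-to-host round trip through this interface; the element count is illustrative, the calls match the code above:

// Illustrative sketch only, not part of the original sources.
#include <vector>
#include "lite/backends/opencl/target_wrapper.h"

void RoundTrip() {
  using paddle::lite::TargetWrapperCL;
  using paddle::lite::IoDirection;
  std::vector<float> host_in(1024, 1.f), host_out(1024, 0.f);
  const size_t bytes = host_in.size() * sizeof(float);
  void* device_buf = TargetWrapperCL::Malloc(bytes);  // a cl::Buffer underneath
  TargetWrapperCL::MemcpySync(device_buf, host_in.data(), bytes,
                              IoDirection::HtoD);  // enqueueWriteBuffer, CL_TRUE
  TargetWrapperCL::MemcpySync(host_out.data(), device_buf, bytes,
                              IoDirection::DtoH);  // enqueueReadBuffer, CL_TRUE
  TargetWrapperCL::Free(device_buf);
}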
std::array origin{{0, 0, 0}}; - std::array region{{width, height, 1}}; - cl_int status; - switch (dir) { - case IoDirection::DtoD: - status = stream.enqueueCopyImage(*static_cast(src), - *static_cast(dst), - origin, - origin, - region, - nullptr, - nullptr); - CL_CHECK_FATAL(status); - break; - case IoDirection::HtoD: - status = stream.enqueueWriteImage(*static_cast(dst), - CL_FALSE, - origin, - region, - row_pitch, - slice_pitch, - src, - nullptr, - nullptr); - CL_CHECK_FATAL(status); - break; - case IoDirection::DtoH: - status = stream.enqueueReadImage(*static_cast(src), - CL_FALSE, - origin, - region, - row_pitch, - slice_pitch, - dst, - nullptr, - nullptr); - CL_CHECK_FATAL(status); - break; - default: - LOG(FATAL) << "Unsupported IoDirection " << static_cast(dir); - } -} - -} // namespace lite -} // namespace paddle diff --git a/lite/backends/opencl/target_wrapper.h b/lite/backends/opencl/target_wrapper.h deleted file mode 100644 index 8ff8e6fd40..0000000000 --- a/lite/backends/opencl/target_wrapper.h +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include "lite/backends/opencl/cl_include.h" -#include "lite/core/target_wrapper.h" - -namespace paddle { -namespace lite { - -using TargetWrapperCL = - TargetWrapper; -// This interface should be specified by each kind of target. 
-template <> -class TargetWrapper { - public: - using stream_t = cl::CommandQueue; - using event_t = cl::Event; - - static size_t num_devices() { return 0; } - static size_t maximum_stream() { return 0; } - - static void CreateStream(stream_t* stream) {} - static void DestroyStream(const stream_t& stream) {} - - static void CreateEvent(event_t* event) {} - static void DestroyEvent(const event_t& event) {} - - static void RecordEvent(const event_t& event) {} - static void SyncEvent(const event_t& event) {} - - static void StreamSync(const stream_t& stream) {} - - static void* Malloc(size_t size); - static void Free(void* ptr); - - static void* MallocImage(const std::array& image_shape, - PrecisionType data_type); - static void FreeImage(void* image); - - static void* Map(void* buffer, size_t offset, size_t size); - static void* MapImage(void* image, - const std::array& image_shape, - std::array* image_pitch); - static void Unmap(void* cl_obj, void* mapped_ptr); - - static void MemcpySync(void* dst, - const void* src, - size_t size, - IoDirection dir); - static void MemcpyAsync(void* dst, - const void* src, - size_t size, - IoDirection dir, - const stream_t& stream); - static void ImgcpySync(void* dst, - const void* src, - const std::array& image_shape, - const std::array& image_pitch, - IoDirection dir); - static void ImgcpyAsync(void* dst, - const void* src, - const std::array& image_shape, - const std::array& image_pitch, - IoDirection dir, - const stream_t& stream); -}; - -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/CMakeLists.txt b/lite/backends/x86/CMakeLists.txt deleted file mode 100644 index 34e0800130..0000000000 --- a/lite/backends/x86/CMakeLists.txt +++ /dev/null @@ -1,14 +0,0 @@ -if (NOT LITE_WITH_X86) - return() -endif() - -configure_file(cupti_lib_path.h.in ${CMAKE_CURRENT_BINARY_DIR}/cupti_lib_path.h) -configure_file(warpctc_lib_path.h.in ${CMAKE_CURRENT_BINARY_DIR}/warpctc_lib_path.h) - -lite_cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags) -lite_cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml) -lite_cc_library(target_wrapper_x86 SRCS target_wrapper.cc) -lite_cc_library(x86_cpu_info SRCS cpu_info.cc DEPS xbyak) - -add_subdirectory(jit) -add_subdirectory(math) diff --git a/lite/backends/x86/cpu_info.cc b/lite/backends/x86/cpu_info.cc deleted file mode 100644 index c2759d6191..0000000000 --- a/lite/backends/x86/cpu_info.cc +++ /dev/null @@ -1,160 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/backends/x86/cpu_info.h" - -#ifdef PADDLE_WITH_XBYAK -#include "xbyak/xbyak.h" -#include "xbyak/xbyak_util.h" -#endif - -#ifdef __APPLE__ -#include -#include -#elif defined(_WIN32) -#define NOMINMAX // msvc max/min macro conflict with std::min/max -#include -#else -#include -#endif // _WIN32 - -#include -#include - -DEFINE_double(fraction_of_cpu_memory_to_use, - 1, - "Default use 100% of CPU memory for PaddlePaddle," - "reserve the rest for page tables, etc"); -DEFINE_uint64(initial_cpu_memory_in_mb, - 500ul, - "Initial CPU memory for PaddlePaddle, in MD unit."); - -DEFINE_double( - fraction_of_cuda_pinned_memory_to_use, - 0.5, - "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle," - "reserve the rest for page tables, etc"); - -// If use_pinned_memory is true, CPUAllocator calls mlock, which -// returns pinned and locked memory as staging areas for data exchange -// between host and device. Allocates too much would reduce the amount -// of memory available to the system for paging. So, by default, we -// should set false to use_pinned_memory. -DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory."); - -namespace paddle { -namespace lite { -namespace x86 { - -size_t CpuTotalPhysicalMemory() { -#ifdef __APPLE__ - int mib[2]; - mib[0] = CTL_HW; - mib[1] = HW_MEMSIZE; - int64_t size = 0; - size_t len = sizeof(size); - if (sysctl(mib, 2, &size, &len, NULL, 0) == 0) return (size_t)size; - return 0L; -#elif defined(_WIN32) - MEMORYSTATUSEX sMeminfo; - sMeminfo.dwLength = sizeof(sMeminfo); - GlobalMemoryStatusEx(&sMeminfo); - return sMeminfo.ullTotalPhys; -#else - int64_t pages = sysconf(_SC_PHYS_PAGES); - int64_t page_size = sysconf(_SC_PAGE_SIZE); - return pages * page_size; -#endif -} - -size_t CpuMaxAllocSize() { - // For distributed systems, it requires configuring and limiting - // the fraction of memory to use. - return FLAGS_fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory(); -} - -size_t CpuMinChunkSize() { - // Allow to allocate the minimum chunk size is 4 KB. - return 1 << 12; -} - -size_t CpuMaxChunkSize() { - // Allow to allocate the maximum chunk size is roughly 3% of CPU memory, - // or the initial_cpu_memory_in_mb. - return std::min( - static_cast(CpuMaxAllocSize() / 32), - static_cast(FLAGS_initial_cpu_memory_in_mb * 1 << 20)); -} - -size_t CUDAPinnedMaxAllocSize() { - // For distributed systems, it requires configuring and limiting - // the fraction of memory to use. - return FLAGS_fraction_of_cuda_pinned_memory_to_use * CpuTotalPhysicalMemory(); -} - -size_t CUDAPinnedMinChunkSize() { - // Allow to allocate the minimum chunk size is 64 KB. - return 1 << 16; -} - -size_t CUDAPinnedMaxChunkSize() { - // Allow to allocate the maximum chunk size is roughly 1/256 of CUDA_PINNED - // memory. 
- return CUDAPinnedMaxAllocSize() / 256; -} - -#ifdef PADDLE_WITH_XBYAK -static Xbyak::util::Cpu cpu; -bool MayIUse(const cpu_isa_t cpu_isa) { - using namespace Xbyak::util; // NOLINT - switch (cpu_isa) { - case sse42: - return cpu.has(Cpu::tSSE42); - case avx: - return cpu.has(Cpu::tAVX); - case avx2: - return cpu.has(Cpu::tAVX2); - case avx512f: - return cpu.has(Cpu::tAVX512F); - case avx512_core: - return true && cpu.has(Cpu::tAVX512F) && cpu.has(Cpu::tAVX512BW) && - cpu.has(Cpu::tAVX512VL) && cpu.has(Cpu::tAVX512DQ); - case avx512_core_vnni: - return true && cpu.has(Cpu::tAVX512F) && cpu.has(Cpu::tAVX512BW) && - cpu.has(Cpu::tAVX512VL) && cpu.has(Cpu::tAVX512DQ) && - cpu.has(Cpu::tAVX512_VNNI); - case avx512_mic: - return true && cpu.has(Cpu::tAVX512F) && cpu.has(Cpu::tAVX512CD) && - cpu.has(Cpu::tAVX512ER) && cpu.has(Cpu::tAVX512PF); - case avx512_mic_4ops: - return true && MayIUse(avx512_mic) && cpu.has(Cpu::tAVX512_4FMAPS) && - cpu.has(Cpu::tAVX512_4VNNIW); - case isa_any: - return true; - } - return false; -} -#else -bool MayIUse(const cpu_isa_t cpu_isa) { - if (cpu_isa == isa_any) { - return true; - } else { - return false; - } -} -#endif - -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/cpu_info.h b/lite/backends/x86/cpu_info.h deleted file mode 100644 index c60cc4798c..0000000000 --- a/lite/backends/x86/cpu_info.h +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#ifdef _WIN32 -#if defined(__AVX2__) -#include // avx2 -#elif defined(__AVX__) -#include // avx -#endif // AVX -#else // WIN32 -#ifdef __AVX__ -#include -#endif -#endif // WIN32 - -#if defined(_WIN32) -#define ALIGN32_BEG __declspec(align(32)) -#define ALIGN32_END -#else -#define ALIGN32_BEG -#define ALIGN32_END __attribute__((aligned(32))) -#endif // _WIN32 - -namespace paddle { -namespace lite { -namespace x86 { - -size_t CpuTotalPhysicalMemory(); - -//! Get the maximum allocation size for a machine. -size_t CpuMaxAllocSize(); - -//! Get the maximum allocation size for a machine. -size_t CUDAPinnedMaxAllocSize(); - -//! Get the minimum chunk size for buddy allocator. -size_t CpuMinChunkSize(); - -//! Get the maximum chunk size for buddy allocator. -size_t CpuMaxChunkSize(); - -//! Get the minimum chunk size for buddy allocator. -size_t CUDAPinnedMinChunkSize(); - -//! Get the maximum chunk size for buddy allocator. 
-size_t CUDAPinnedMaxChunkSize(); - -typedef enum { - isa_any, - sse42, - avx, - avx2, - avx512f, - avx512_core, - avx512_core_vnni, - avx512_mic, - avx512_mic_4ops, -} cpu_isa_t; // Instruction set architecture - -// May I use some instruction -bool MayIUse(const cpu_isa_t cpu_isa); - -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/cupti_lib_path.h.in b/lite/backends/x86/cupti_lib_path.h.in deleted file mode 100644 index 017384bfbb..0000000000 --- a/lite/backends/x86/cupti_lib_path.h.in +++ /dev/null @@ -1,17 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#define CUPTI_LIB_PATH "@CUPTI_LIBRARY_PATH@" diff --git a/lite/backends/x86/dynamic_loader.cc b/lite/backends/x86/dynamic_loader.cc deleted file mode 100644 index 0f27a19cf5..0000000000 --- a/lite/backends/x86/dynamic_loader.cc +++ /dev/null @@ -1,263 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include "lite/backends/x86/dynamic_loader.h" - -#include -#include // NOLINT -#include - -#include "gflags/gflags.h" -#include "glog/logging.h" -#include "lite/backends/x86/cupti_lib_path.h" -#include "lite/backends/x86/port.h" -#include "lite/backends/x86/warpctc_lib_path.h" -#include "lite/utils/paddle_enforce.h" - -DEFINE_string(cudnn_dir, - "", - "Specify path for loading libcudnn.so. For instance, " - "/usr/local/cudnn/lib. If empty [default], dlopen " - "will search cudnn from LD_LIBRARY_PATH"); - -DEFINE_string(cuda_dir, - "", - "Specify path for loading cuda library, such as libcublas, " - "libcurand. For instance, /usr/local/cuda/lib64. If default, " - "dlopen will search cuda from LD_LIBRARY_PATH"); - -DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so."); - -DEFINE_string(nccl_dir, - "", - "Specify path for loading nccl library, such as libcublas, " - "libcurand. For instance, /usr/local/cuda/lib64. 
If default, " - "dlopen will search cuda from LD_LIBRARY_PATH"); - -DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so."); - -DEFINE_string( - tensorrt_dir, - "", - "Specify path for loading tensorrt library, such as libnvinfer.so."); - -DEFINE_string(mklml_dir, "", "Specify path for loading libmklml_intel.so."); - -namespace paddle { -namespace lite { -namespace x86 { -static constexpr char cupti_lib_path[] = CUPTI_LIB_PATH; -static constexpr char warpctc_lib_path[] = WARPCTC_LIB_PATH; - -#if defined(_WIN32) && defined(PADDLE_WITH_CUDA) -static constexpr char* win_cublas_lib = "cublas64_" PADDLE_CUDA_BINVER ".dll"; -static constexpr char* win_curand_lib = "curand64_" PADDLE_CUDA_BINVER ".dll"; -static constexpr char* win_cudnn_lib = "cudnn64_" PADDLE_CUDNN_BINVER ".dll"; -#endif - -static inline std::string join(const std::string& part1, - const std::string& part2) { - // directory separator - const char sep = '/'; - if (!part2.empty() && part2.front() == sep) { - return part2; - } - std::string ret; - ret.reserve(part1.size() + part2.size() + 1); - ret = part1; - if (!ret.empty() && ret.back() != sep) { - ret += sep; - } - ret += part2; - return ret; -} - -static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path, - int dynload_flags) { - VLOG(3) << "Try to find library: " << dso_path - << " from default system path."; - // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH - // and /usr/local/lib path - void* dso_handle = dlopen(dso_path.c_str(), dynload_flags); - -// DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to -// bring System Integrity Projection (SIP), if dso_handle -// is null, search from default package path in Mac OS. -#if defined(__APPLE__) || defined(__OSX__) - if (nullptr == dso_handle) { - dso_handle = - dlopen(join("/usr/local/cuda/lib/", dso_path).c_str(), dynload_flags); - if (nullptr == dso_handle) { - if (dso_path == "libcudnn.dylib") { - LOG(WARNING) << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n " - "For instance, sudo tar -xzf " - "cudnn-7.5-osx-x64-v5.0-ga.tgz -C /usr/local \n sudo " - "chmod a+r /usr/local/cuda/include/cudnn.h " - "/usr/local/cuda/lib/libcudnn*"; - } - } - } -#endif - - if (nullptr == dso_handle) { - LOG(WARNING) << "Can not find library: " << dso_path - << ". The process maybe hang. 
Please try to add the lib path " - "to LD_LIBRARY_PATH."; - } - return dso_handle; -} - -static inline void* GetDsoHandleFromSearchPath(const std::string& search_root, - const std::string& dso_name, - bool throw_on_error = true) { -#if !defined(_WIN32) - int dynload_flags = RTLD_LAZY | RTLD_LOCAL; -#else - int dynload_flags = 0; -#endif // !_WIN32 - void* dso_handle = nullptr; - - std::string dlPath = dso_name; - if (search_root.empty()) { - dso_handle = GetDsoHandleFromDefaultPath(dlPath, dynload_flags); - } else { - // search xxx.so from custom path - dlPath = join(search_root, dso_name); - dso_handle = dlopen(dlPath.c_str(), dynload_flags); -#if !defined(_WIN32) - auto errorno = dlerror(); -#else - auto errorno = GetLastError(); -#endif // !_WIN32 - // if not found, search from default path - if (nullptr == dso_handle) { - LOG(WARNING) << "Failed to find dynamic library: " << dlPath << " (" - << errorno << ")"; - if (dlPath.find("nccl") != std::string::npos) { - LOG(INFO) - << "You may need to install 'nccl2' from NVIDIA official website: " - << "https://developer.nvidia.com/nccl/nccl-download" - << "before install PaddlePaddle"; - } - dlPath = dso_name; - dso_handle = GetDsoHandleFromDefaultPath(dlPath, dynload_flags); - } - } - auto error_msg = - "Failed to find dynamic library: %s ( %s ) \n Please specify " - "its path correctly using following ways: \n Method. set " - "environment variable LD_LIBRARY_PATH on Linux or " - "DYLD_LIBRARY_PATH on Mac OS. \n For instance, issue command: " - "export LD_LIBRARY_PATH=... \n Note: After Mac OS 10.11, " - "using the DYLD_LIBRARY_PATH is impossible unless System " - "Integrity Protection (SIP) is disabled."; -#if !defined(_WIN32) - auto errorno = dlerror(); -#else - auto errorno = GetLastError(); -#endif // !_WIN32 - if (throw_on_error) { - CHECK(dso_handle != nullptr); - // PADDLE_ENFORCE(nullptr != dso_handle, error_msg, dlPath, errorno); - } else if (nullptr == dso_handle) { - // LOG(WARNING) << string::Sprintf(error_msg, dlPath, errorno); - } - - return dso_handle; -} - -void* GetCublasDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib"); -#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_cublas_lib); -#else - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so"); -#endif -} - -void* GetCUDNNDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", false); -#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) - return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, win_cudnn_lib); -#else - return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", false); -#endif -} - -void* GetCUPTIDsoHandle() { - std::string cupti_path = cupti_lib_path; - if (!FLAGS_cupti_dir.empty()) { - cupti_path = FLAGS_cupti_dir; - } -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(cupti_path, "libcupti.dylib", false); -#else - return GetDsoHandleFromSearchPath(cupti_path, "libcupti.so", false); -#endif -} - -void* GetCurandDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib"); -#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_curand_lib); -#else - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so"); -#endif -} - -void* GetWarpCTCDsoHandle() { - std::string warpctc_dir = 
warpctc_lib_path; - if (!FLAGS_warpctc_dir.empty()) { - warpctc_dir = FLAGS_warpctc_dir; - } -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(warpctc_dir, "libwarpctc.dylib"); -#elif defined(_WIN32) - return GetDsoHandleFromSearchPath(warpctc_dir, "warpctc.dll"); -#else - return GetDsoHandleFromSearchPath(warpctc_dir, "libwarpctc.so"); -#endif -} - -void* GetNCCLDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib"); -#else - return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.so"); -#endif -} - -void* GetTensorRtDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.dylib"); -#else - return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.so"); -#endif -} - -void* GetMKLMLDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.dylib"); -#elif defined(_WIN32) - return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "mklml.dll"); -#else - return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.so"); -#endif -} - -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/dynamic_loader.h b/lite/backends/x86/dynamic_loader.h deleted file mode 100644 index 81c277ffc8..0000000000 --- a/lite/backends/x86/dynamic_loader.h +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -namespace paddle { -namespace lite { -namespace x86 { - -#ifndef _WIN32 -#define DECLARE_TYPE(__name, ...) decltype(__name(__VA_ARGS__)) -#else -#define DECLARE_TYPE(__name, ...) decltype(auto) -#endif - -void* GetCublasDsoHandle(); -void* GetCUDNNDsoHandle(); -void* GetCUPTIDsoHandle(); -void* GetCurandDsoHandle(); -void* GetWarpCTCDsoHandle(); -void* GetNCCLDsoHandle(); -void* GetTensorRtDsoHandle(); -void* GetMKLMLDsoHandle(); - -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/jit/CMakeLists.txt b/lite/backends/x86/jit/CMakeLists.txt deleted file mode 100644 index e4113832c6..0000000000 --- a/lite/backends/x86/jit/CMakeLists.txt +++ /dev/null @@ -1,26 +0,0 @@ - -set(jit_file ${PADDLE_BINARY_DIR}/lite/backends/x86/jit/kernels.h) -file(WRITE ${jit_file} "// Generated by the lite/backends/x86/jit/CMakeLists.txt. 
DO NOT EDIT!\n\n") -file(APPEND ${jit_file} "\#pragma once\n") -file(APPEND ${jit_file} "\#include \"lite/backends/x86/jit/helper.h\"\n") -file(APPEND ${jit_file} "\#include \"lite/backends/x86/jit/registry.h\"\n\n") - -set(JIT_KERNEL_DEPS x86_cpu_info cblas gflags xxhash) - -file(GLOB jit_kernel_cc_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc") -list(REMOVE_ITEM jit_kernel_cc_srcs test.cc benchmark.cc) -lite_cc_library(jit_kernel_base SRCS ${jit_kernel_cc_srcs} DEPS ${JIT_KERNEL_DEPS}) - -# refer must go first -add_subdirectory(refer) -add_subdirectory(more) -if(WITH_XBYAK) - add_subdirectory(gen) -endif() - -lite_cc_library(jit_kernel_helper SRCS ${jit_kernel_cc_srcs} DEPS ${JIT_KERNEL_DEPS}) -#lite_cc_test(jit_kernel_test SRCS test.cc DEPS jit_kernel_helper) - -#if(NOT WIN32) - #lite_cc_binary(jit_kernel_benchmark SRCS benchmark.cc DEPS jit_kernel_helper tensor) -#endif() diff --git a/lite/backends/x86/jit/README.en.md b/lite/backends/x86/jit/README.en.md deleted file mode 100644 index cd2aa5c242..0000000000 --- a/lite/backends/x86/jit/README.en.md +++ /dev/null @@ -1,103 +0,0 @@ -# JIT Kernel - -JIT (Just-In-Time) Kernel contains actually generated code together with other implementations of the same logic. -Each implementation has its own condition for use, defined in `CanBeUsed`. -They are combined to get the best performance for one single independent function. -They can be very simple functions like vector multiply, or complicated functions like LSTM. -They can also be composed with other existing jit kernels to build up a complex function. -Currently it is only supported on CPU. - -## Contents - -```txt -PaddlePaddle/Paddle/paddle/fluid/ -├── ... -└── lite/ - ├── .../ - └── jit/ - ├── ... - ├── gen/ - │ └── ... - ├── more/ - │ ├── ... - │ ├── mkl/ - │ │ └── ... - │ ├── mkldnn/ - │ │ └── ... - │ ├── mix/ - │ │ └── ... - │ ├── intrinsic/ - │ │ └── ... - │ └── openblas/ - │ └── ... - └── refer/ - └── ... -``` - -All basic definitions of jit kernels live in `lite/backends/x86/jit`, including the three key folders `refer`, `gen`, and `more`. Each kernel has exactly one unique name, while it may have several implementations with the same functionality. - -- `refer`: Each kernel must have one reference implementation on CPU; it should focus only on correctness and must not depend on any third-party libraries. -- `gen`: The generated code is kept here. These implementations are designed for the best performance and depend on Xbyak. -- `more`: All other implementations are kept in this folder, with one directory per library kind or method kind, such as mkl, mkldnn, openblas or intrinsic code. Each implementation should have its own advantage. - -## How to use - -We provide these methods to get the functions: -- `GetAllCandidateFuncs`. It returns all the supported implementations. All of them produce the same result, so you can run a runtime benchmark to choose which one should actually be used. -- `GetDefaultBestFunc`. It returns one default function pointer, tuned offline with some general configurations and attributes. This should cover most situations. -- `KernelFuncs::Cache()`. It gets the default best function and caches it for the next call with the same attribute. -- `GetReferFunc`. It returns only the CPU reference code; all the other implementations keep the same logic as this reference code. A verification sketch is shown right after this list.
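A minimal verification sketch (not part of the original README; the buffers `src_data`, `dst_ref`, `dst_out` and the size `width` are assumed to be prepared by the caller):

```cpp
  using T = float;
  jit::seq_pool_attr_t attr(width, jit::SeqPoolType::kSum);
  // Reference implementation: the ground truth every candidate must match.
  auto ref = jit::GetReferFunc<jit::SeqPoolTuple<T>>();
  ref(src_data, dst_ref, &attr);
  // Default best implementation, fetched from the cache.
  auto best =
      jit::KernelFuncs<jit::SeqPoolTuple<T>, platform::CPUPlace>::Cache().At(attr);
  best(src_data, dst_out, &attr);
  // dst_out should match dst_ref element-wise within floating-point tolerance.
```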
- -And here are some examples: - -Get from cache: - -```cpp - using T = float; - jit::seq_pool_attr_t attr(width, jit::SeqPoolType::kSum); - auto seqpool_func = jit::KernelFuncs<jit::SeqPoolTuple<T>, platform::CPUPlace>::Cache().At(attr); - seqpool_func(src_data, dst_data, &attr); -``` - -Get all implementations and run each once: - -```cpp - using T = float; - jit::seq_pool_attr_t attr(width, jit::SeqPoolType::kSum); - auto funcs = jit::GetAllCandidateFuncsWithTypes<jit::SeqPoolTuple<T>, platform::CPUPlace>(attr); - for (auto f : funcs) { - LOG(INFO) << "Kernel implementation type: " << f.first; - f.second(src_data, dst_data, &attr); - } -``` - -All kernels are included in `lite/backends/x86/jit/kernels.h`, which is automatically generated at compile time; you only need to include this one header to get all the registered kernels. - -## Solid Test - -- Unit Test - All functions should be compared with the corresponding reference functions, covering both the `float` and `double` data types. -- Benchmark - All functions should be benchmarked, making sure that `jit::GetDefaultBestFunc` obtains the best performance under all attributes. - -# How to add new kernel - -## Required - -1. Add `your_key` to `KernelType`. -2. Add your new `KernelTuple`, which must include `your_key`. It should be a combination of the data type, attribute type and function type. You can refer to `SeqPoolTuple`. -3. Add the reference function of `your_key` (see the sketch after this list). -Note: - - it must run on CPU and must not depend on any third-party library. - - Add `USE_JITKERNEL_REFER(your_key)` in `refer/CMakeLists.txt` to make sure this code can be used. -4. Add a unit test in `test.cc`, and verify at least `float` and `double`. -Test more data types for some special functions if necessary, for example `int8`. -5. Add functions in `benchmark.cc` to benchmark all functions of the same `KernelType`, and make sure `GetDefaultBestFunc` always gets the best one.
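To make step 3 concrete, here is a sketch of a reference implementation for a hypothetical `kVAddOne` kernel (the key and the function are illustrative, not an existing kernel in this codebase):

```cpp
// refer: a plain CPU loop, correct by construction and free of
// third-party dependencies.
template <typename T>
void VAddOne(const T* x, T* y, int n) {
  for (int i = 0; i < n; ++i) {
    y[i] = x[i] + static_cast<T>(1);
  }
}
```

It would then be registered through the refer registration macro and enabled with `USE_JITKERNEL_REFER(kVAddOne)` in `refer/CMakeLists.txt`.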
-## Optional - -Add more implementations of `your_key` for better performance. - -1. Add functions based on generated code in `gen`. They should be derived from `JitCode` and have a corresponding creator derived from `JitCodeCreator`, registered under `your_key`. -2. If a new attribute type is added, you should specialize `JitCodeKey` for this type. -3. Add more functions in `more`; you can use any third-party library you wish, like mkl, mkldnn or intrinsic code, to reach the best performance. diff --git a/lite/backends/x86/jit/README.md b/lite/backends/x86/jit/README.md deleted file mode 100644 index 6998c5d867..0000000000 --- a/lite/backends/x86/jit/README.md +++ /dev/null @@ -1,94 +0,0 @@ -# JIT Kernel - -Combine function templates with JIT to generate the kernel functions that are needed. -A kernel here is a compute unit at a smaller granularity than an Operator kernel, focusing on performance on different hardware. It can have implementations from multiple third-party libraries, and each implementation has its own `CanBeUsed` function that decides under which conditions it may be called. -The functions implemented here can be very fine-grained, such as vector multiply, or complex logic such as LSTM; complex logic can also be composed from its own lower-level functions. -Currently only high-performance computation on CPU is supported. - -## Directory structure - -```txt -PaddlePaddle/Paddle/paddle/fluid/ -├── ... -└── lite/ - ├── .../ - └── jit/ - ├── ... - ├── gen/ - │ └── ... - ├── more/ - │ ├── ... - │ ├── mkl/ - │ │ └── ... - │ ├── mkldnn/ - │ │ └── ... - │ ├── mix/ - │ │ └── ... - │ ├── intrinsic/ - │ │ └── ... - │ └── openblas/ - │ └── ... - └── refer/ - └── ... -``` - -The definitions of the base classes live in the root directory, which contains the three directories gen, more and refer. Each directory holds one or more kinds of implementation. Every kernel must have a reference implementation, used as the baseline for unit tests; all other implementations are optional. -- gen: code generated with jit, which depends on the xbyak library. This implementation cares most about performance. -- refer: the reference implementation. Every kernel must have a CPU reference implementation, which cares mainly about the correctness of the algorithm logic. -- more: further implementations can be placed here, including mkl, mkldnn, intrinsic, openblas and so on, as well as compositions of existing kernels. - -## Dynamic acquisition - -- `GetAllCandidateFuncs` returns, for a given kernel type, all function implementations that satisfy the requirements. All implementations are guaranteed to produce the same result but differ in speed, so you can benchmark them at runtime for the concrete input attributes and pick the best one manually. -- `GetDefaultBestFunc` returns one default best implementation. It is the result of offline tuning with some general configurations, and covers the best choice in most cases. -- `KernelFuncs::Cache()` returns the default best function and caches the function pointer; when the same attribute appears again, the cached pointer is returned directly, otherwise a new one is created from the attribute. -- `GetReferFunc` returns the most primitive logic of the kernel. It is independent of the kernel's input size and attributes, and there is one and only one implementation, on CPU. It represents the original logic of the kernel, and all other implementations keep their logic consistent with it. - -### Examples - -To call any kernel, you only need to include the header `"lite/backends/x86/jit/kernels.h"`, which is generated automatically at compile time. - -Get the default best function directly from the cache: - -```cpp - using T = float; - jit::seq_pool_attr_t attr(width, jit::SeqPoolType::kSum); - auto seqpool_func = jit::KernelFuncs<jit::SeqPoolTuple<T>, platform::CPUPlace>::Cache().At(attr); - seqpool_func(src_data, dst_data, &attr); -``` - -Run all implementations once and print each implementation type: - -```cpp - using T = float; - jit::seq_pool_attr_t attr(width, jit::SeqPoolType::kSum); - auto funcs = jit::GetAllCandidateFuncsWithTypes<jit::SeqPoolTuple<T>, platform::CPUPlace>(attr); - for (auto f : funcs) { - LOG(INFO) << "Kernel implementation type: " << f.first; - f.second(src_data, dst_data, &attr); - } -``` - -## Tests - -- Logic tests: every implementation must be compared against the refer code and meet the precision requirements, for both the float and double data types. -- Performance tests: the performance of all implementations is compared, including against the final `jit::GetDefaultBestFunc`, whose result must be the best under all conditions. - -# How to add a new kernel - -1. Add `your_key` to `KernelType`. -2. Implement the reference logic. It must be a CPU implementation and must not depend on any third-party library. After implementing it, add `USE_JITKERNEL_REFER(your_key)` in `refer/CMakeLists.txt` to enable the kernel. -3. (optional) Implement more algorithms in the `more` directory; they may depend on third-party libraries such as mkl, intrinsic or mkldnn. -4. (optional) Implement Xbyak-based code generation in the `gen` directory. The jitcode needs its own `JitCodeCreator`, registered under the same `KernelType` as refer. -5. Add a new `KernelTuple`, in one-to-one correspondence with `KernelType`. It bundles all the types: the data type, the attribute type, and the returned function type. You can refer to `SeqPoolTuple`. A newly added attribute type needs a specialization of the `JitCodeKey` method. -6. Add a unit test in `test.cc`, covering at least the `float` and `double` data types, and additional data types such as `int8` where necessary. -7. Add the corresponding performance comparison in `benchmark.cc`. For one kernel, all implementations must be compared, and the implementation obtained by `GetDefaultBestFunc` must always be the fastest. - -# Advantages -- Convenient interfaces and flexible invocation. -- One set of logic can have multiple implementations, relying on multiple third-party libraries without interfering with each other. -- A clear directory structure, avoiding the poor readability caused by many macro definitions in a single file. -- Easy to optimize: an implementation can be tuned for one specific attribute without affecting performance under other attributes. -- Multiple platforms are supported, including Linux, Mac and Windows; at minimum every platform works correctly, and each platform can later be optimized specifically. The framework layer can use a unified interface without caring about the underlying implementation. diff --git a/lite/backends/x86/jit/benchmark.cc b/lite/backends/x86/jit/benchmark.cc deleted file mode 100644 index c49984691e..0000000000 --- a/lite/backends/x86/jit/benchmark.cc +++ /dev/null @@ -1,576 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License.
*/ - -#include -#include -#include -#include -#include "gflags/gflags.h" -#include "glog/logging.h" -#include "lite/backends/x86/jit/kernels.h" -#include "lite/backends/x86/legacy_place.h" -#include "lite/core/tensor.h" - -DEFINE_int32(burning, 10, "Burning times."); -DEFINE_int32(repeat, 3000, "Repeat times."); -DEFINE_int32(max_size, 1000, "The Max size would be tested."); -DEFINE_string(filter, "", "The Benchmark name would be run."); - -class BenchJITKernel { - public: - BenchJITKernel() = default; - virtual ~BenchJITKernel() = default; - virtual void Run() = 0; - virtual const char* Name() = 0; - virtual const char* Dtype() = 0; - virtual const char* Place() = 0; -}; - -static std::vector g_all_benchmarks; - -BenchJITKernel* InsertBenchmark(BenchJITKernel* b) { - g_all_benchmarks.push_back(b); - return b; -} - -#define BENCH_JITKERNEL(name, dtype, place) \ - class BenchJITKernel_##name##_##dtype##_##place##_ : public BenchJITKernel { \ - public: \ - const char* Name() override { return #name; } \ - const char* Dtype() override { return #dtype; } \ - const char* Place() override { return #place; } \ - void Run() override; \ - }; \ - static auto inserted_##name##_##dtype##_##place##_ UNUSED = \ - InsertBenchmark(new BenchJITKernel_##name##_##dtype##_##place##_()); \ - void BenchJITKernel_##name##_##dtype##_##place##_::Run() - -void RUN_ALL_BENCHMARK() { - for (auto p : g_all_benchmarks) { - if (!FLAGS_filter.empty() && FLAGS_filter != p->Name()) { - continue; - } - LOG(INFO) << "Benchmark " << p->Name() << "." << p->Dtype() << "." - << p->Place(); - p->Run(); - } -} - -template -void RandomVec(const int n, - T* a, - const T lower = static_cast(-20.f), - const T upper = static_cast(20.f), - unsigned int seed = 100) { - std::mt19937 rng(seed); - std::uniform_real_distribution uniform_dist(0, 1); - for (int i = 0; i < n; ++i) { - a[i] = static_cast(uniform_dist(rng) * (upper - lower) + lower); - } -} - -std::vector TestSizes() { - std::vector s; - for (int i = 1; i <= FLAGS_max_size; ++i) { - s.push_back(i); - } - return s; -} - -template -struct BenchFunc { - // return this function avg time - // TODO(TJ): clear cache every time - double operator()(const typename KernelTuple::func_type tgt, Args... args) { - for (int i = 0; i < FLAGS_burning; ++i) { - tgt(args...); - } - auto start = paddle::lite::PosixInNsec() * 1e-3; - for (int i = 0; i < FLAGS_repeat; ++i) { - tgt(args...); - } - auto end = paddle::lite::PosixInNsec() * 1e-3; - return static_cast(end - start) / FLAGS_repeat; - } -}; - -namespace jit = paddle::lite::jit; - -template -void BenchAllImpls(const typename KernelTuple::attr_type& attr, Args... 
args) { - BenchFunc benchmark; - std::vector> infos; - auto funcs = jit::GetAllCandidateFuncsWithTypes(attr); - for (auto f : funcs) { - infos.push_back(std::make_pair(f.first, benchmark(f.second, args...))); - } - - // Test result from Get function - auto tgt = jit::KernelFuncs::Cache().At(attr); - if (!tgt) { - LOG(FATAL) << "Target can not be empty!"; - } - infos.push_back(std::make_pair("Target", benchmark(tgt, args...))); - - // print - std::ostringstream loginfos; - loginfos << "Kernel Type " << jit::to_string(KernelTuple::kernel_type) << ": " - << attr << ": "; - for (auto pair : infos) { - loginfos << pair.first << " takes " << pair.second << " us; "; - } - LOG(INFO) << loginfos.str(); -} - -using Tensor = paddle::framework::Tensor; - -template -void BenchKernelXYZN() { - using T = typename KernelTuple::data_type; - for (int d : TestSizes()) { - Tensor x, y, z; - x.Resize({d}); - y.Resize({d}); - z.Resize({d}); - T* x_data = x.mutable_data(PlaceType()); - T* y_data = y.mutable_data(PlaceType()); - T* z_data = z.mutable_data(PlaceType()); - RandomVec(d, x_data); - RandomVec(d, y_data); - BenchAllImpls( - d, x.data(), y.data(), z_data, d); - // test inplace - BenchAllImpls(d, x.data(), z_data, z_data, d); - } -} - -template -void BenchKernelAXYN() { - using T = typename KernelTuple::data_type; - for (int d : TestSizes()) { - const T a = static_cast(3); - Tensor x, y; - x.Resize({d}); - y.Resize({d}); - T* x_data = x.mutable_data(PlaceType()); - T* y_data = y.mutable_data(PlaceType()); - RandomVec(d, x_data); - BenchAllImpls(d, &a, x.data(), y_data, d); - // test inplace - BenchAllImpls(d, &a, x.data(), x_data, d); - } -} - -template -void BenchKernelXRN() { - using T = typename KernelTuple::data_type; - for (int d : TestSizes()) { - Tensor x; - RandomVec(d, x.mutable_data({d}, PlaceType())); - T res; - BenchAllImpls(d, x.data(), &res, d); - } -} - -template -void BenchKernelXYN() { - using T = typename KernelTuple::data_type; - for (int d : TestSizes()) { - Tensor x, y; - x.Resize({d}); - y.Resize({d}); - T* x_data = x.mutable_data(PlaceType()); - T* y_data = y.mutable_data(PlaceType()); - RandomVec(d, x_data); - BenchAllImpls(d, x.data(), y_data, d); - } -} - -template -void BenchKernelLSTM() { - using T = typename KernelTuple::data_type; - for (bool use_peephole : {true, false}) { - for (int d : TestSizes()) { - const jit::lstm_attr_t attr( - d, jit::kVSigmoid, jit::kVTanh, jit::kVTanh, use_peephole); - Tensor x, ct_1, ct, ht, wp, checked; - x.Resize({4 * d}); - ct_1.Resize({d}); - ct.Resize({d}); - ht.Resize({d}); - wp.Resize({3 * d}); - checked.Resize({2 * d}); - auto place = PlaceType(); - RandomVec(x.numel(), x.mutable_data(place), -2.f, 2.f); - RandomVec(wp.numel(), wp.mutable_data(place), -2.f, 2.f); - RandomVec(ct_1.numel(), ct_1.mutable_data(place), -2.f, 2.f); - const T* ct_1_data = ct_1.data(); - const T* wp_data = wp.data(); - T* x_data = x.mutable_data(place); - T* checked_data = checked.mutable_data(place); - T* ct_data = ct.mutable_data(place); - T* ht_data = ht.mutable_data(place); - jit::lstm_t step; - step.gates = x_data; - step.ct_1 = ct_1_data; - step.ct = ct_data; - step.ht = ht_data; - if (use_peephole) { - step.wp = wp_data; - step.checked = checked_data; - } - BenchAllImpls(attr, &step, &attr); - } - } -} - -template -void BenchKernelGRU() { - using T = typename KernelTuple::data_type; - for (int d : TestSizes()) { - const jit::gru_attr_t attr(d, jit::kVSigmoid, jit::kVTanh); - auto place = PlaceType(); - Tensor x, ht_1, ht; - x.Resize({3 * d}); - 
ht_1.Resize({d}); - ht.Resize({d}); - RandomVec(3 * d, x.mutable_data(place), -2.f, 2.f); - RandomVec(d, ht_1.mutable_data(place), -2.f, 2.f); - const T* ht_1_data = ht_1.data(); - T* x_data = x.mutable_data(place); - T* ht_data = ht.mutable_data(place); - jit::gru_t step; - step.gates = x_data; - step.ht_1 = ht_1_data; - step.ht = ht_data; - BenchAllImpls(attr, &step, &attr); - } -} - -template -void BenchKernelSeqPool() { - using T = typename KernelTuple::data_type; - std::vector pool_types = { - jit::SeqPoolType::kSum, jit::SeqPoolType::kAvg, jit::SeqPoolType::kSqrt}; - for (auto type : pool_types) { - for (int w : TestSizes()) { - jit::seq_pool_attr_t attr(w, type); - for (int h : TestSizes()) { - attr.h = h; - Tensor x, y; - x.Resize({h * w}); - y.Resize({w}); - RandomVec(h * w, x.mutable_data(PlaceType()), -2.f, 2.f); - const T* x_data = x.data(); - T* y_data = y.mutable_data(PlaceType()); - BenchAllImpls(attr, x_data, y_data, &attr); - } - } - } -} - -template -void BenchKernelEmbSeqPool() { - using T = typename KernelTuple::data_type; - std::vector pool_types = {jit::SeqPoolType::kSum}; - int64_t tbl_h = 1e4; - for (int tbl_w : {10, 16, 256}) { - Tensor table; - table.Resize({tbl_h, tbl_w}); - RandomVec(tbl_h * tbl_w, table.mutable_data(PlaceType()), -2.f, 2.f); - const T* table_data = table.data(); - for (auto type : pool_types) { - for (int idx_w : {1, 2, 10, 16}) { - for (int idx_h : {1, 2, 9, 13, 16}) { - int64_t out_w = tbl_w * idx_w; - jit::emb_seq_pool_attr_t attr( - tbl_h, tbl_w, idx_h, idx_w, out_w, type); - Tensor idx, out; - idx.Resize({idx_h, idx_w}); - out.Resize({out_w}); - RandomVec(idx_h * idx_w, - idx.mutable_data(PlaceType()), - 0, - tbl_h - 1); - const int64_t* idx_data = idx.data(); - T* o_data = out.mutable_data(PlaceType()); - BenchAllImpls( - attr, table_data, idx_data, o_data, &attr); - } - } - } - } -} - -template -void BenchKernelSgd() { - using T = typename KernelTuple::data_type; - const T lr = 0.1; - auto UnDuplicatedRandomVec = []( - int n, const int64_t lower, const int64_t upper) -> std::vector { - PADDLE_ENFORCE_LE(static_cast(upper - lower), n - 1); - PADDLE_ENFORCE_GT(n, 0); - std::vector all, out; - for (int i = 0; i < n; ++i) { - all.push_back(i); - } - std::random_shuffle(all.begin(), all.end()); - out.insert(out.begin(), all.begin(), all.begin() + n); - return out; - }; - for (int param_h : {1, 1000}) { - for (int grad_w : {1, 2, 8, 16, 30, 256}) { - // only benchmark inplace - Tensor param; - param.Resize({param_h, grad_w}); - T* param_data = param.mutable_data(PlaceType()); - RandomVec(param_h * grad_w, param_data, -2.f, 2.f); - for (int rows_size = 1; rows_size <= std::min(param_h, 10); ++rows_size) { - Tensor grad; - grad.Resize({rows_size, grad_w}); - std::vector rows = - UnDuplicatedRandomVec(rows_size, 0, rows_size - 1); - RandomVec( - rows_size * grad_w, grad.mutable_data(PlaceType()), -2.f, 2.f); - const T* grad_data = grad.data(); - const int64_t* rows_data = rows.data(); - jit::sgd_attr_t attr(param_h, grad_w, rows_size, grad_w, rows_size); - BenchAllImpls( - attr, &lr, param_data, grad_data, rows_data, param_data, &attr); - } - } - } -} - -template -void BenchKernelMatMul() { - using T = typename KernelTuple::data_type; - for (int m : {1, 2, 3, 4}) { - for (int n : TestSizes()) { - for (int k : TestSizes()) { - Tensor a, b, c; - a.Resize({m * k}); - b.Resize({k * n}); - c.Resize({m * n}); - RandomVec(m * k, a.mutable_data(PlaceType()), -2.f, 2.f); - RandomVec(k * n, b.mutable_data(PlaceType()), -2.f, 2.f); - const T* a_data = 
a.data(); - const T* b_data = b.data(); - T* c_data = c.mutable_data(PlaceType()); - const jit::matmul_attr_t attr{m, n, k}; - BenchAllImpls( - attr, a_data, b_data, c_data, &attr); - } - } - } -} - -template -void BenchKernelSoftmax() { - using T = typename KernelTuple::data_type; - for (int bs : {1, 2, 10}) { - for (int n : TestSizes()) { - Tensor x, y; - x.Resize({bs, n}); - y.Resize({bs, n}); - RandomVec(bs * n, x.mutable_data(PlaceType()), -2.f, 2.f); - const T* x_data = x.data(); - T* y_data = y.mutable_data(PlaceType()); - BenchAllImpls(n, x_data, y_data, n, bs, 1); - } - } -} - -template -void BenchKernelLayerNorm() { - using T = typename KernelTuple::data_type; - const T epsilon = 9.99999975e-06; - for (int n : {1, 2, 10}) { - for (int x_dim_0 : {1, 9, 17, 50}) { - int left = n * x_dim_0; - for (int x_dim_1 : TestSizes()) { - int right = x_dim_1; - int sz = left * right; - Tensor x, mean, var, scale, bias, out; - x.Resize({n, x_dim_0, x_dim_1}); - out.Resize({n, x_dim_0, x_dim_1}); - mean.Resize({n, x_dim_0}); - var.Resize({n, x_dim_0}); - scale.Resize({x_dim_1}); - bias.Resize({x_dim_1}); - - RandomVec(sz, x.mutable_data(PlaceType()), -2.f, 2.f); - RandomVec(left, mean.mutable_data(PlaceType()), -2.f, 2.f); - RandomVec(left, var.mutable_data(PlaceType()), -2.f, 2.f); - RandomVec(right, scale.mutable_data(PlaceType()), -2.f, 2.f); - RandomVec(right, bias.mutable_data(PlaceType()), -2.f, 2.f); - - const T* scale_data = scale.data(); - const T* bias_data = bias.data(); - T* x_data = x.data(); - T* mean_data = mean.data(); - T* var_data = var.data(); - T* out_data = out.mutable_data(PlaceType()); - - BenchAllImpls(right, - x_data, - out_data, - mean_data, - var_data, - scale_data, - bias_data, - left, - epsilon, - right); - } - } - } -} - -template -void BenchKernelCRFDecoding() { - using T = typename KernelTuple::data_type; - constexpr int state_trans_base_idx = 2; - for (int seq_len : {1, 11, 17, 50}) { - for (int tag_num : TestSizes()) { - int x_sz = seq_len * tag_num; - int w_sz = (tag_num + state_trans_base_idx) * tag_num; - Tensor x, w, alpha, track; - x.Resize({seq_len, tag_num}); - w.Resize({tag_num + state_trans_base_idx, tag_num}); - alpha.Resize({seq_len, tag_num}); - track.Resize({seq_len, tag_num}); - - RandomVec(x_sz, x.mutable_data(PlaceType()), -2.f, 2.f); - RandomVec(w_sz, w.mutable_data(PlaceType()), -2.f, 2.f); - - const T* x_data = x.data(); - const T* w_data = w.data(); - T* alpha_data = alpha.mutable_data(PlaceType()); - int* track_data = track.mutable_data(PlaceType()); - - BenchAllImpls( - tag_num, seq_len, x_data, w_data, alpha_data, track_data, tag_num); - } - } -} - -template -void BenchKernelVBroadcast() { - using T = typename KernelTuple::data_type; - for (int64_t w : {1, 16, 64, 100, 256}) { - Tensor x; - x.Resize({w}); - RandomVec(w, x.mutable_data(PlaceType())); - const T* x_data = x.data(); - for (int h : TestSizes()) { - Tensor y; - y.Resize({h * w}); - T* y_data = y.mutable_data(PlaceType()); - BenchAllImpls( - w, x_data, y_data, static_cast(h), w); - } - } -} - -#define BenchKernelVMul BenchKernelXYZN -#define BenchKernelVAdd BenchKernelXYZN -#define BenchKernelVAddRelu BenchKernelXYZN -#define BenchKernelVSub BenchKernelXYZN - -#define BenchKernelVScal BenchKernelAXYN -#define BenchKernelVAddBias BenchKernelAXYN - -#define BenchKernelVRelu BenchKernelXYN -#define BenchKernelVIdentity BenchKernelXYN -#define BenchKernelVSquare BenchKernelXYN -#define BenchKernelVExp BenchKernelXYN -#define BenchKernelVSigmoid BenchKernelXYN -#define 
BenchKernelVTanh BenchKernelXYN -#define BenchKernelVCopy BenchKernelXYN - -#define BenchKernelHMax BenchKernelXRN -#define BenchKernelHSum BenchKernelXRN - -#define BenchKernelLSTMCtHt BenchKernelLSTM -#define BenchKernelLSTMC1H1 BenchKernelLSTM - -#define BenchKernelGRUH1 BenchKernelGRU -#define BenchKernelGRUHtPart1 BenchKernelGRU -#define BenchKernelGRUHtPart2 BenchKernelGRU - -using CPUPlace = paddle::lite::fluid::CPUPlace; - -#define BENCH_FP32_CPU(name) \ - BENCH_JITKERNEL(name, FP32, CPU) { \ - BenchKernel##name, CPUPlace>(); \ - } - -// xyzn -BENCH_FP32_CPU(VMul); -BENCH_FP32_CPU(VAdd); -BENCH_FP32_CPU(VAddRelu); -BENCH_FP32_CPU(VSub); - -// axyn -BENCH_FP32_CPU(VScal); -BENCH_FP32_CPU(VAddBias); - -// xyn -BENCH_FP32_CPU(VRelu); -BENCH_FP32_CPU(VIdentity); -BENCH_FP32_CPU(VSquare); -BENCH_FP32_CPU(VExp); -BENCH_FP32_CPU(VSigmoid); -BENCH_FP32_CPU(VTanh); -BENCH_FP32_CPU(VCopy); - -// xrn -BENCH_FP32_CPU(HMax); -BENCH_FP32_CPU(HSum); - -// LSTM -BENCH_FP32_CPU(LSTMCtHt); -BENCH_FP32_CPU(LSTMC1H1); - -// GRU -BENCH_FP32_CPU(GRUH1); -BENCH_FP32_CPU(GRUHtPart1); -BENCH_FP32_CPU(GRUHtPart2); - -BENCH_FP32_CPU(LayerNorm); -BENCH_FP32_CPU(CRFDecoding); - -BENCH_FP32_CPU(SeqPool); -BENCH_FP32_CPU(EmbSeqPool); -BENCH_FP32_CPU(MatMul); -BENCH_FP32_CPU(Softmax); -BENCH_FP32_CPU(Sgd); -BENCH_FP32_CPU(VBroadcast); - -// Benchmark all jit kernels including jitcode, mkl and refer. -// To use this tool, run command: ./benchmark [options...] -// Options: -// --burning: the burning time before count -// --repeat: the repeat times -// --max_size: the max size would be tested -// --filter: the bench name would be run -int main(int argc, char* argv[]) { - gflags::ParseCommandLineFlags(&argc, &argv, true); - google::InitGoogleLogging(argv[0]); - LOG(INFO) << "Burning " << FLAGS_burning << " times, Repeat " << FLAGS_repeat - << " times."; - - RUN_ALL_BENCHMARK(); -} diff --git a/lite/backends/x86/jit/gen/CMakeLists.txt b/lite/backends/x86/jit/gen/CMakeLists.txt deleted file mode 100644 index 99244ea9bd..0000000000 --- a/lite/backends/x86/jit/gen/CMakeLists.txt +++ /dev/null @@ -1,36 +0,0 @@ - -file(GLOB jitcode_cc_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc") - -cc_library(jit_kernel_jitcode SRCS ${jitcode_cc_srcs} DEPS jit_kernel_base xbyak) -set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} xbyak jit_kernel_jitcode PARENT_SCOPE) - -function(USE_JITKERNEL_GEN TARGET) - file(APPEND ${jit_file} "USE_JITKERNEL_GEN(${TARGET});\n") -endfunction() - -# use gen jitcode kernel by name -USE_JITKERNEL_GEN(kMatMul) -USE_JITKERNEL_GEN(kVMul) -USE_JITKERNEL_GEN(kVAdd) -USE_JITKERNEL_GEN(kVSub) -USE_JITKERNEL_GEN(kVAddRelu) -USE_JITKERNEL_GEN(kVScal) -USE_JITKERNEL_GEN(kVAddBias) -USE_JITKERNEL_GEN(kVRelu) -USE_JITKERNEL_GEN(kVSquare) -USE_JITKERNEL_GEN(kVIdentity) -USE_JITKERNEL_GEN(kVExp) -USE_JITKERNEL_GEN(kVSigmoid) -USE_JITKERNEL_GEN(kVTanh) -USE_JITKERNEL_GEN(kLSTMCtHt) -USE_JITKERNEL_GEN(kLSTMC1H1) -USE_JITKERNEL_GEN(kGRUH1) -USE_JITKERNEL_GEN(kGRUHtPart1) -USE_JITKERNEL_GEN(kGRUHtPart2) -USE_JITKERNEL_GEN(kNCHW16CMulNC) -USE_JITKERNEL_GEN(kSeqPool) -USE_JITKERNEL_GEN(kHMax) -USE_JITKERNEL_GEN(kHSum) -USE_JITKERNEL_GEN(kEmbSeqPool) -USE_JITKERNEL_GEN(kSgd) -USE_JITKERNEL_GEN(kVBroadcast) diff --git a/lite/backends/x86/jit/gen/act.cc b/lite/backends/x86/jit/gen/act.cc deleted file mode 100644 index f1f261c199..0000000000 --- a/lite/backends/x86/jit/gen/act.cc +++ /dev/null @@ -1,164 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ - -#include "lite/backends/x86/jit/gen/act.h" -#include -#include "lite/backends/x86/cpu_info.h" -#include "lite/backends/x86/jit/registry.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace gen { - -const float ALIGN32_BEG exp_float_consts[] ALIGN32_END = { - REPEAT_8TIMES(1.f), - REPEAT_8TIMES(2.f), - REPEAT_8TIMES(0.5f), - REPEAT_8TIMES(EXP_HIG), - REPEAT_8TIMES(EXP_LOW), - REPEAT_8TIMES(CEPHES_LOG2EF), - REPEAT_8TIMES(CEPHES_EXP_C1), - REPEAT_8TIMES(CEPHES_EXP_C2), - REPEAT_8TIMES(CEPHES_EXP_P0), - REPEAT_8TIMES(CEPHES_EXP_P1), - REPEAT_8TIMES(CEPHES_EXP_P2), - REPEAT_8TIMES(CEPHES_EXP_P3), - REPEAT_8TIMES(CEPHES_EXP_P4), - REPEAT_8TIMES(CEPHES_EXP_P5), - REPEAT_8TIMES(EXP_MAX_INPUT), - REPEAT_8TIMES(SIGMOID_THRESHOLD_MAX), - REPEAT_8TIMES(SIGMOID_THRESHOLD_MIN)}; - -const int ALIGN32_BEG exp_int_0x7f[] ALIGN32_END = {REPEAT_8TIMES(0x7f)}; -int ALIGN32_BEG g_tmp_mem[16] ALIGN32_END = {0}; - -void VActJitCode::genCode() { - int offset = 0; - for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) { - vmovups(ymm_src, ptr[param1 + offset]); - act(ymm_dst, ymm_src, type_); - vmovups(ptr[param2 + offset], ymm_dst); - offset += sizeof(float) * YMM_FLOAT_BLOCK; - } - int rest = num_ % YMM_FLOAT_BLOCK; - while (rest > 0) { - int block = XMM_FLOAT_BLOCK; - if (rest >= 4) { - block = 4; - vmovups(xmm_src, ptr[param1 + offset]); - } else if (rest >= 2) { - block = 2; - vmovq(xmm_src, ptr[param1 + offset]); - } else { - block = 1; - vmovss(xmm_src, ptr[param1 + offset]); - } - act(xmm_dst, xmm_src, type_); - if (rest >= 4) { - vmovups(ptr[param2 + offset], xmm_dst); - } else if (rest >= 2) { - vmovq(ptr[param2 + offset], xmm_dst); - } else { - vmovss(ptr[param2 + offset], xmm_dst); - } - offset += sizeof(float) * block; - rest -= block; - } - ret(); -} - -#define DECLARE_ACT_CREATOR(name) \ - class name##Creator : public JitCodeCreator { \ - public: \ - bool CanBeUsed(const int& attr) const override; \ - size_t CodeSize(const int& d) const override; \ - std::unique_ptr CreateJitCode(const int& attr) const override { \ - return make_unique(attr, CodeSize(attr)); \ - } \ - } - -DECLARE_ACT_CREATOR(VRelu); -DECLARE_ACT_CREATOR(VSquare); -DECLARE_ACT_CREATOR(VIdentity); -DECLARE_ACT_CREATOR(VExp); -DECLARE_ACT_CREATOR(VSigmoid); -DECLARE_ACT_CREATOR(VTanh); - -// TODO(TJ): tuning use me -bool VReluCreator::CanBeUsed(const int& d) const { - return x86::MayIUse(x86::avx); -} - -bool VSquareCreator::CanBeUsed(const int& d) const { - return x86::MayIUse(x86::avx); -} - -bool VIdentityCreator::CanBeUsed(const int& d) const { - return x86::MayIUse(x86::avx); -} - -bool VExpCreator::CanBeUsed(const int& d) const { - return x86::MayIUse(x86::avx) && d < 32; -} - -bool VSigmoidCreator::CanBeUsed(const int& d) const { - return x86::MayIUse(x86::avx); -} - -bool VTanhCreator::CanBeUsed(const int& d) const { - return x86::MayIUse(x86::avx); -} - -size_t VReluCreator::CodeSize(const int& d) const { - return 96 /* init size */ + - (d / 
YMM_FLOAT_BLOCK + 3) * 4 /* instructions */ * - 8 /* average bytes for each instruction */; -} - -size_t VSquareCreator::CodeSize(const int& d) const { - return 96 + (d / YMM_FLOAT_BLOCK + 3) * 4 * 8; -} - -size_t VIdentityCreator::CodeSize(const int& d) const { - return 96 + (d / YMM_FLOAT_BLOCK + 3) * 4 * 8; -} - -size_t VExpCreator::CodeSize(const int& d) const { - return 96 + (d / YMM_FLOAT_BLOCK + 3) * 70 * 8; -} - -size_t VSigmoidCreator::CodeSize(const int& d) const { - return 96 + (d / YMM_FLOAT_BLOCK + 3) * 82 * 8; -} - -size_t VTanhCreator::CodeSize(const int& d) const { - return 96 + (d / YMM_FLOAT_BLOCK + 3) * 84 * 8; -} - -#undef DECLARE_ACT_CREATOR - -} // namespace gen -} // namespace jit -} // namespace lite -} // namespace paddle - -namespace gen = paddle::lite::jit::gen; - -REGISTER_JITKERNEL_GEN(kVRelu, gen::VReluCreator); -REGISTER_JITKERNEL_GEN(kVSquare, gen::VSquareCreator); -REGISTER_JITKERNEL_GEN(kVIdentity, gen::VIdentityCreator); -REGISTER_JITKERNEL_GEN(kVExp, gen::VExpCreator); -REGISTER_JITKERNEL_GEN(kVSigmoid, gen::VSigmoidCreator); -REGISTER_JITKERNEL_GEN(kVTanh, gen::VTanhCreator); diff --git a/lite/backends/x86/jit/gen/act.h b/lite/backends/x86/jit/gen/act.h deleted file mode 100644 index 6366cff3c8..0000000000 --- a/lite/backends/x86/jit/gen/act.h +++ /dev/null @@ -1,347 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ - -#pragma once - -#include -#include -#include "lite/backends/x86/jit/gen/jitcode.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace gen { - -extern const float exp_float_consts[]; -extern const int exp_int_0x7f[]; -extern int g_tmp_mem[]; - -#define EXP_HIG 88.3762626647949f -#define EXP_LOW -88.3762626647949f -#define CEPHES_LOG2EF 1.44269504088896341 -#define CEPHES_EXP_C1 0.693359375 -#define CEPHES_EXP_C2 -2.12194440e-4 -#define CEPHES_EXP_P0 1.9875691500E-4 -#define CEPHES_EXP_P1 1.3981999507E-3 -#define CEPHES_EXP_P2 8.3334519073E-3 -#define CEPHES_EXP_P3 4.1665795894E-2 -#define CEPHES_EXP_P4 1.6666665459E-1 -#define CEPHES_EXP_P5 5.0000001201E-1 - -#define REPEAT_8TIMES(val) val, val, val, val, val, val, val, val - -#define OFFSET_EXP_ONE 0 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_TWO 1 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_0P5 2 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_HIG 3 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_LOW 4 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_LOG2EF 5 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_C1 6 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_C2 7 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_P0 8 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_P1 9 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_P2 10 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_P3 11 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_P4 12 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_P5 13 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_EXP_MAX_INPUT 14 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_SIGMOID_MAX 15 * YMM_FLOAT_BLOCK * sizeof(float) -#define OFFSET_SIGMOID_MIN 16 * YMM_FLOAT_BLOCK * sizeof(float) - -class VActFunc : public JitCode { - public: - explicit VActFunc(size_t code_size, void* code_ptr) - : JitCode(code_size, code_ptr) {} - virtual void genCode() = 0; - - protected: - // compute RELU with ymm, xmm - template - void relu_jmm(JMM& dst, JMM& src, int zero_idx = 15) { // NOLINT - JMM zero = JMM(zero_idx); - vxorps(zero, zero, zero); - vmaxps(dst, src, zero); - } - - // compute SQUARE with ymm, xmm - template - void square_jmm(JMM& dst, JMM& src) { // NOLINT - vmulps(dst, src, src); - } - - // compute EXP with ymm, xmm - template - void exp_jmm(JMM& dst, // NOLINT - JMM& src, // NOLINT - int src_idx = 11, - int fx_idx = 12, // NOLINT - int fy_idx = 13, - int mask_idx = 14, - int tmp_idx = 15) { - using namespace x86; // NOLINT - // check all idx can not equal - JMM jmm_src = JMM(src_idx); - JMM jmm_fx = JMM(fx_idx); - JMM jmm_fy = JMM(fy_idx); - JMM jmm_mask = JMM(mask_idx); - JMM jmm_tmp = JMM(tmp_idx); - reg64_t reg_ptr_global = rax; - push(reg_ptr_global); - vmovaps(jmm_src, src); - mov(reg_ptr_global, reinterpret_cast(exp_float_consts)); - vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_HIG]); - vminps(jmm_src, jmm_src, jmm_tmp); - vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOW]); - vmaxps(jmm_src, jmm_src, jmm_tmp); - // express exp(x) as exp(g + n*log(2)) - vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOG2EF]); - vmulps(jmm_fx, jmm_src, jmm_tmp); - vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_0P5]); - vaddps(jmm_fx, jmm_fx, jmm_tmp); - vroundps(jmm_fy, jmm_fx, 0x01); - // if greater, substract 1 - vcmpgtps(jmm_mask, jmm_fy, jmm_fx); - vmovaps(jmm_tmp, ptr[reg_ptr_global]); - vandps(jmm_mask, jmm_mask, jmm_tmp); - vsubps(jmm_fx, jmm_fy, jmm_mask); - vmovaps(jmm_tmp, ptr[reg_ptr_global + 
OFFSET_EXP_C1]); - vmulps(jmm_fy, jmm_fx, jmm_tmp); - vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C2]); - JMM ymm_z = JMM(jmm_mask.getIdx()); - vmulps(ymm_z, jmm_fx, jmm_tmp); - vsubps(jmm_src, jmm_src, jmm_fy); - vsubps(jmm_src, jmm_src, ymm_z); - vmulps(ymm_z, jmm_src, jmm_src); - vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P0]); - vmulps(dst, jmm_src, jmm_tmp); - for (size_t i = OFFSET_EXP_P1; i < OFFSET_EXP_P5; - i += (YMM_FLOAT_BLOCK * sizeof(float))) { - vmovaps(jmm_tmp, ptr[reg_ptr_global + i]); // P1~P4 - vaddps(dst, dst, jmm_tmp); - vmulps(dst, dst, jmm_src); - } - vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P5]); - vaddps(dst, dst, jmm_tmp); - vmulps(dst, dst, ymm_z); - vaddps(dst, dst, jmm_src); - vmovaps(jmm_tmp, ptr[reg_ptr_global]); - vaddps(dst, dst, jmm_tmp); - // build 2^n - JMM ymm_int = jmm_fx; - vcvttps2dq(ymm_int, jmm_fx); - mov(reg_ptr_global, reinterpret_cast(exp_int_0x7f)); - vmovdqa(jmm_tmp, ptr[reg_ptr_global]); - if (MayIUse(avx2) || std::is_same::value) { - vpaddd(ymm_int, ymm_int, jmm_tmp); - vpslld(ymm_int, ymm_int, 23); - } else if (MayIUse(avx)) { - xmm_t xtmp1 = xmm_t(ymm_int.getIdx()); - xmm_t xtmp2 = xmm_t(jmm_tmp.getIdx()); - reg64_t reg_ptr_tmp = reg_ptr_global; - mov(reg_ptr_tmp, reinterpret_cast(g_tmp_mem)); - vmovdqa(ptr[reg_ptr_tmp], ymm_int); - vmovdqa(ptr[reg_ptr_tmp + YMM_FLOAT_BLOCK * sizeof(float)], jmm_tmp); - vpaddd(xtmp1, xtmp1, xtmp2); - vpslld(xtmp1, xtmp1, 23); - vmovdqa(ptr[reg_ptr_tmp], xtmp1); - // next 128bits - vmovdqa(xtmp1, ptr[reg_ptr_tmp + XMM_FLOAT_BLOCK * sizeof(float)]); - vmovdqa(xtmp2, - ptr[reg_ptr_tmp + - (YMM_FLOAT_BLOCK + XMM_FLOAT_BLOCK) * sizeof(float)]); - vpaddd(xtmp1, xtmp1, xtmp2); - vpslld(xtmp1, xtmp1, 23); - vmovdqa(ptr[reg_ptr_tmp + XMM_FLOAT_BLOCK * sizeof(float)], xtmp1); - // load out - vmovdqa(ymm_int, ptr[reg_ptr_tmp]); - } - vmulps(dst, dst, ymm_int); - pop(reg_ptr_global); - } - - // compute SIGMOID with ymm, xmm - template - void sigmoid_jmm(JMM& dst, // NOLINT - JMM& src, // NOLINT - int src_idx = 11, // NOLINT - int fx_idx = 12, - int fy_idx = 13, - int mask_idx = 14, - int tmp_idx = 15) { - // y = 1 / (1 + e^-x) - JMM jmm_tmp = JMM(tmp_idx); - JMM jmm_src = JMM(src_idx); - reg64_t reg_ptr_global = rax; - push(reg_ptr_global); - vmovaps(jmm_src, src); - mov(reg_ptr_global, reinterpret_cast(exp_float_consts)); - vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_SIGMOID_MAX]); - vminps(jmm_src, jmm_src, jmm_tmp); - vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_SIGMOID_MIN]); - vmaxps(jmm_src, jmm_src, jmm_tmp); - vxorps(jmm_tmp, jmm_tmp, jmm_tmp); - vsubps(jmm_src, jmm_tmp, jmm_src); - exp_jmm(dst, jmm_src, src_idx, fx_idx, fy_idx, mask_idx, tmp_idx); - vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]); - vaddps(dst, dst, jmm_tmp); - vdivps(dst, jmm_tmp, dst); - pop(reg_ptr_global); - } - - // compute TANH with ymm, xmm - template - void tanh_jmm(JMM& dst, // NOLINT - JMM& src, // NOLINT - int src_idx = 11, // NOLINT - int fx_idx = 12, - int fy_idx = 13, - int mask_idx = 14, - int tmp_idx = 15) { - // y = 2 / (1 + e^(-2x)) - 1 - JMM jmm_src = JMM(src_idx); - JMM jmm_tmp = JMM(tmp_idx); - JMM jmm_zero = JMM(mask_idx); - reg64_t reg_ptr_global = rax; - push(reg_ptr_global); - vmovaps(jmm_src, src); - mov(reg_ptr_global, reinterpret_cast(exp_float_consts)); - vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]); - vxorps(jmm_zero, jmm_zero, jmm_zero); - vsubps(jmm_tmp, jmm_zero, jmm_tmp); - vmulps(jmm_src, jmm_src, jmm_tmp); - exp_jmm(dst, jmm_src, src_idx, fx_idx, fy_idx, mask_idx, 
tmp_idx); - vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]); - vaddps(dst, dst, jmm_tmp); - vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]); - vdivps(dst, jmm_tmp, dst); - vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]); - vsubps(dst, dst, jmm_tmp); - pop(reg_ptr_global); - } - - // compute IDENTITY with ymm, xmm - template - void identity_jmm(JMM& dst, JMM& src, int zero_idx) { // NOLINT - JMM zero = JMM(zero_idx); - vxorps(zero, zero, zero); - vaddps(dst, src, zero); - // TODO(TJ): use below - // dst.setIdx(src.getIdx()); - } - - template - void act(JMM& dst, JMM& src, operand_type type) { // NOLINT - // use 11~15 - switch (type) { - case operand_type::RELU: - relu_jmm(dst, src, 15); - break; - case operand_type::SQUARE: - square_jmm(dst, src); - break; - case operand_type::EXP: - exp_jmm(dst, src, 11, 12, 13, 14, 15); - break; - case operand_type::SIGMOID: - sigmoid_jmm(dst, src, 11, 12, 13, 14, 15); - break; - case operand_type::TANH: - tanh_jmm(dst, src, 11, 12, 13, 14, 15); - break; - case operand_type::IDENTITY: - identity_jmm(dst, src, 15); - break; - default: - LOG(FATAL) << "Do not support this operand type: " << type; - break; - } - } -}; - -class VActJitCode : public VActFunc { - public: - explicit VActJitCode(int d, - operand_type type, - size_t code_size, - void* code_ptr = nullptr) - : VActFunc(code_size, code_ptr), num_(d), type_(type) { - if (!(type_ == operand_type::RELU || type_ == operand_type::EXP || - type_ == operand_type::SIGMOID || type_ == operand_type::TANH || - type_ == operand_type::IDENTITY || type_ == operand_type::SQUARE)) { - LOG(FATAL) << "Do not support this operand type: " << type_; - } - this->genCode(); - } - - std::string name() const override { - std::string base = "VActJitCode"; - switch (type_) { - case operand_type::RELU: - base += "_Relu"; - break; - case operand_type::SQUARE: - base += "_Square"; - break; - case operand_type::EXP: - base += "_Exp"; - break; - case operand_type::SIGMOID: - base += "_Sigmoid"; - break; - case operand_type::TANH: - base += "_Tanh"; - break; - case operand_type::IDENTITY: - base += "_Identity"; - break; - default: - break; - } - return base; - } - void genCode() override; - - protected: - int num_; - operand_type type_; - reg64_t param1{abi_param1}; - reg64_t param2{abi_param2}; - - xmm_t xmm_src = xmm_t(0); - ymm_t ymm_src = ymm_t(0); - - xmm_t xmm_dst = xmm_t(1); - ymm_t ymm_dst = ymm_t(1); -}; - -#define DECLARE_ACT_JITCODE(name, op_type) \ - class name##JitCode : public VActJitCode { \ - public: \ - explicit name##JitCode(int d, size_t code_size, void* code_ptr = nullptr) \ - : VActJitCode(d, op_type, code_size, code_ptr) {} \ - }; - -DECLARE_ACT_JITCODE(VRelu, operand_type::RELU); -DECLARE_ACT_JITCODE(VSquare, operand_type::SQUARE); -DECLARE_ACT_JITCODE(VIdentity, operand_type::IDENTITY); -DECLARE_ACT_JITCODE(VExp, operand_type::EXP); -DECLARE_ACT_JITCODE(VSigmoid, operand_type::SIGMOID); -DECLARE_ACT_JITCODE(VTanh, operand_type::TANH); - -#undef DECLARE_ACT_JITCODE - -} // namespace gen -} // namespace jit -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/jit/gen/blas.cc b/lite/backends/x86/jit/gen/blas.cc deleted file mode 100644 index 0bddea6ace..0000000000 --- a/lite/backends/x86/jit/gen/blas.cc +++ /dev/null @@ -1,190 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
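exp_jmm above encodes the classic Cephes-style range reduction entirely in AVX registers, and sigmoid_jmm and tanh_jmm reuse it through the identities 1/(1+e^-x) and 2/(1+e^-2x)-1. A scalar C++ sketch of the same approximation, using the CEPHES_* constants from this header, may help when auditing the generated assembly (std::ldexp stands in for the vpaddd/vpslld exponent-building trick):

#include <cmath>
#include <cstdio>

// Scalar model of exp_jmm: clamp x to [EXP_LOW, EXP_HIG], write
// exp(x) = 2^n * exp(g) with n = round(x * log2(e)), then evaluate a
// degree-5 polynomial for exp(g).
float CephesExp(float x) {
  x = std::fmin(x, 88.3762626647949f);   // EXP_HIG
  x = std::fmax(x, -88.3762626647949f);  // EXP_LOW
  float fx = x * 1.44269504088896341f + 0.5f;  // x * log2(e), biased by 0.5
  float n = std::floor(fx);                    // vroundps toward -infinity
  // g = x - n * ln(2); ln(2) is split into C1 + C2 for extra precision
  float g = x - n * 0.693359375f + n * 2.12194440e-4f;
  float p = 1.9875691500e-4f;    // P0
  p = p * g + 1.3981999507e-3f;  // P1
  p = p * g + 8.3334519073e-3f;  // P2
  p = p * g + 4.1665795894e-2f;  // P3
  p = p * g + 1.6666665459e-1f;  // P4
  p = p * g + 5.0000001201e-1f;  // P5
  float eg = p * g * g + g + 1.0f;
  // The JIT builds 2^n by adding 0x7f to n and shifting it into the float
  // exponent field (vpaddd + vpslld 23); std::ldexp is the portable version.
  return std::ldexp(eg, static_cast<int>(n));
}

int main() {
  for (float x : {-1.0f, 0.0f, 1.0f, 10.0f}) {
    std::printf("x=%5.1f approx=%g libm=%g\n", x, CephesExp(x), std::exp(x));
  }
  return 0;
}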
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#include "lite/backends/x86/jit/gen/blas.h"
-#include
-#include "lite/backends/x86/jit/registry.h"
-
-namespace paddle {
-namespace lite {
-namespace jit {
-namespace gen {
-
-void VXXJitCode::genCode() {
-  // No stack frame is needed here, and the AVX512 registers need not be
-  // saved because this kernel does not use AVX512.
-  int offset = 0;
-  if (with_relu_) {
-    vxorps(ymm_zero, ymm_zero, ymm_zero);
-  }
-  if (scalar_index_ == 1) {
-    vbroadcastss(ymm_src1, ptr[param1]);
-  } else if (scalar_index_ == 2) {
-    vbroadcastss(ymm_src2, ptr[param2]);
-  }
-  for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) {
-    if (scalar_index_ != 1) {
-      vmovups(ymm_src1, ptr[param1 + offset]);
-    }
-    if (scalar_index_ != 2) {
-      vmovups(ymm_src2, ptr[param2 + offset]);
-    }
-    if (type_ == operand_type::MUL) {
-      vmulps(ymm_dst, ymm_src1, ymm_src2);
-    } else if (type_ == operand_type::ADD) {
-      vaddps(ymm_dst, ymm_src1, ymm_src2);
-    } else if (type_ == operand_type::SUB) {
-      vsubps(ymm_dst, ymm_src1, ymm_src2);
-    }
-    if (with_relu_) {
-      vmaxps(ymm_dst, ymm_zero, ymm_dst);
-    }
-    vmovups(ptr[param3 + offset], ymm_dst);
-    offset += sizeof(float) * YMM_FLOAT_BLOCK;
-  }
-  int rest = num_ % YMM_FLOAT_BLOCK;
-  while (rest > 0) {
-    int block = XMM_FLOAT_BLOCK;
-    if (rest >= 4) {
-      block = 4;
-      if (scalar_index_ != 1) {
-        vmovups(xmm_src1, ptr[param1 + offset]);
-      }
-      if (scalar_index_ != 2) {
-        vmovups(xmm_src2, ptr[param2 + offset]);
-      }
-    } else if (rest >= 2) {
-      block = 2;
-      if (scalar_index_ != 1) {
-        vmovq(xmm_src1, ptr[param1 + offset]);
-      }
-      if (scalar_index_ != 2) {
-        vmovq(xmm_src2, ptr[param2 + offset]);
-      }
-    } else {
-      block = 1;
-      if (scalar_index_ != 1) {
-        vmovss(xmm_src1, ptr[param1 + offset]);
-      }
-      if (scalar_index_ != 2) {
-        vmovss(xmm_src2, ptr[param2 + offset]);
-      }
-    }
-    switch (type_) {
-      case operand_type::MUL:
-        vmulps(xmm_dst, xmm_src1, xmm_src2);
-        break;
-      case operand_type::ADD:
-        vaddps(xmm_dst, xmm_src1, xmm_src2);
-        break;
-      case operand_type::SUB:
-        vsubps(xmm_dst, xmm_src1, xmm_src2);
-        break;
-      default:
-        break;
-    }
-    if (with_relu_) {
-      vmaxps(xmm_dst, xmm_zero, xmm_dst);
-    }
-    if (rest >= 4) {
-      vmovups(ptr[param3 + offset], xmm_dst);
-    } else if (rest >= 2) {
-      vmovq(ptr[param3 + offset], xmm_dst);
-    } else {
-      vmovss(ptr[param3 + offset], xmm_dst);
-    }
-    offset += sizeof(float) * block;
-    rest -= block;
-  }
-  ret();
-}
-
-void NCHW16CMulNCJitCode::genCode() {
-  // RDI is ptr x_input
-  // RSI is ptr y_input
-  // RDX is ptr output
-  // RCX is height
-  // r8 is width
-
-  push(rbx);
-
-  xor_(rax, rax);
-  xor_(r10, r10);
-  vmovups(zmm3, ptr[rsi]);
-
-  L("h_loop");
-  xor_(rbx, rbx);
-  L("w_loop");
-  vmovups(zmm2, ptr[rdi + rax]);
-  vmulps(zmm1, zmm2, zmm3);
-  vmovups(ptr[rdx + rax], zmm1);
-  add(rax, 64);
-  inc(rbx);
-  cmp(r8, rbx);
-  jnz("w_loop");
-  inc(r10);
-  cmp(r10, rcx);
-  jnz("h_loop");
-
-  pop(rbx);
-  ret();
-}
-
-class NCHW16CMulNCCreator : public JitCodeCreator<int> {
- public:
-  bool CanBeUsed(const int& attr) const override {
-    return x86::MayIUse(x86::avx512f);
-  }
-  size_t CodeSize(const int& d) const override { return 256 * 1024; }
-  std::unique_ptr<GenBase> CreateJitCode(const int& attr) const override {
return make_unique(attr, CodeSize(attr)); - } -}; - -#define DECLARE_BLAS_CREATOR(name) \ - class name##Creator : public JitCodeCreator { \ - public: \ - bool CanBeUsed(const int& attr) const override { \ - return x86::MayIUse(x86::avx) && attr <= 1024; \ - } \ - size_t CodeSize(const int& d) const override { \ - return 96 + d / YMM_FLOAT_BLOCK * 4 * 8; \ - } \ - std::unique_ptr CreateJitCode(const int& attr) const override { \ - return make_unique(attr, CodeSize(attr)); \ - } \ - } - -DECLARE_BLAS_CREATOR(VMul); -DECLARE_BLAS_CREATOR(VAdd); -DECLARE_BLAS_CREATOR(VSub); -DECLARE_BLAS_CREATOR(VAddRelu); -DECLARE_BLAS_CREATOR(VScal); -DECLARE_BLAS_CREATOR(VAddBias); - -#undef DECLARE_BLAS_CREATOR - -} // namespace gen -} // namespace jit -} // namespace lite -} // namespace paddle - -namespace gen = paddle::lite::jit::gen; - -REGISTER_JITKERNEL_GEN(kVMul, gen::VMulCreator); -REGISTER_JITKERNEL_GEN(kVAdd, gen::VAddCreator); -REGISTER_JITKERNEL_GEN(kVSub, gen::VSubCreator); -REGISTER_JITKERNEL_GEN(kVAddRelu, gen::VAddReluCreator); -REGISTER_JITKERNEL_GEN(kVScal, gen::VScalCreator); -REGISTER_JITKERNEL_GEN(kVAddBias, gen::VAddBiasCreator); -REGISTER_JITKERNEL_GEN(kNCHW16CMulNC, gen::NCHW16CMulNCCreator); diff --git a/lite/backends/x86/jit/gen/blas.h b/lite/backends/x86/jit/gen/blas.h deleted file mode 100644 index 39920195b2..0000000000 --- a/lite/backends/x86/jit/gen/blas.h +++ /dev/null @@ -1,125 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ - -#pragma once - -#include -#include "glog/logging.h" -#include "lite/backends/x86/jit/gen/jitcode.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace gen { - -// function: vec = Operand(vec(or scalar), vec(or scalar)) (maybe with relu) -class VXXJitCode : public JitCode { - public: - explicit VXXJitCode(int d, - operand_type type, - int scalar_index, - bool with_relu, - size_t code_size = 256 * 1024, - void* code_ptr = nullptr) - : JitCode(code_size, code_ptr), - num_(d), - type_(type), - scalar_index_(scalar_index), - with_relu_(with_relu) { - if (!(type_ == operand_type::MUL || type_ == operand_type::ADD || - type_ == operand_type::SUB)) { - LOG(FATAL) << "Do not support this operand type: " << type_; - } - this->genCode(); - } - - std::string name() const override { - std::string base = "VXXJitCode"; - if (scalar_index_ == 1) { - base += "_Scalar"; - } else { - base += "_Vec"; - } - if (type_ == operand_type::MUL) { - base += "_Mul"; - } else if (type_ == operand_type::ADD) { - base += "_Add"; - } else if (type_ == operand_type::SUB) { - base += "_SUB"; - } - if (scalar_index_ == 2) { - base += "_Scalar"; - } else { - base += "_Vec"; - } - base += (with_relu_ ? 
"_Relu" : ""); - base += "_D" + std::to_string(num_); - return base; - } - void genCode() override; - - private: - int num_; - operand_type type_; - int scalar_index_; - bool with_relu_; - reg64_t param1{abi_param1}; - reg64_t param2{abi_param2}; - reg64_t param3{abi_param3}; - - xmm_t xmm_src1 = xmm_t(0); - xmm_t xmm_src2 = xmm_t(1); - xmm_t xmm_dst = xmm_t(2); - xmm_t xmm_zero = xmm_t(3); - - ymm_t ymm_src1 = ymm_t(0); - ymm_t ymm_src2 = ymm_t(1); - ymm_t ymm_dst = ymm_t(2); - ymm_t ymm_zero = ymm_t(3); -}; - -#define DECLARE_BLAS_JITCODE(name, op_type, scalar_idx, with_relu) \ - class name##JitCode : public VXXJitCode { \ - public: \ - explicit name##JitCode(int d, size_t code_size, void* code_ptr = nullptr) \ - : VXXJitCode(d, op_type, scalar_idx, with_relu, code_size, code_ptr) { \ - } \ - }; - -DECLARE_BLAS_JITCODE(VMul, operand_type::MUL, 0, false); -DECLARE_BLAS_JITCODE(VAdd, operand_type::ADD, 0, false); -DECLARE_BLAS_JITCODE(VSub, operand_type::SUB, 0, false); -DECLARE_BLAS_JITCODE(VAddRelu, operand_type::ADD, 0, true); -DECLARE_BLAS_JITCODE(VScal, operand_type::MUL, 1, false); -DECLARE_BLAS_JITCODE(VAddBias, operand_type::ADD, 1, false); - -#undef DECLARE_BLAS_JITCODE - -// nChw16c = nChw16c .* NC -class NCHW16CMulNCJitCode : public JitCode { - public: - DECLARE_JIT_CODE(NCHW16CMulNCJitCode); - explicit NCHW16CMulNCJitCode(int d /*unused*/, - size_t code_size, - void* code_ptr = nullptr) - : JitCode(code_size, code_ptr) { - this->genCode(); - } - void genCode() override; -}; - -} // namespace gen -} // namespace jit -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/jit/gen/embseqpool.cc b/lite/backends/x86/jit/gen/embseqpool.cc deleted file mode 100644 index 2ff6894383..0000000000 --- a/lite/backends/x86/jit/gen/embseqpool.cc +++ /dev/null @@ -1,148 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ - -#include "lite/backends/x86/jit/gen/embseqpool.h" -#include // offsetof -#include -#include -#include "lite/backends/x86/jit/gen/act.h" // for exp_float_consts ones -#include "lite/backends/x86/jit/registry.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace gen { - -void EmbSeqPoolJitCode::genCode() { - preCode(); - constexpr int block = YMM_FLOAT_BLOCK; - constexpr int max_num_regs = 8; - const int num_block = tbl_w_ / block; - const int num_groups = num_block / max_num_regs; - const size_t block_size = sizeof(float) * block; - std::vector groups(num_groups, max_num_regs); - int rest_num_regs = num_block % max_num_regs; - if (rest_num_regs > 0) { - groups.push_back(rest_num_regs); - } - - // protect param_dst - mov(reg_ptr_param_dst, param_dst); - mov(reg_idx_width_in_byte, - qword[param_attr + offsetof(emb_seq_pool_attr_t, index_width)]); - mov(reg_idx_height, - qword[param_attr + offsetof(emb_seq_pool_attr_t, index_height)]); - mov(rax, sizeof(int64_t)); - mul(reg_idx_width_in_byte); - mov(reg_idx_width_in_byte, rax); - const size_t tbl_width_in_byte = sizeof(float) * tbl_w_; - int acc_num_regs = 0; - for (int num_regs : groups) { - Label l_next_idx_w, l_next_idx_h, l_save_now; - xor_(reg_idx_w_i_in_byte, reg_idx_w_i_in_byte); - mov(reg_ptr_dst_i, reg_ptr_param_dst); - add(reg_ptr_dst_i, acc_num_regs * block_size); - - L(l_next_idx_w); - { - // h == 0 - mov(reg_ptr_idx_i, param_idx); - add(reg_ptr_idx_i, reg_idx_w_i_in_byte); - mov(reg_idx, qword[reg_ptr_idx_i]); - mov(rax, tbl_width_in_byte); - mul(reg_idx); - mov(reg_ptr_tbl_i, rax); // reg is offset now - add(reg_ptr_tbl_i, param_tbl); // reg is ptr_i now - size_t w_offset = 0; - for (int reg_i = 0; reg_i < num_regs; ++reg_i) { - vmovups(ymm_t(reg_i + num_regs), ptr[reg_ptr_tbl_i + w_offset]); - w_offset += block_size; - } - add(reg_ptr_idx_i, reg_idx_width_in_byte); - - // end condition of idx h - mov(reg_idx_h_end, reg_idx_height); - mov(rax, reg_idx_width_in_byte); - mul(reg_idx_h_end); - mov(reg_idx_h_end, rax); - add(reg_idx_h_end, reg_idx_w_i_in_byte); - add(reg_idx_h_end, param_idx); - - cmp(reg_ptr_idx_i, reg_idx_h_end); - jge(l_save_now, T_NEAR); - L(l_next_idx_h); - { - mov(reg_idx, qword[reg_ptr_idx_i]); - mov(reg_ptr_tbl_i, reg_idx); - mov(rax, tbl_width_in_byte); - mul(reg_idx); - mov(reg_ptr_tbl_i, rax); - add(reg_ptr_tbl_i, param_tbl); - size_t w_offset = 0; - for (int reg_i = 0; reg_i < num_regs; ++reg_i) { - vmovups(ymm_t(reg_i), ptr[reg_ptr_tbl_i + w_offset]); - vaddps( - ymm_t(reg_i + num_regs), ymm_t(reg_i + num_regs), ymm_t(reg_i)); - w_offset += block_size; - } - add(reg_ptr_idx_i, reg_idx_width_in_byte); - cmp(reg_ptr_idx_i, reg_idx_h_end); - jl(l_next_idx_h, T_NEAR); - } // end of idx h - L(l_save_now); - // avg or sqrt here, if needed - w_offset = 0; - for (int reg_i = 0; reg_i < num_regs; ++reg_i) { - vmovups(ptr[reg_ptr_dst_i + w_offset], ymm_t(reg_i + num_regs)); - w_offset += block_size; - } - add(reg_ptr_dst_i, tbl_width_in_byte); - add(reg_idx_w_i_in_byte, sizeof(int64_t)); - cmp(reg_idx_w_i_in_byte, reg_idx_width_in_byte); - jl(l_next_idx_w, T_NEAR); - } // end of idx w - - acc_num_regs += num_regs; - add(param_tbl, num_regs * block_size); // do not use acc_num_regs - } // end of groups - postCode(); -} - -class EmbSeqPoolCreator : public JitCodeCreator { - public: - bool CanBeUsed(const emb_seq_pool_attr_t& attr) const override { - return x86::MayIUse(x86::avx) && attr.table_width % YMM_FLOAT_BLOCK == 0; - } - size_t CodeSize(const emb_seq_pool_attr_t& attr) const override { 
- return 96 + (attr.table_width / YMM_FLOAT_BLOCK) * 96 * 8; - } - std::unique_ptr CreateJitCode( - const emb_seq_pool_attr_t& attr) const override { - PADDLE_ENFORCE_GT(attr.table_height, 0); - PADDLE_ENFORCE_GT(attr.table_width, 0); - PADDLE_ENFORCE_GT(attr.index_height, 0); - PADDLE_ENFORCE_GT(attr.index_width, 0); - PADDLE_ENFORCE_GT(attr.out_width, 0); - return make_unique(attr, CodeSize(attr)); - } -}; - -} // namespace gen -} // namespace jit -} // namespace lite -} // namespace paddle - -namespace gen = paddle::lite::jit::gen; - -REGISTER_JITKERNEL_GEN(kEmbSeqPool, gen::EmbSeqPoolCreator); diff --git a/lite/backends/x86/jit/gen/embseqpool.h b/lite/backends/x86/jit/gen/embseqpool.h deleted file mode 100644 index 7cae76f9dd..0000000000 --- a/lite/backends/x86/jit/gen/embseqpool.h +++ /dev/null @@ -1,81 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ - -#pragma once - -#include -#include -#include "lite/backends/x86/jit/gen/jitcode.h" -#include "lite/utils/paddle_enforce.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace gen { - -class EmbSeqPoolJitCode : public JitCode { - public: - explicit EmbSeqPoolJitCode(const emb_seq_pool_attr_t& attr, - size_t code_size = 256 * 1024, - void* code_ptr = nullptr) - : JitCode(code_size, code_ptr), - tbl_w_(attr.table_width), - type_(attr.pool_type) { - if (type_ != SeqPoolType::kSum) { - LOG(FATAL) << "Only support sum pool yet "; - } - this->genCode(); - } - - std::string name() const override { - std::string base = "EmbSeqPoolJitCode"; - if (type_ == SeqPoolType::kSum) { - base += "_Sum"; - } else if (type_ == SeqPoolType::kAvg) { - base += "_Avg"; - } else if (type_ == SeqPoolType::kSqrt) { - base += "_Sqrt"; - } - base += ("_W" + std::to_string(tbl_w_)); - return base; - } - void genCode() override; - - private: - int tbl_w_; - SeqPoolType type_; - reg64_t param_tbl{abi_param1}; - reg64_t param_idx{abi_param2}; - reg64_t param_dst{abi_param3}; - reg64_t param_attr{abi_param4}; - - reg64_t reg_tmp{rax}; - - reg64_t reg_idx_width_in_byte{r8}; - reg64_t reg_idx_height{r9}; - - reg64_t reg_ptr_tbl_i{r10}; - reg64_t reg_idx{r10}; // could use same of reg_ptr_tbl_i - reg64_t reg_ptr_idx_i{r11}; - reg64_t reg_ptr_dst_i{r12}; - reg64_t reg_ptr_param_dst{r13}; // rdx is used in mul so protect param_dst - - reg64_t reg_idx_w_i_in_byte{r14}; - reg64_t reg_idx_h_end{r15}; -}; - -} // namespace gen -} // namespace jit -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/jit/gen/gru.cc b/lite/backends/x86/jit/gen/gru.cc deleted file mode 100644 index c5737faf13..0000000000 --- a/lite/backends/x86/jit/gen/gru.cc +++ /dev/null @@ -1,116 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
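The EmbSeqPoolJitCode deleted above implements sum pooling over embedding-table rows gathered by an index matrix. A plain-C++ reference of the same semantics, under the layout genCode assumes (row-major table, indices of shape index_height x index_width, output of index_width blocks of table_width floats):

#include <cstdint>
#include <cstdio>

void EmbSeqPoolSumRef(const float* table, const int64_t* idx, float* out,
                      int table_width, int index_height, int index_width) {
  for (int w = 0; w < index_width; ++w) {
    float* dst = out + w * table_width;  // one output block per index column
    for (int j = 0; j < table_width; ++j) dst[j] = 0.f;
    for (int h = 0; h < index_height; ++h) {
      const float* row = table + idx[h * index_width + w] * table_width;
      for (int j = 0; j < table_width; ++j) dst[j] += row[j];
    }
  }
}

int main() {
  const float table[3 * 2] = {1, 1, 2, 2, 3, 3};  // 3 rows of width 2
  const int64_t idx[2 * 1] = {0, 2};              // one column, two lookups
  float out[2];
  EmbSeqPoolSumRef(table, idx, out, 2, 2, 1);
  std::printf("%g %g\n", out[0], out[1]);  // 4 4 (row 0 + row 2)
  return 0;
}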
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ - -#include "lite/backends/x86/jit/gen/gru.h" -#include // offsetof -#include -#include "lite/backends/x86/jit/registry.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace gen { - -void GRUJitCode::genCode() { - reg64_t reg_ptr_gates = rax; - reg64_t reg_ptr_ht_1 = r9; - reg64_t reg_ptr_ht = r10; - mov(reg_ptr_gates, ptr[param1 + offsetof(gru_t, gates)]); - mov(reg_ptr_ht_1, ptr[param1 + offsetof(gru_t, ht_1)]); - mov(reg_ptr_ht, ptr[param1 + offsetof(gru_t, ht)]); - ymm_t ymm_one = ymm_t(0); - - if (id_ == 2) { - reg64_t reg_ptr_tmp = r11; - mov(reg_ptr_tmp, reinterpret_cast(exp_float_consts)); - vmovaps(ymm_one, ptr[reg_ptr_tmp + OFFSET_EXP_ONE]); - } - int offset = 0; - int d = num_ * sizeof(float); - for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) { - ymm_t ymm_u = ymm_t(1); - ymm_t ymm_r = ymm_t(2); - ymm_t ymm_s = ymm_t(3); - ymm_t ymm_ht_1 = ymm_t(4); - // W: {W_update, W_reset; W_state} - if (id_ == 0 || id_ == 2) { - vmovups(ymm_u, ptr[reg_ptr_gates + offset]); - vmovups(ymm_s, ptr[reg_ptr_gates + offset + 2 * d]); - } - if (id_ == 1) { - vmovups(ymm_r, ptr[reg_ptr_gates + offset + d]); - } - if (id_ == 1 || id_ == 2) { - vmovups(ymm_ht_1, ptr[reg_ptr_ht_1 + offset]); - } - - if (id_ == 0) { - // ht = act_gate(u) * act_cand(s) - act(ymm_u, ymm_u, act_gate_); - act(ymm_s, ymm_s, act_cand_); - vmulps(ymm_s, ymm_s, ymm_u); - vmovups(ptr[reg_ptr_ht + offset], ymm_s); - } else if (id_ == 1) { - // ht = act_gate(r) * ht_1 - act(ymm_r, ymm_r, act_gate_); - vmulps(ymm_r, ymm_r, ymm_ht_1); - vmovups(ptr[reg_ptr_ht + offset], ymm_r); - } else if (id_ == 2) { - // ht = act_gate(u) * act_cand(s) + (1-act_gate(u)) * ht_1 - ymm_t ymm_one_inner = ymm_t(ymm_one.getIdx()); - act(ymm_u, ymm_u, act_gate_); - act(ymm_s, ymm_s, act_cand_); - vmulps(ymm_s, ymm_s, ymm_u); - vsubps(ymm_u, ymm_one_inner, ymm_u); - vmulps(ymm_u, ymm_ht_1, ymm_u); - vaddps(ymm_u, ymm_s, ymm_u); - vmovups(ptr[reg_ptr_ht + offset], ymm_u); - } - offset += sizeof(float) * YMM_FLOAT_BLOCK; - } - ret(); -} - -#define DECLARE_GRU_CREATOR(name) \ - class name##Creator : public JitCodeCreator { \ - public: \ - /* TODO(TJ): enable more */ \ - bool CanBeUsed(const gru_attr_t& attr) const override { \ - return x86::MayIUse(x86::avx) && attr.d % 8 == 0; \ - } \ - size_t CodeSize(const gru_attr_t& attr) const override { \ - return 96 + attr.d / YMM_FLOAT_BLOCK * 96 * 2 * 8; \ - } \ - std::unique_ptr CreateJitCode( \ - const gru_attr_t& attr) const override { \ - return make_unique(attr, CodeSize(attr)); \ - } \ - } - -DECLARE_GRU_CREATOR(GRUH1); -DECLARE_GRU_CREATOR(GRUHtPart1); -DECLARE_GRU_CREATOR(GRUHtPart2); - -#undef DECLARE_GRU_CREATOR - -} // namespace gen -} // namespace jit -} // namespace lite -} // namespace paddle - -namespace gen = paddle::lite::jit::gen; - -REGISTER_JITKERNEL_GEN(kGRUH1, gen::GRUH1Creator); -REGISTER_JITKERNEL_GEN(kGRUHtPart1, gen::GRUHtPart1Creator); -REGISTER_JITKERNEL_GEN(kGRUHtPart2, gen::GRUHtPart2Creator); diff --git a/lite/backends/x86/jit/gen/gru.h b/lite/backends/x86/jit/gen/gru.h deleted file mode 100644 index 408f25746d..0000000000 --- 
a/lite/backends/x86/jit/gen/gru.h +++ /dev/null @@ -1,116 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ - -#pragma once - -#include -#include "glog/logging.h" -#include "lite/backends/x86/jit/gen/act.h" -#include "lite/backends/x86/jit/gen/jitcode.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace gen { - -class GRUJitCode : public VActFunc { - public: - explicit GRUJitCode(int id, - const gru_attr_t& attr, - size_t code_size, - void* code_ptr = nullptr) - : VActFunc(code_size, code_ptr), id_(id), num_(attr.d) { - auto typeExchange = [](KernelType type) -> gen::operand_type { - if (type == KernelType::kVSigmoid) { - return operand_type::SIGMOID; - } else if (type == KernelType::kVRelu) { - return operand_type::RELU; - } else if (type == KernelType::kVTanh) { - return operand_type::TANH; - } else if (type == KernelType::kVIdentity) { - return operand_type::IDENTITY; - } else { - LOG(FATAL) << "Do not support this jit::KernelType: " << type; - } - return operand_type::IDENTITY; - }; - act_gate_ = typeExchange(attr.act_gate); - act_cand_ = typeExchange(attr.act_cand); - - this->genCode(); - } - - std::string name() const override { - std::string base = "GRUJitCode"; - if (id_ == 0) { - base += "_H1"; - } else if (id_ == 1) { - base += "_HtPart1"; - } else if (id_ == 2) { - base += "_HtPart2"; - } - auto AddTypeStr = [&](operand_type type) { - switch (type) { - case operand_type::RELU: - base += "_Relu"; - break; - case operand_type::EXP: - base += "_Exp"; - break; - case operand_type::SIGMOID: - base += "_Sigmoid"; - break; - case operand_type::TANH: - base += "_Tanh"; - break; - case operand_type::IDENTITY: - base += "_Identity"; - break; - default: - break; - } - }; - AddTypeStr(act_gate_); - AddTypeStr(act_cand_); - return base; - } - void genCode() override; - - protected: - int id_; - int num_; - operand_type act_gate_; - operand_type act_cand_; - reg64_t param1{abi_param1}; -}; - -#define DECLARE_GRU_JITCODE(name, id) \ - class name##JitCode : public GRUJitCode { \ - public: \ - explicit name##JitCode(const gru_attr_t& attr, \ - size_t code_size, \ - void* code_ptr = nullptr) \ - : GRUJitCode(id, attr, code_size, code_ptr) {} \ - }; - -DECLARE_GRU_JITCODE(GRUH1, 0); -DECLARE_GRU_JITCODE(GRUHtPart1, 1); -DECLARE_GRU_JITCODE(GRUHtPart2, 2); - -#undef DECLARE_GRU_JITCODE - -} // namespace gen -} // namespace jit -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/jit/gen/hopv.cc b/lite/backends/x86/jit/gen/hopv.cc deleted file mode 100644 index 4304dc48c5..0000000000 --- a/lite/backends/x86/jit/gen/hopv.cc +++ /dev/null @@ -1,103 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
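GRUJitCode splits one GRU step into the three stages registered above (id 0, 1, 2). A scalar sketch of those stages, assuming the common act_gate = sigmoid and act_cand = tanh configuration (the JIT also accepts ReLU and identity):

#include <cmath>
#include <cstdio>

static float sigmoidf(float x) { return 1.f / (1.f + std::exp(-x)); }

// gates packs {u, r, s} contiguously, d floats each, as genCode expects.
void GRURef(const float* gates, const float* ht_1, float* ht, int d, int part) {
  const float* u = gates;
  const float* r = gates + d;
  const float* s = gates + 2 * d;
  for (int i = 0; i < d; ++i) {
    if (part == 0) {         // GRUH1: no previous state
      ht[i] = sigmoidf(u[i]) * std::tanh(s[i]);
    } else if (part == 1) {  // GRUHtPart1: reset gate applied to ht_1
      ht[i] = sigmoidf(r[i]) * ht_1[i];
    } else {                 // GRUHtPart2: blend candidate with ht_1
      float g = sigmoidf(u[i]);
      ht[i] = g * std::tanh(s[i]) + (1.f - g) * ht_1[i];
    }
  }
}

int main() {
  const float gates[3] = {0.f, 0.f, 1.f};  // d = 1: u, r, s
  const float h0 = 0.5f;
  float h1;
  GRURef(gates, &h0, &h1, 1, 2);
  std::printf("%f\n", h1);  // 0.5 * tanh(1) + 0.5 * 0.5
  return 0;
}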
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ - -#include "lite/backends/x86/jit/gen/hopv.h" -#include -#include "lite/backends/x86/jit/registry.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace gen { - -void HOPVJitCode::genCode() { - const int num_blocks = num_ / YMM_FLOAT_BLOCK; - int offset = 0; - - if (num_blocks > 0) { - // load one firstly - vmovups(ymm_tmp, ptr[param_src]); - offset += sizeof(float) * YMM_FLOAT_BLOCK; - for (int i = 1; i < num_blocks; ++i) { - vmovups(ymm_src, ptr[param_src + offset]); - process(ymm_tmp, ymm_src, ymm_tmp); - offset += sizeof(float) * YMM_FLOAT_BLOCK; - } - vextractf128(xmm_dst, ymm_tmp, 1); - process(xmm_dst, xmm_dst, xmm_tmp); - } else { - if (type_ == operand_type::MAX) { - vbroadcastss(ymm_dst, ptr[param_src]); - } else if (type_ == operand_type::ADD) { - vxorps(ymm_dst, ymm_dst, ymm_dst); - } - } - - int rest = num_ % YMM_FLOAT_BLOCK; - if (rest >= 4) { - vmovups(xmm_src, ptr[param_src + offset]); - offset += sizeof(float) * 4; - rest -= 4; - process(xmm_dst, xmm_dst, xmm_src); - } - - vpermilps(xmm_tmp, xmm_dst, 16 + 8 + 3); - process(xmm_dst, xmm_dst, xmm_tmp); - - if (rest >= 2) { - vmovq(xmm_src, ptr[param_src + offset]); - offset += sizeof(float) * 2; - rest -= 2; - process(xmm_dst, xmm_dst, xmm_src); - } - - vpermilps(xmm_tmp, xmm_dst, 1); - process(xmm_dst, xmm_dst, xmm_tmp); - - if (rest >= 1) { - vmovss(xmm_src, ptr[param_src + offset]); - process(xmm_dst, xmm_dst, xmm_src); - } - vmovss(ptr[param_dst], xmm_dst); - ret(); -} - -#define DECLARE_HOP_CREATOR(name) \ - class name##Creator : public JitCodeCreator { \ - public: \ - bool CanBeUsed(const int& attr) const override { \ - return x86::MayIUse(x86::avx); \ - } \ - size_t CodeSize(const int& d) const override { \ - return 96 + d / YMM_FLOAT_BLOCK * 4 * 8; \ - } \ - std::unique_ptr CreateJitCode(const int& attr) const override { \ - return make_unique(attr, CodeSize(attr)); \ - } \ - } - -DECLARE_HOP_CREATOR(HMax); -DECLARE_HOP_CREATOR(HSum); - -#undef DECLARE_HOP_CREATOR - -} // namespace gen -} // namespace jit -} // namespace lite -} // namespace paddle - -namespace gen = paddle::lite::jit::gen; - -REGISTER_JITKERNEL_GEN(kHMax, gen::HMaxCreator); -REGISTER_JITKERNEL_GEN(kHSum, gen::HSumCreator); diff --git a/lite/backends/x86/jit/gen/hopv.h b/lite/backends/x86/jit/gen/hopv.h deleted file mode 100644 index 801131d630..0000000000 --- a/lite/backends/x86/jit/gen/hopv.h +++ /dev/null @@ -1,92 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
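The kHMax/kHSum generators above reduce a float buffer to a single value; the YMM path accumulates eight-wide partials and folds them with vextractf128 and vpermilps shuffles. The portable equivalent is just:

#include <algorithm>
#include <cstdio>
#include <numeric>

float HMaxRef(const float* x, int n) { return *std::max_element(x, x + n); }
float HSumRef(const float* x, int n) { return std::accumulate(x, x + n, 0.f); }

int main() {
  const float x[5] = {3.f, -1.f, 7.f, 0.f, 2.f};
  std::printf("max=%g sum=%g\n", HMaxRef(x, 5), HSumRef(x, 5));  // max=7 sum=11
  return 0;
}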
*/ - -#pragma once - -#include -#include "glog/logging.h" -#include "lite/backends/x86/jit/gen/jitcode.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace gen { - -// horizontal operand vector -class HOPVJitCode : public JitCode { - public: - explicit HOPVJitCode(int d, - operand_type type, - size_t code_size = 256 * 1024, - void* code_ptr = nullptr) - : JitCode(code_size, code_ptr), num_(d), type_(type) { - if (!(type_ == operand_type::MAX || type_ == operand_type::ADD)) { - LOG(FATAL) << "Do not support this operand type: " << type_; - } - this->genCode(); - } - - std::string name() const override { - std::string base = "VXXJitCode"; - if (type_ == operand_type::MAX) { - base += "_MAX"; - } else { - base += "_SUM"; - } - return base; - } - void genCode() override; - - protected: - template - void process(JMM& dst, JMM& src1, JMM& src2) { // NOLINT - if (type_ == operand_type::MAX) { - vmaxps(dst, src1, src2); - } else if (type_ == operand_type::ADD) { - vaddps(dst, src1, src2); - } - } - - private: - int num_; - operand_type type_; - reg64_t param_src{abi_param1}; - reg64_t param_dst{abi_param2}; - reg64_t param_attr{abi_param3}; - - ymm_t ymm_tmp = ymm_t(0); - ymm_t ymm_src = ymm_t(1); - ymm_t ymm_dst = ymm_t(2); - - xmm_t xmm_tmp = xmm_t(0); - xmm_t xmm_src = xmm_t(1); - xmm_t xmm_dst = xmm_t(2); -}; - -#define DECLARE_HOP_JITCODE(name, op_type) \ - class name##JitCode : public HOPVJitCode { \ - public: \ - explicit name##JitCode(int d, size_t code_size, void* code_ptr = nullptr) \ - : HOPVJitCode(d, op_type, code_size, code_ptr) {} \ - }; - -DECLARE_HOP_JITCODE(HMax, operand_type::MAX); -DECLARE_HOP_JITCODE(HSum, operand_type::ADD); - -#undef DECLARE_HOP_JITCODE - -} // namespace gen -} // namespace jit -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/jit/gen/jitcode.h b/lite/backends/x86/jit/gen/jitcode.h deleted file mode 100644 index 1840dcac68..0000000000 --- a/lite/backends/x86/jit/gen/jitcode.h +++ /dev/null @@ -1,133 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ - -#pragma once - -#include -#include -#include "lite/backends/x86/cpu_info.h" -#include "lite/backends/x86/jit/gen_base.h" - -#define XBYAK_USE_MMAP_ALLOCATOR -#include "xbyak/xbyak.h" -#include "xbyak/xbyak_util.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace gen { - -// Application Binary Interface -constexpr Xbyak::Operand::Code abi_param1(Xbyak::Operand::RDI), - abi_param2(Xbyak::Operand::RSI), abi_param3(Xbyak::Operand::RDX), - abi_param4(Xbyak::Operand::RCX), abi_param5(Xbyak::Operand::R8), - abi_param6(Xbyak::Operand::R9); - -constexpr Xbyak::Operand::Code g_abi_regs[] = {Xbyak::Operand::RBX, - Xbyak::Operand::RBP, - Xbyak::Operand::R12, - Xbyak::Operand::R13, - Xbyak::Operand::R14, - Xbyak::Operand::R15}; - -constexpr int num_g_abi_regs = sizeof(g_abi_regs) / sizeof(g_abi_regs[0]); - -using reg64_t = const Xbyak::Reg64; -using reg32_t = const Xbyak::Reg32; -using xmm_t = const Xbyak::Xmm; -using ymm_t = const Xbyak::Ymm; -using zmm_t = const Xbyak::Zmm; -using Label = Xbyak::Label; - -typedef enum { - MUL = 0, - MAX, - ADD, - SUB, - RELU, - EXP, - SQUARE, - SIGMOID, - TANH, - IDENTITY -} operand_type; - -#define DECLARE_JIT_CODE(codename) \ - std::string name() const override { return #codename; } - -class JitCode : public GenBase, public Xbyak::CodeGenerator { - public: - explicit JitCode(size_t code_size, void* code_ptr = nullptr) - : Xbyak::CodeGenerator( - (code_size % 4096 != 0 ? (code_size / 4096 + 1) * 4096 : code_size), - code_ptr) {} - - virtual void genCode() = 0; - - size_t getSize() const override { return CodeGenerator::getSize(); } - const unsigned char* getCodeInternal() const override { - const Xbyak::uint8* code = CodeGenerator::getCode(); - return code; - } - - protected: - Xbyak::Reg64 param1{abi_param1}; - const int EVEX_max_8b_offt = 0x200; - const Xbyak::Reg64 reg_EVEX_max_8b_offt = rbp; - - virtual void preCode() { - for (int i = 0; i < num_g_abi_regs; ++i) { - push(Xbyak::Reg64(g_abi_regs[i])); - } - if (x86::MayIUse(x86::avx512f)) { - mov(reg_EVEX_max_8b_offt, 2 * EVEX_max_8b_offt); - } - } - virtual void postCode() { - for (int i = 0; i < num_g_abi_regs; ++i) { - pop(Xbyak::Reg64(g_abi_regs[num_g_abi_regs - 1 - i])); - } - ret(); - } - void L(const char* label) { Xbyak::CodeGenerator::L(label); } - void L(const Xbyak::Label& label) { Xbyak::CodeGenerator::L(label); } - // Enhanced vector extension - Xbyak::Address EVEX_compress_addr(Xbyak::Reg64 base, - int offt, - bool bcast = false) { - int scale = 0; - // Learn from https://github.com/intel/mkl-dnn - if (EVEX_max_8b_offt <= offt && offt < 3 * EVEX_max_8b_offt) { - offt = offt - 2 * EVEX_max_8b_offt; - scale = 1; - } else if (3 * EVEX_max_8b_offt <= offt && offt < 5 * EVEX_max_8b_offt) { - offt = offt - 4 * EVEX_max_8b_offt; - scale = 2; - } - auto re = Xbyak::RegExp() + base + offt; - if (scale) { - re = re + reg_EVEX_max_8b_offt * scale; - } - if (bcast) { - return zword_b[re]; - } else { - return zword[re]; - } - } -}; - -} // namespace gen -} // namespace jit -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/jit/gen/lstm.cc b/lite/backends/x86/jit/gen/lstm.cc deleted file mode 100644 index 44e58d0b75..0000000000 --- a/lite/backends/x86/jit/gen/lstm.cc +++ /dev/null @@ -1,142 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
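Note how the JitCode constructor above rounds the requested buffer size up to whole 4 KiB pages before handing it to Xbyak::CodeGenerator. The rounding expression in isolation, as a small sketch:

#include <cstddef>
#include <cstdio>

// Same arithmetic as the JitCode constructor's size argument.
size_t RoundUpToPage(size_t code_size) {
  constexpr size_t kPage = 4096;
  return code_size % kPage != 0 ? (code_size / kPage + 1) * kPage : code_size;
}

int main() {
  std::printf("%zu %zu %zu\n", RoundUpToPage(1), RoundUpToPage(4096),
              RoundUpToPage(4097));  // 4096 4096 8192
  return 0;
}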
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ - -#include "lite/backends/x86/jit/gen/lstm.h" -#include // offsetof -#include -#include "lite/backends/x86/jit/registry.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace gen { - -void LSTMJitCode::genCode() { - if (use_peephole_) { - preCode(); - } - reg64_t reg_ptr_gates = rax; - reg64_t reg_ptr_ct_1 = r9; - reg64_t reg_ptr_ct = r10; - reg64_t reg_ptr_ht = r11; - reg64_t reg_ptr_wp = r12; - mov(reg_ptr_gates, ptr[param1 + offsetof(lstm_t, gates)]); - mov(reg_ptr_ct_1, ptr[param1 + offsetof(lstm_t, ct_1)]); - mov(reg_ptr_ct, ptr[param1 + offsetof(lstm_t, ct)]); - mov(reg_ptr_ht, ptr[param1 + offsetof(lstm_t, ht)]); - if (use_peephole_) { - mov(reg_ptr_wp, ptr[param1 + offsetof(lstm_t, wp)]); - } - - int offset = 0; - int d = num_ * sizeof(float); - for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) { - /* gates: W_ch, W_ih, W_fh, W_oh */ - ymm_t ymm_c = ymm_t(0); - ymm_t ymm_i = ymm_t(1); - ymm_t ymm_f = ymm_t(2); - ymm_t ymm_o = ymm_t(3); - ymm_t ymm_ct_1 = ymm_t(4); - ymm_t ymm_wp0 = ymm_t(5); - ymm_t ymm_wp1 = ymm_t(6); - ymm_t ymm_wp2 = ymm_t(7); - vmovups(ymm_c, ptr[reg_ptr_gates + offset]); - vmovups(ymm_i, ptr[reg_ptr_gates + offset + d]); - vmovups(ymm_f, ptr[reg_ptr_gates + offset + 2 * d]); - vmovups(ymm_o, ptr[reg_ptr_gates + offset + 3 * d]); - if (!compute_c1h1_) { - vmovups(ymm_ct_1, ptr[reg_ptr_ct_1 + offset]); - } - if (use_peephole_) { - vmovups(ymm_wp0, ptr[reg_ptr_wp + offset]); - vmovups(ymm_wp1, ptr[reg_ptr_wp + offset + d]); - vmovups(ymm_wp2, ptr[reg_ptr_wp + offset + 2 * d]); - } - /* C_t = act_cand(c) * act_gate(i) + C_t-1 * act_gate(f) */ - // act_cand(c) - act(ymm_c, ymm_c, act_cand_); - // act_gate(i) or act_gate(ct_1 * wp0 + i) - if (!compute_c1h1_ && use_peephole_) { - vmulps(ymm_wp0, ymm_ct_1, ymm_wp0); - vaddps(ymm_i, ymm_i, ymm_wp0); - } - act(ymm_i, ymm_i, act_gate_); - vmulps(ymm_c, ymm_c, ymm_i); - if (!compute_c1h1_) { - // act_gate(f) or act_gate(ct_1 * wp1 + f) - if (use_peephole_) { - vmulps(ymm_wp1, ymm_ct_1, ymm_wp1); - vaddps(ymm_f, ymm_f, ymm_wp1); - } - act(ymm_f, ymm_f, act_gate_); - // ct - vmulps(ymm_f, ymm_f, ymm_ct_1); - vaddps(ymm_f, ymm_f, ymm_c); - } - /* H_t = act_cell(C_t) * act_gate(o) */ - // act_cell(C_t) - ymm_t ymm_ct = compute_c1h1_ ? 
ymm_c : ymm_f; - ymm_t ymm_tmp = ymm_i; - act(ymm_tmp, ymm_ct, act_cell_); - // act_gate(o) or act_gate(ct * wp2 + o) - if (use_peephole_) { - vmulps(ymm_wp2, ymm_ct, ymm_wp2); - vaddps(ymm_o, ymm_o, ymm_wp2); - } - act(ymm_o, ymm_o, act_gate_); - // ht - vmulps(ymm_o, ymm_o, ymm_tmp); - // save ct and ht - vmovups(ptr[reg_ptr_ct + offset], ymm_ct); - vmovups(ptr[reg_ptr_ht + offset], ymm_o); - offset += sizeof(float) * YMM_FLOAT_BLOCK; - } - - if (use_peephole_) { - postCode(); - } else { - ret(); - } -} - -#define DECLARE_LSTM_CREATOR(name) \ - class name##Creator : public JitCodeCreator { \ - public: \ - /* TODO(TJ): enable more */ \ - bool CanBeUsed(const lstm_attr_t& attr) const override { \ - return x86::MayIUse(x86::avx) && attr.d % 8 == 0; \ - } \ - size_t CodeSize(const lstm_attr_t& attr) const override { \ - return 96 + attr.d / YMM_FLOAT_BLOCK * 90 * 4 * 8; \ - } \ - std::unique_ptr CreateJitCode( \ - const lstm_attr_t& attr) const override { \ - return make_unique(attr, CodeSize(attr)); \ - } \ - } - -DECLARE_LSTM_CREATOR(LSTMCtHt); -DECLARE_LSTM_CREATOR(LSTMC1H1); - -#undef DECLARE_LSTM_CREATOR - -} // namespace gen -} // namespace jit -} // namespace lite -} // namespace paddle - -namespace gen = paddle::lite::jit::gen; - -REGISTER_JITKERNEL_GEN(kLSTMCtHt, gen::LSTMCtHtCreator); -REGISTER_JITKERNEL_GEN(kLSTMC1H1, gen::LSTMC1H1Creator); diff --git a/lite/backends/x86/jit/gen/lstm.h b/lite/backends/x86/jit/gen/lstm.h deleted file mode 100644 index 141419505c..0000000000 --- a/lite/backends/x86/jit/gen/lstm.h +++ /dev/null @@ -1,121 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
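A scalar reference for the fused LSTM step emitted above (the LSTMCtHt variant without peephole connections), assuming the common act_gate = sigmoid and act_cand = act_cell = tanh configuration:

#include <cmath>
#include <cstdio>

static float sigmoidf(float x) { return 1.f / (1.f + std::exp(-x)); }

// gates packs {c, i, f, o} contiguously, d floats each, as genCode expects:
//   ct = tanh(c) * sigmoid(i) + ct_1 * sigmoid(f)
//   ht = tanh(ct) * sigmoid(o)
void LSTMRef(const float* gates, const float* ct_1, float* ct, float* ht,
             int d) {
  const float* c = gates;
  const float* i = gates + d;
  const float* f = gates + 2 * d;
  const float* o = gates + 3 * d;
  for (int k = 0; k < d; ++k) {
    ct[k] = std::tanh(c[k]) * sigmoidf(i[k]) + ct_1[k] * sigmoidf(f[k]);
    ht[k] = std::tanh(ct[k]) * sigmoidf(o[k]);
  }
}

int main() {
  const float gates[4] = {1.f, 0.f, 0.f, 0.f};  // d = 1
  const float c0 = 0.25f;
  float c1, h1;
  LSTMRef(gates, &c0, &c1, &h1, 1);
  std::printf("ct=%f ht=%f\n", c1, h1);
  return 0;
}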
*/ - -#pragma once - -#include -#include "glog/logging.h" -#include "lite/backends/x86/jit/gen/act.h" -#include "lite/backends/x86/jit/gen/jitcode.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace gen { - -class LSTMJitCode : public VActFunc { - public: - explicit LSTMJitCode(bool compute_c1h1, - const lstm_attr_t& attr, - size_t code_size, - void* code_ptr = nullptr) - : VActFunc(code_size, code_ptr), - num_(attr.d), - compute_c1h1_(compute_c1h1), - use_peephole_(attr.use_peephole) { - auto typeExchange = [](KernelType type) -> gen::operand_type { - if (type == KernelType::kVSigmoid) { - return operand_type::SIGMOID; - } else if (type == KernelType::kVRelu) { - return operand_type::RELU; - } else if (type == KernelType::kVTanh) { - return operand_type::TANH; - } else if (type == KernelType::kVIdentity) { - return operand_type::IDENTITY; - } else { - LOG(FATAL) << "Do not support this jit::KernelType: " << type; - } - return operand_type::IDENTITY; - }; - act_gate_ = typeExchange(attr.act_gate); - act_cand_ = typeExchange(attr.act_cand); - act_cell_ = typeExchange(attr.act_cell); - - this->genCode(); - } - - std::string name() const override { - std::string base = "LSTMJitCode"; - if (use_peephole_) { - base += "_Peephole"; - } - if (compute_c1h1_) { - base += "_C1H1"; - } - auto AddTypeStr = [&](operand_type type) { - switch (type) { - case operand_type::RELU: - base += "_Relu"; - break; - case operand_type::EXP: - base += "_Exp"; - break; - case operand_type::SIGMOID: - base += "_Sigmoid"; - break; - case operand_type::TANH: - base += "_Tanh"; - break; - case operand_type::IDENTITY: - base += "_Identity"; - break; - default: - break; - } - }; - AddTypeStr(act_gate_); - AddTypeStr(act_cand_); - AddTypeStr(act_cell_); - return base; - } - void genCode() override; - - protected: - int num_; - bool compute_c1h1_; - bool use_peephole_; - operand_type act_gate_; - operand_type act_cand_; - operand_type act_cell_; - reg64_t param1{abi_param1}; -}; - -#define DECLARE_LSTM_JITCODE(name, compute_c1h1) \ - class name##JitCode : public LSTMJitCode { \ - public: \ - explicit name##JitCode(const lstm_attr_t& attr, \ - size_t code_size, \ - void* code_ptr = nullptr) \ - : LSTMJitCode(compute_c1h1, attr, code_size, code_ptr) {} \ - }; - -DECLARE_LSTM_JITCODE(LSTMCtHt, false); -DECLARE_LSTM_JITCODE(LSTMC1H1, true); - -#undef DECLARE_LSTM_JITCODE - -} // namespace gen -} // namespace jit -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/jit/gen/matmul.cc b/lite/backends/x86/jit/gen/matmul.cc deleted file mode 100644 index 103b9101ba..0000000000 --- a/lite/backends/x86/jit/gen/matmul.cc +++ /dev/null @@ -1,127 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ - -#include "lite/backends/x86/jit/gen/matmul.h" -#include // offsetof -#include -#include -#include "lite/backends/x86/jit/registry.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace gen { - -void MatMulJitCode::genCode() { - preCode(); - int block, rest; - const auto groups = packed_groups(n_, k_, &block, &rest); - PADDLE_ENFORCE_GT(groups.front(), 0); - - const int block_len = sizeof(float) * block; - const int x_reg_idx = (block == ZMM_FLOAT_BLOCK ? 32 : 16) - 1; - const int w_reg_idx = x_reg_idx - 1; - // from packed mov(reg_ptr_wgt, ptr[param_attr + offsetof(matmul_attr_t, - // packed_weight)]); - mov(reg_ptr_wgt, param_y); - size_t z_offset = 0; - size_t wgt_offset = 0; - for (size_t g = 0; g < groups.size(); ++g) { - size_t x_offset = 0; - for (int k = 0; k < k_; ++k) { - vbroadcastss(zmm_t(x_reg_idx), ptr[param_x + x_offset]); - // clean - if (k == 0) { - for (int i = 0; i < groups[g]; ++i) { - vxorps(zmm_t(i), zmm_t(i), zmm_t(i)); - } - } - for (int i = 0; i < groups[g]; ++i) { - vmovups(zmm_t(w_reg_idx), ptr[reg_ptr_wgt + wgt_offset]); - vfmadd231ps(zmm_t(i), zmm_t(w_reg_idx), zmm_t(x_reg_idx)); - wgt_offset += block_len; - } - // last one, save - if (k == k_ - 1) { - for (int i = 0; i < groups[g]; ++i) { - // only rest save should be careful - if (rest != 0 && g == groups.size() - 1 && i == groups[g] - 1) { - break; - } - vmovups(ptr[param_z + z_offset + i * block_len], zmm_t(i)); - } - } - x_offset += sizeof(float); - } - z_offset += block_len * groups[g]; - } - - if (rest != 0) { - // below should refine with mask - int reg_idx = groups.back() - 1; - z_offset = (n_ - rest) * sizeof(float); - int inner_block = 8; - while (rest > 0) { - if (rest >= 8) { - inner_block = 8; - vmovups(ptr[param_z + z_offset], ymm_t(reg_idx)); - // shift zmm of inner_block, change reg_idx if update - } else if (rest >= 4) { - inner_block = 4; - vmovups(ptr[param_z + z_offset], xmm_t(reg_idx)); - } else if (rest >= 2) { - inner_block = 2; - vmovq(ptr[param_z + z_offset], xmm_t(reg_idx)); - } else { - inner_block = 1; - vmovss(ptr[param_z + z_offset], xmm_t(reg_idx)); - } - z_offset += inner_block * sizeof(float); - rest -= inner_block; - } - } - - postCode(); -} - -class MatMulCreator : public JitCodeCreator { - public: - bool CanBeUsed(const matmul_attr_t& attr) const override { - return attr.m == 1 && x86::MayIUse(x86::avx512f) && - attr.n % ZMM_FLOAT_BLOCK == 0 && attr.k < 512; - } - size_t CodeSize(const matmul_attr_t& attr) const override { - int block = YMM_FLOAT_BLOCK; - if (x86::MayIUse(x86::avx512f)) { - block = ZMM_FLOAT_BLOCK; - } - return 96 + 4 * attr.k * (attr.n / block + 1) * 8; - } - std::unique_ptr CreateJitCode( - const matmul_attr_t& attr) const override { - PADDLE_ENFORCE_GT(attr.m, 0); - PADDLE_ENFORCE_GT(attr.n, 0); - PADDLE_ENFORCE_GT(attr.k, 0); - return make_unique(attr, CodeSize(attr)); - } -}; - -} // namespace gen -} // namespace jit -} // namespace lite -} // namespace paddle - -namespace gen = paddle::lite::jit::gen; - -REGISTER_JITKERNEL_GEN(kMatMul, gen::MatMulCreator); diff --git a/lite/backends/x86/jit/gen/matmul.h b/lite/backends/x86/jit/gen/matmul.h deleted file mode 100644 index b1b302b790..0000000000 --- a/lite/backends/x86/jit/gen/matmul.h +++ /dev/null @@ -1,62 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
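MatMulJitCode above only accepts m == 1, so the kernel is really a vector-matrix product tiled over ZMM-width blocks of n, driven by vbroadcastss + vfmadd231ps over packed weights. Its reference semantics in portable C++:

#include <cstdio>

// z[j] = sum over a of x[a] * y[a][j], with y row-major (k x n).
void VecMatRef(const float* x, const float* y, float* z, int k, int n) {
  for (int j = 0; j < n; ++j) z[j] = 0.f;
  for (int a = 0; a < k; ++a) {
    const float xa = x[a];  // the broadcast element, as vbroadcastss does
    for (int j = 0; j < n; ++j) z[j] += xa * y[a * n + j];  // fused mul-add
  }
}

int main() {
  const float x[2] = {1.f, 2.f};
  const float y[2 * 3] = {1, 0, 1, 0, 1, 1};  // k = 2, n = 3, row-major
  float z[3];
  VecMatRef(x, y, z, 2, 3);
  std::printf("%g %g %g\n", z[0], z[1], z[2]);  // 1 2 3
  return 0;
}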
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ - -#pragma once - -#include // for malloc and free -#include -#include -#include "glog/logging.h" -#include "lite/backends/x86/jit/gen/jitcode.h" -#include "lite/utils/paddle_enforce.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace gen { - -class MatMulJitCode : public JitCode { - public: - explicit MatMulJitCode(const matmul_attr_t& attr, - size_t code_size = 256 * 1024, - void* code_ptr = nullptr) - : JitCode(code_size, code_ptr), m_(attr.m), n_(attr.n), k_(attr.k) { - PADDLE_ENFORCE_EQ(m_, 1, "Only support m==1 yet"); - this->genCode(); - } - - std::string name() const override { - std::string base = "MatMulJitCode"; - base = base + "_M" + std::to_string(m_) + "_N" + std::to_string(n_) + "_K" + - std::to_string(k_); - return base; - } - void genCode() override; - - private: - int m_, n_, k_; - - reg64_t param_x{abi_param1}; - reg64_t param_y{abi_param2}; - reg64_t param_z{abi_param3}; - reg64_t param_attr{abi_param4}; - reg64_t reg_tmp{rax}; - - reg64_t reg_ptr_wgt{r10}; -}; - -} // namespace gen -} // namespace jit -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/jit/gen/seqpool.cc b/lite/backends/x86/jit/gen/seqpool.cc deleted file mode 100644 index e0cf5e5a5a..0000000000 --- a/lite/backends/x86/jit/gen/seqpool.cc +++ /dev/null @@ -1,85 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ - -#include "lite/backends/x86/jit/gen/seqpool.h" -#include -#include "lite/backends/x86/jit/gen/act.h" // for exp_float_consts ones -#include "lite/backends/x86/jit/registry.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace gen { - -void SeqPoolJitCode::genCode() { - constexpr int block = YMM_FLOAT_BLOCK; - constexpr int max_num_regs = 8; - const int num_block = w_ / block; - const int num_groups = num_block / max_num_regs; - int rest_num_regs = num_block % max_num_regs; - mov(reg32_int_h, dword[param_attr]); - if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) { - mov(reg_tmp, reinterpret_cast(exp_float_consts)); - vmovups(xmm_t(1), ptr[reg_tmp + OFFSET_EXP_ONE]); - mov(reg_tmp, reinterpret_cast(fp_h_)); - fild(dword[param_attr]); - fstp(dword[reg_tmp]); - vmovss(xmm_t(0), ptr[reg_tmp]); - if (type_ == SeqPoolType::kSqrt) { - vsqrtps(xmm_t(0), xmm_t(0)); - } - vdivps(xmm_t(1), xmm_t(1), xmm_t(0)); - vmovss(ptr[reg_tmp], xmm_t(1)); - } - const int group_len = max_num_regs * block * sizeof(float); - for (int g = 0; g < num_groups; ++g) { - pool_height(g * group_len, block, max_num_regs); - } - if (rest_num_regs > 0) { - pool_height(num_groups * group_len, block, rest_num_regs); - } - // part of rest_w * height - const int rest = w_ % block; - pool_height_of_rest_width(rest, (w_ - rest) * sizeof(float), max_num_regs); - ret(); -} - -class SeqPoolCreator : public JitCodeCreator { - public: - bool CanBeUsed(const seq_pool_attr_t& attr) const override { - return x86::MayIUse(x86::avx); - } - size_t CodeSize(const seq_pool_attr_t& attr) const override { - return 96 + - ((attr.w / YMM_FLOAT_BLOCK + 4 /* for rest */) * - 4 /* load, mul and save */ + - 256) * - 8; - } - std::unique_ptr CreateJitCode( - const seq_pool_attr_t& attr) const override { - PADDLE_ENFORCE_GT(attr.w, 0); - PADDLE_ENFORCE_GT(attr.h, 0); - return make_unique(attr, CodeSize(attr)); - } -}; - -} // namespace gen -} // namespace jit -} // namespace lite -} // namespace paddle - -namespace gen = paddle::lite::jit::gen; - -REGISTER_JITKERNEL_GEN(kSeqPool, gen::SeqPoolCreator); diff --git a/lite/backends/x86/jit/gen/seqpool.h b/lite/backends/x86/jit/gen/seqpool.h deleted file mode 100644 index 346179cfbb..0000000000 --- a/lite/backends/x86/jit/gen/seqpool.h +++ /dev/null @@ -1,216 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
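SeqPoolJitCode above sums h rows of width w column-wise and, for the kAvg and kSqrt types, pre-computes the scale 1/h or 1/sqrt(h) once before broadcasting it across the result. A portable sketch of the same pooling:

#include <cmath>
#include <cstdio>

enum class Pool { kSum, kAvg, kSqrt };

void SeqPoolRef(const float* x, float* y, int h, int w, Pool type) {
  for (int j = 0; j < w; ++j) y[j] = 0.f;
  for (int i = 0; i < h; ++i)
    for (int j = 0; j < w; ++j) y[j] += x[i * w + j];  // column-wise sum
  float scale = type == Pool::kAvg
                    ? 1.f / h
                    : (type == Pool::kSqrt
                           ? 1.f / std::sqrt(static_cast<float>(h))
                           : 1.f);
  for (int j = 0; j < w; ++j) y[j] *= scale;
}

int main() {
  const float x[2 * 3] = {1, 2, 3, 3, 2, 1};
  float y[3];
  SeqPoolRef(x, y, 2, 3, Pool::kAvg);
  std::printf("%g %g %g\n", y[0], y[1], y[2]);  // 2 2 2
  return 0;
}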
*/ - -#pragma once - -#include -#include -#include "lite/backends/x86/jit/gen/jitcode.h" -#include "lite/utils/paddle_enforce.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace gen { - -class SeqPoolJitCode : public JitCode { - public: - explicit SeqPoolJitCode(const seq_pool_attr_t& attr, - size_t code_size = 256 * 1024, - void* code_ptr = nullptr) - : JitCode(code_size, code_ptr), w_(attr.w), type_(attr.type) { - if (!(type_ == SeqPoolType::kSum || type_ == SeqPoolType::kAvg || - type_ == SeqPoolType::kSqrt)) { - LOG(FATAL) << "Only supported pool type: sum, avg and sqrt."; - } - fp_h_[0] = 1.f; - this->genCode(); - } - - std::string name() const override { - std::string base = "SeqPoolJitCode"; - if (type_ == SeqPoolType::kSum) { - base += "_Sum"; - } else if (type_ == SeqPoolType::kAvg) { - base += "_Avg"; - } else if (type_ == SeqPoolType::kSqrt) { - base += "_Sqrt"; - } - base += ("_W" + std::to_string(w_)); - return base; - } - void genCode() override; - - protected: - template - void pool_height(int w_offset, int block, int max_num_regs) { - int offset = w_offset; - for (int i = 0; i < max_num_regs; ++i) { - vmovups(JMM(i), ptr[param_src + offset]); - offset += sizeof(float) * block; - } - cmp(reg32_int_h, 1); - Label l_next_h, l_h_done; - jle(l_h_done, T_NEAR); - mov(reg_h_i, 1); - mov(reg_tmp, param_src); - add(reg_tmp, w_ * sizeof(float) + w_offset); - L(l_next_h); - { - mov(reg_ptr_src_i, reg_tmp); - for (int i = 0; i < max_num_regs; ++i) { - vmovups(JMM(i + max_num_regs), ptr[reg_ptr_src_i]); - // sum anyway - vaddps(JMM(i), JMM(i), JMM(i + max_num_regs)); - add(reg_ptr_src_i, sizeof(float) * block); - } - inc(reg_h_i); - add(reg_tmp, w_ * sizeof(float)); - cmp(reg_h_i, reg32_int_h); - jl(l_next_h, T_NEAR); - } - L(l_h_done); - // save right now - if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) { - mov(reg_tmp, reinterpret_cast(fp_h_)); - vbroadcastss(JMM(max_num_regs), ptr[reg_tmp]); - } - offset = w_offset; - for (int i = 0; i < max_num_regs; ++i) { - if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) { - vmulps(JMM(i), JMM(i), JMM(max_num_regs)); - } - vmovups(ptr[param_dst + offset], JMM(i)); - offset += sizeof(float) * block; - } - } - - void pool_height_of_rest_width(int rest, int w_offset, int max_num_regs) { - const int rest_used_num_regs = load_rest(rest, w_offset, 0); - const bool has_block4 = rest / 4 > 0; - const bool has_block2 = (rest % 4) / 2 > 0; - const bool has_block1 = (rest % 2) == 1; - cmp(reg32_int_h, 1); - Label l_next_h, l_h_done; - jle(l_h_done, T_NEAR); - mov(reg_h_i, 1); - mov(reg_tmp, param_src); - add(reg_tmp, w_ * sizeof(float) + w_offset); - L(l_next_h); - { - int reg_idx = 0; - mov(reg_ptr_src_i, reg_tmp); - if (has_block4) { - vmovups(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]); - add(reg_ptr_src_i, sizeof(float) * 4); - reg_idx++; - } - if (has_block2) { - vmovups(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]); - add(reg_ptr_src_i, sizeof(float) * 2); - reg_idx++; - } - if (has_block1) { - vmovss(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]); - reg_idx++; - } - PADDLE_ENFORCE_EQ( - reg_idx, rest_used_num_regs, "All heights should use same regs"); - for (int i = 0; i < reg_idx; ++i) { - vaddps(xmm_t(i), xmm_t(i), xmm_t(i + max_num_regs)); - } - inc(reg_h_i); - add(reg_tmp, w_ * sizeof(float)); - cmp(reg_h_i, reg32_int_h); - jl(l_next_h, T_NEAR); - } - L(l_h_done); - // save right now - if (type_ == SeqPoolType::kAvg || type_ == SeqPoolType::kSqrt) { - mov(reg_tmp, 
reinterpret_cast(fp_h_)); - vbroadcastss(xmm_t(max_num_regs), ptr[reg_tmp]); - for (int i = 0; i < rest_used_num_regs; ++i) { - vmulps(xmm_t(i), xmm_t(i), xmm_t(max_num_regs)); - } - } - save_rest(rest, w_offset); - } - - // return the number of used regs, use start from reg 0 - int load_rest(int rest, - int w_offset, - const int num_shift_regs, - const int reg_start = 0) { - const bool has_block4 = rest / 4 > 0; - const bool has_block2 = (rest % 4) / 2 > 0; - const bool has_block1 = (rest % 2) == 1; - int reg_idx = reg_start; - if (has_block4) { - vmovups(xmm_t(reg_idx + num_shift_regs), ptr[param_src + w_offset]); - w_offset += sizeof(float) * 4; - reg_idx++; - } - if (has_block2) { - vmovq(xmm_t(reg_idx + num_shift_regs), ptr[param_src + w_offset]); - w_offset += sizeof(float) * 2; - reg_idx++; - } - if (has_block1) { - vmovss(xmm_t(reg_idx + num_shift_regs), ptr[param_src + w_offset]); - reg_idx++; - } - return reg_idx; - } - - // use reg start from 0 - void save_rest(int rest, int w_offset, int reg_start = 0) { - const bool has_block4 = rest / 4 > 0; - const bool has_block2 = (rest % 4) / 2 > 0; - const bool has_block1 = (rest % 2) == 1; - int reg_idx = reg_start; - if (has_block4) { - vmovups(ptr[param_dst + w_offset], xmm_t(reg_idx)); - w_offset += sizeof(float) * 4; - reg_idx++; - } - if (has_block2) { - vmovq(ptr[param_dst + w_offset], xmm_t(reg_idx)); - w_offset += sizeof(float) * 2; - reg_idx++; - } - if (has_block1) { - vmovss(ptr[param_dst + w_offset], xmm_t(reg_idx)); - } - } - - private: - float ALIGN32_BEG fp_h_[1] ALIGN32_END; - int w_; - SeqPoolType type_; - reg64_t param_src{abi_param1}; - reg64_t param_dst{abi_param2}; - reg64_t param_attr{abi_param3}; - reg64_t reg_tmp{rax}; - - reg32_t reg32_int_h{r8d}; - reg32_t reg32_fp_h{r9d}; - - reg64_t reg_h_i{r10}; - reg64_t reg_ptr_src_i{r11}; -}; - -} // namespace gen -} // namespace jit -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/jit/gen/sgd.cc b/lite/backends/x86/jit/gen/sgd.cc deleted file mode 100644 index 10659f5084..0000000000 --- a/lite/backends/x86/jit/gen/sgd.cc +++ /dev/null @@ -1,130 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
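pool_height_of_rest_width, load_rest and save_rest above cover the tail rest = w % YMM_FLOAT_BLOCK with at most one 4-float vmovups, one 2-float vmovq and one 1-float vmovss. A small sketch of that decomposition; TailMoves is a hypothetical helper that just counts the moves:

// e.g. rest == 7 -> 4 + 2 + 1 = three moves; rest == 5 -> 4 + 1 = two moves.
int TailMoves(int rest) {
  int moves = 0;
  if (rest / 4 > 0) ++moves;        // one vmovups, 4 floats
  if ((rest % 4) / 2 > 0) ++moves;  // one vmovq, 2 floats
  if (rest % 2 == 1) ++moves;       // one vmovss, 1 float
  return moves;
}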
*/
-
-#include "lite/backends/x86/jit/gen/sgd.h"
-#include <stddef.h>  // offsetof
-#include <memory>
-#include <vector>
-#include "lite/backends/x86/jit/registry.h"
-#include "lite/utils/paddle_enforce.h"
-
-namespace paddle {
-namespace lite {
-namespace jit {
-namespace gen {
-
-void SgdJitCode::genCode() {
-  preCode();
-  constexpr int block = YMM_FLOAT_BLOCK;
-  constexpr int max_num_regs = 7;
-  const int num_block = w_ / block;
-  const int num_groups = num_block / max_num_regs;
-  const size_t block_size = sizeof(float) * block;
-  const size_t width_size = w_ * sizeof(float);
-  std::vector<int> groups(num_groups, max_num_regs);
-  int rest_num_regs = num_block % max_num_regs;
-  if (rest_num_regs > 0) {
-    groups.push_back(rest_num_regs);
-  }
-
-  vbroadcastss(ymm_lr, ptr[param_lr]);
-  // protect rdx
-  mov(reg_ptr_grad_i, param_grad);
-  mov(reg_ptr_rows_i, param_rows);
-
-  mov(reg_rows_size_in_byte,
-      qword[param_attr + offsetof(sgd_attr_t, selected_rows_size)]);
-  mov(rax, sizeof(int64_t));
-  mul(reg_rows_size_in_byte);
-  mov(reg_rows_size_in_byte, rax);
-  add(reg_rows_size_in_byte, reg_ptr_rows_i);
-
-  Label l_next_row;
-  L(l_next_row);
-  {
-    mov(reg_row, qword[reg_ptr_rows_i]);
-    mov(rax, width_size);
-    mul(reg_row);
-    mov(reg_row, rax);
-
-    mov(reg_ptr_param_i, param_param);
-    mov(reg_ptr_out_i, param_out);
-    add(reg_ptr_param_i, reg_row);
-    add(reg_ptr_out_i, reg_row);
-
-    size_t w_offset = 0;
-    for (int num_regs : groups) {
-      // load grad
-      size_t inner_offset = w_offset;
-      for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
-        vmovups(ymm_t(reg_i), ptr[reg_ptr_grad_i + inner_offset]);
-        inner_offset += block_size;
-      }
-
-      // load param
-      inner_offset = w_offset;
-      for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
-        vmovups(ymm_t(reg_i + num_regs), ptr[reg_ptr_param_i + inner_offset]);
-        inner_offset += block_size;
-      }
-
-      // compute out
-      for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
-        vmulps(ymm_t(reg_i), ymm_t(reg_i), ymm_lr);
-        vsubps(ymm_t(reg_i + num_regs), ymm_t(reg_i + num_regs), ymm_t(reg_i));
-      }
-
-      // save out
-      inner_offset = w_offset;
-      for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
-        vmovups(ptr[reg_ptr_out_i + inner_offset], ymm_t(reg_i + num_regs));
-        inner_offset += block_size;
-      }
-      w_offset += (block_size * num_regs);
-    }
-
-    add(reg_ptr_grad_i, width_size);
-    add(reg_ptr_rows_i, sizeof(int64_t));
-    cmp(reg_ptr_rows_i, reg_rows_size_in_byte);
-    jl(l_next_row, T_NEAR);
-  }
-
-  postCode();
-}
-
-class SgdCreator : public JitCodeCreator<sgd_attr_t> {
- public:
-  bool CanBeUsed(const sgd_attr_t& attr) const override {
-    return x86::MayIUse(x86::avx) && attr.grad_width % YMM_FLOAT_BLOCK == 0;
-  }
-  size_t CodeSize(const sgd_attr_t& attr) const override {
-    return 96 + (attr.grad_width / YMM_FLOAT_BLOCK) * 32 * 8;
-  }
-  std::unique_ptr<GenBase> CreateJitCode(
-      const sgd_attr_t& attr) const override {
-    PADDLE_ENFORCE_EQ(attr.param_width, attr.grad_width);
-    PADDLE_ENFORCE_LE(attr.selected_rows_size, attr.grad_height);
-    PADDLE_ENFORCE_GE(attr.selected_rows_size, 0);
-    return make_unique<SgdJitCode>(attr, CodeSize(attr));
-  }
-};
-
-}  // namespace gen
-}  // namespace jit
-}  // namespace lite
-}  // namespace paddle
-
-namespace gen = paddle::lite::jit::gen;
-
-REGISTER_JITKERNEL_GEN(kSgd, gen::SgdCreator);
diff --git a/lite/backends/x86/jit/gen/sgd.h b/lite/backends/x86/jit/gen/sgd.h
deleted file mode 100644
index 303d94f2ab..0000000000
--- a/lite/backends/x86/jit/gen/sgd.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
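SgdJitCode::genCode above vectorizes a selected-rows SGD step. A scalar restatement under the sgd_attr_t definition from kernel_base.h; SgdRef is a hypothetical name, and lr points at a single learning-rate scalar just as param_lr does in the generated code:

#include <cstdint>

void SgdRef(const float* lr, const float* param, const float* grad,
            const int64_t* rows, float* out, const sgd_attr_t* attr) {
  // attr->param_width == attr->grad_width is enforced by SgdCreator.
  for (int64_t i = 0; i < attr->selected_rows_size; ++i) {
    const int64_t row = rows[i];
    for (int64_t j = 0; j < attr->grad_width; ++j) {
      out[row * attr->param_width + j] =
          param[row * attr->param_width + j] -
          lr[0] * grad[i * attr->grad_width + j];
    }
  }
}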
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ - -#pragma once - -#include -#include "glog/logging.h" -#include "lite/backends/x86/jit/gen/jitcode.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace gen { - -class SgdJitCode : public JitCode { - public: - explicit SgdJitCode(const sgd_attr_t& attr, - size_t code_size = 256 * 1024, - void* code_ptr = nullptr) - : JitCode(code_size, code_ptr), w_(attr.grad_width) { - this->genCode(); - } - - DECLARE_JIT_CODE(SgdJitCode); - void genCode() override; - - private: - int w_; - reg64_t param_lr{abi_param1}; - reg64_t param_param{abi_param2}; - reg64_t param_grad{abi_param3}; - reg64_t param_rows{abi_param4}; - reg64_t param_out{abi_param5}; - reg64_t param_attr{abi_param6}; - - ymm_t ymm_lr = ymm_t(15); - - reg64_t reg_ptr_grad_i{r10}; - reg64_t reg_ptr_rows_i{r11}; - reg64_t reg_rows_size_in_byte{r12}; - reg64_t reg_row{r13}; - reg64_t reg_ptr_param_i{r14}; - reg64_t reg_ptr_out_i{r15}; -}; - -} // namespace gen -} // namespace jit -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/jit/gen/vbroadcast.cc b/lite/backends/x86/jit/gen/vbroadcast.cc deleted file mode 100644 index 9e02dca8c4..0000000000 --- a/lite/backends/x86/jit/gen/vbroadcast.cc +++ /dev/null @@ -1,91 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
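The six abi_param registers in SgdJitCode map one-to-one onto SgdTuple<float>::func_type from kernel_base.h. A hedged sketch of how such a kernel would be fetched and invoked through GenBase::getCode; the setup around it is assumed, not taken from the deleted sources:

#include <cstdint>

using SgdFunc = void (*)(const float*, const float*, const float*,
                         const int64_t*, float*, const sgd_attr_t*);
// SgdJitCode jit(attr);                // emits code into its buffer
// SgdFunc f = jit.getCode<SgdFunc>();  // cast provided by GenBase
// f(&lr, param, grad, rows, out, &attr);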
*/ - -#include "lite/backends/x86/jit/gen/vbroadcast.h" -#include -#include -#include "lite/backends/x86/jit/registry.h" -#include "lite/utils/paddle_enforce.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace gen { - -void VBroadcastJitCode::genCode() { - preCode(); - constexpr int block = YMM_FLOAT_BLOCK; - constexpr int max_num_regs = 16; - const int num_block = w_ / block; - const int num_groups = num_block / max_num_regs; - const size_t block_size = sizeof(float) * block; - std::vector groups(num_groups, max_num_regs); - int rest_num_regs = num_block % max_num_regs; - if (rest_num_regs > 0) { - groups.push_back(rest_num_regs); - } - - // protect param_h - mov(reg_height, param_h); - Label l_next_h; - xor_(reg_h_i, reg_h_i); - mov(reg_ptr_dst_i, param_dst); - L(l_next_h); - { - mov(reg_ptr_src_i, param_src); - for (int num_regs : groups) { - size_t w_offset = 0; - for (int reg_i = 0; reg_i < num_regs; ++reg_i) { - vmovups(ymm_t(reg_i), ptr[reg_ptr_src_i + w_offset]); - w_offset += block_size; - } - add(reg_ptr_src_i, num_regs * block_size); - - w_offset = 0; - for (int reg_i = 0; reg_i < num_regs; ++reg_i) { - vmovups(ptr[reg_ptr_dst_i + w_offset], ymm_t(reg_i)); - w_offset += block_size; - } - add(reg_ptr_dst_i, num_regs * block_size); - } // end of groups - inc(reg_h_i); - cmp(reg_h_i, reg_height); - jl(l_next_h, T_NEAR); - } // end of l_next_h - - postCode(); -} - -class VBroadcastCreator : public JitCodeCreator { - public: - bool CanBeUsed(const int64_t& w) const override { - return x86::MayIUse(x86::avx) && w % YMM_FLOAT_BLOCK == 0; - } - size_t CodeSize(const int64_t& w) const override { - return 96 + (w / YMM_FLOAT_BLOCK) * 16 * 8; - } - std::unique_ptr CreateJitCode(const int64_t& w) const override { - PADDLE_ENFORCE_GT(w, 0); - return make_unique(w, CodeSize(w)); - } -}; - -} // namespace gen -} // namespace jit -} // namespace lite -} // namespace paddle - -namespace gen = paddle::lite::jit::gen; - -REGISTER_JITKERNEL_GEN(kVBroadcast, gen::VBroadcastCreator); diff --git a/lite/backends/x86/jit/gen/vbroadcast.h b/lite/backends/x86/jit/gen/vbroadcast.h deleted file mode 100644 index 39bcd4965f..0000000000 --- a/lite/backends/x86/jit/gen/vbroadcast.h +++ /dev/null @@ -1,54 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
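VBroadcastJitCode::genCode above tiles one w-wide source row into h destination rows, one register group at a time. The scalar semantics, with VBroadcastRef as a hypothetical name:

#include <cstdint>

void VBroadcastRef(const float* x, float* y, int64_t h, int64_t w) {
  for (int64_t i = 0; i < h; ++i)
    for (int64_t j = 0; j < w; ++j) y[i * w + j] = x[j];
}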
*/ - -#pragma once - -#include -#include "glog/logging.h" -#include "lite/backends/x86/jit/gen/jitcode.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace gen { - -class VBroadcastJitCode : public JitCode { - public: - explicit VBroadcastJitCode(const int64_t& w, - size_t code_size = 256 * 1024, - void* code_ptr = nullptr) - : JitCode(code_size, code_ptr), w_(w) { - this->genCode(); - } - - DECLARE_JIT_CODE(VBroadcastJitCode); - void genCode() override; - - private: - int w_; - reg64_t param_src{abi_param1}; - reg64_t param_dst{abi_param2}; - reg64_t param_h{abi_param3}; - reg64_t param_w{abi_param4}; - - reg64_t reg_height{r9}; - reg64_t reg_h_i{r10}; - reg64_t reg_ptr_src_i{r11}; - reg64_t reg_ptr_dst_i{r12}; -}; - -} // namespace gen -} // namespace jit -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/jit/gen_base.cc b/lite/backends/x86/jit/gen_base.cc deleted file mode 100644 index 38250d533d..0000000000 --- a/lite/backends/x86/jit/gen_base.cc +++ /dev/null @@ -1,95 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ - -#include "lite/backends/x86/jit/gen_base.h" -#include -#include -#include -#include -// #include "paddle/fluid/memory/allocation/cpu_allocator.h" // for -// posix_memalign -#include "lite/backends/x86/cpu_info.h" -#include "lite/backends/x86/jit/macro.h" -#include "lite/utils/paddle_enforce.h" - -#ifndef _WIN32 -#define posix_memalign_free free -#endif - -DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file"); - -namespace paddle { -namespace lite { -namespace jit { - -// refer do not need CanBeUsed, it would be the last one. -void GenBase::dumpCode(const unsigned char* code) const { - if (code) { - static int counter = 0; - std::ostringstream filename; - filename << "paddle_jitcode_" << name() << "." << counter << ".bin"; - counter++; - std::ofstream fout(filename.str(), std::ios::out); - if (fout.is_open()) { - fout.write(reinterpret_cast(code), this->getSize()); - fout.close(); - } - } -} - -void* GenBase::operator new(size_t size) { - void* ptr; - constexpr size_t alignment = 32ul; - PADDLE_ENFORCE_EQ(posix_memalign(&ptr, alignment, size), - 0, - "GenBase Alloc %ld error!", - size); - PADDLE_ENFORCE(ptr, "Fail to allocate GenBase CPU memory: size = %d .", size); - return ptr; -} - -void GenBase::operator delete(void* ptr) { posix_memalign_free(ptr); } - -std::vector packed_groups(int n, int k, int* block_out, int* rest_out) { - int block; - int max_num_regs; - if (x86::MayIUse(x86::avx512f)) { - block = ZMM_FLOAT_BLOCK; - max_num_regs = 32; - } else { - block = YMM_FLOAT_BLOCK; - max_num_regs = 16; - } - // one for x, one for y, others for z - const int max_used_regs_for_n = max_num_regs - 2; - const int aligned_n = n % block == 0 ? 
n : (n / block + 1) * block;
-  const int num_block = aligned_n / block;
-  const int num_groups = num_block / max_used_regs_for_n;
-  std::vector<int> groups(num_groups, max_used_regs_for_n);
-  int rest_num_regs = num_block % max_used_regs_for_n;
-  if (rest_num_regs != 0) {
-    groups.push_back(rest_num_regs);
-  }
-  if (block_out) {
-    *block_out = block;
-  }
-  if (rest_out) {
-    *rest_out = n % block;
-  }
-  return groups;
-}
-
-}  // namespace jit
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/x86/jit/gen_base.h b/lite/backends/x86/jit/gen_base.h
deleted file mode 100644
index b5f942615a..0000000000
--- a/lite/backends/x86/jit/gen_base.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#pragma once
-
-#include <gflags/gflags.h>
-#include <memory>  // for unique_ptr
-#include <string>
-#include <vector>
-#include "lite/backends/x86/jit/kernel_base.h"
-
-DECLARE_bool(dump_jitcode);
-
-namespace paddle {
-namespace lite {
-namespace jit {
-
-class GenBase : public Kernel {
- public:
-  virtual ~GenBase() = default;
-  virtual std::string name() const = 0;
-  virtual size_t getSize() const = 0;
-  virtual const unsigned char* getCodeInternal() const = 0;
-  const char* ImplType() const override { return "JitCode"; }
-  template <typename Func>
-  Func getCode() const {
-    const unsigned char* code = this->getCodeInternal();
-    if (FLAGS_dump_jitcode) {
-      this->dumpCode(code);
-    }
-    // Note: failed to cast with reinterpret_cast on Mac clang,
-    // then workaround with const_cast. Any better idea is appreciated.
-    return reinterpret_cast<Func>(const_cast<unsigned char*>(code));
-  }
-
-  void* operator new(size_t size);
-  void operator delete(void* ptr);
-  void* operator new[](size_t size) { return operator new(size); }
-  void operator delete[](void* ptr) { operator delete(ptr); }
-
- protected:
-  void dumpCode(const unsigned char* code) const;
-};
-
-// A Creator is used to create the jitcode and save it in the pool.
-// Every JitCode should have one creator.
-class GenCreator {
- public:
-  virtual ~GenCreator() = default;
-};
-
-template <typename Attr>
-class JitCodeCreator : public GenCreator {
- public:
-  virtual ~JitCodeCreator() = default;
-
-  // condition when this jit code can be used.
-  virtual bool CanBeUsed(const Attr& attr) const = 0;
-
-  // estimate this code size
-  virtual size_t CodeSize(const Attr& attr) const = 0;
-
-  // create this code
-  virtual std::unique_ptr<GenBase> CreateJitCode(const Attr& attr) const = 0;
-};
-
-// unify the method of packing groups:
-// output the packed groups that are used in weights, the block size and the
-// rest size
-std::vector<int> packed_groups(int n,
-                               int k,
-                               int* block = nullptr,
-                               int* rest = nullptr);
-
-}  // namespace jit
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/x86/jit/helper.cc b/lite/backends/x86/jit/helper.cc
deleted file mode 100644
index 8322f7ebd2..0000000000
--- a/lite/backends/x86/jit/helper.cc
+++ /dev/null
@@ -1,139 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
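A worked example of packed_groups under AVX-only assumptions (block = 8, 16 YMM registers, two reserved for x and y, so at most 14 blocks per group); the numbers follow directly from the arithmetic above:

#include <vector>

void PackedGroupsExample() {
  // n = 200: aligned_n = 200, num_block = 25 -> groups = {14, 11}, rest = 0
  // n = 20:  aligned_n = 24,  num_block = 3  -> groups = {3},      rest = 4
  int block = 0, rest = 0;
  std::vector<int> groups = packed_groups(/*n=*/20, /*k=*/8, &block, &rest);
  // groups == {3}, block == 8, rest == 4 on a machine without AVX512F
}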
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ - -#include "lite/backends/x86/jit/helper.h" -#include // tolower -#include -#include -#include "lite/utils/paddle_enforce.h" - -namespace paddle { -namespace lite { -namespace jit { - -#define ONE_CASE(key) \ - case key: \ - return #key - -const char* to_string(KernelType kt) { - switch (kt) { - ONE_CASE(kNone); - ONE_CASE(kVMul); - ONE_CASE(kVAdd); - ONE_CASE(kVAddRelu); - ONE_CASE(kVSub); - ONE_CASE(kVScal); - ONE_CASE(kStrideScal); - ONE_CASE(kVAddBias); - ONE_CASE(kVRelu); - ONE_CASE(kVBroadcast); - ONE_CASE(kVCopy); - ONE_CASE(kVIdentity); - ONE_CASE(kVExp); - ONE_CASE(kVSquare); - ONE_CASE(kVSigmoid); - ONE_CASE(kVTanh); - ONE_CASE(kLSTMCtHt); - ONE_CASE(kLSTMC1H1); - ONE_CASE(kGRUH1); - ONE_CASE(kGRUHtPart1); - ONE_CASE(kGRUHtPart2); - ONE_CASE(kCRFDecoding); - ONE_CASE(kLayerNorm); - ONE_CASE(kNCHW16CMulNC); - ONE_CASE(kSeqPool); - ONE_CASE(kMatMul); - ONE_CASE(kHMax); - ONE_CASE(kHSum); - ONE_CASE(kStrideASum); - ONE_CASE(kSoftmax); - ONE_CASE(kEmbSeqPool); - ONE_CASE(kSgd); - default: - LOG(FATAL) << "Not support type: %d, or forget to add it."; - return "NOT JITKernel"; - } - return nullptr; -} - -const char* to_string(SeqPoolType tp) { - switch (tp) { - ONE_CASE(kNonePoolType); - ONE_CASE(kSum); - ONE_CASE(kAvg); - ONE_CASE(kSqrt); - default: - LOG(FATAL) << "Not support type: %d, or forget to add it."; - return "NOT PoolType"; - } - return nullptr; -} -#undef ONE_CASE - -KernelType to_kerneltype(const std::string& act) { - std::string lower = act; - std::transform(lower.begin(), lower.end(), lower.begin(), ::tolower); - if (lower == "relu" || lower == "vrelu") { - return kVRelu; - } else if (lower == "identity" || lower == "videntity" || lower == "") { - return kVIdentity; - } else if (lower == "exp" || lower == "vexp") { - return kVExp; - } else if (lower == "sigmoid" || lower == "vsigmoid") { - return kVSigmoid; - } else if (lower == "tanh" || lower == "vtanh") { - return kVTanh; - } - LOG(FATAL) << "Not support type: %s, or forget to add this case"; - return kNone; -} - -template <> -void pack_weights(const float* src, float* dst, int n, int k) { - int block, rest; - const auto groups = packed_groups(n, k, &block, &rest); - std::for_each(groups.begin(), groups.end(), [&](int i) { - PADDLE_ENFORCE_GT(i, 0, "each element of groups should be larger than 0."); - }); - int sum = std::accumulate(groups.begin(), groups.end(), 0); - std::memset(dst, 0, k * sum * block * sizeof(float)); - PADDLE_ENFORCE_GE( - sum * block, n, "The packed n should be equal to or larger than n"); - - const int block_len = sizeof(float) * block; - int n_offset = 0; - - for (size_t g = 0; g < groups.size(); ++g) { - const float* from = src + n_offset; - for (int j = 0; j < k; ++j) { - size_t copy_sz = groups[g] * block_len; - if (g == groups.size() - 1 && rest != 0) { - copy_sz = (groups[g] - 1) * block_len + rest * sizeof(float); - } - std::memcpy(dst, from + j * n, copy_sz); - dst += groups[g] * block; - } - n_offset += groups[g] * block; - } 
-}
-
-template <typename T>
-typename std::enable_if<!std::is_same<T, float>::value>::type pack_weights(
-    const T* src, T* dst, int n, int k) {
-  LOG(FATAL) << "Only support pack with float type.";
-}
-
-}  // namespace jit
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/x86/jit/helper.h b/lite/backends/x86/jit/helper.h
deleted file mode 100644
index b21be9466c..0000000000
--- a/lite/backends/x86/jit/helper.h
+++ /dev/null
@@ -1,267 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#pragma once
-
-#include <iostream>
-#include <string>
-#include <unordered_map>
-#include <utility>  // for std::move
-#include <vector>
-#include "lite/backends/x86/jit/gen_base.h"
-#include "lite/backends/x86/jit/kernel_base.h"
-#include "lite/backends/x86/jit/kernel_key.h"
-#include "lite/backends/x86/jit/kernel_pool.h"
-#include "lite/utils/paddle_enforce.h"
-
-namespace paddle {
-namespace lite {
-namespace jit {
-
-template <typename KernelTuple, typename PlaceType>
-inline typename std::enable_if<
-    std::is_same<PlaceType, lite::fluid::CPUPlace>::value,
-    const Kernel*>::type
-GetJitCode(const typename KernelTuple::attr_type& attr) {
-  using Attr = typename KernelTuple::attr_type;
-  int64_t key = JitCodeKey<Attr>(attr);
-  auto& codes = JitCodePool<KernelTuple::kernel_type>::Instance();
-  if (codes.Has(key)) {
-    return codes.AllKernels().at(key).get();
-  }
-
-  // a creator is not related to attr, so KernelKey can be used as the key
-  KernelKey kkey(KernelTuple::kernel_type, PlaceType());
-  // pool: (KernelKey(type, place), vector<GenCreatorPtr>)
-  auto& creator_map = JitCodeCreatorPool::Instance().AllCreators();
-  auto iter = creator_map.find(kkey);
-  if (iter != creator_map.end()) {
-    auto& creators = iter->second;
-    for (auto& cur : creators) {
-      auto i = dynamic_cast<const JitCodeCreator<Attr>*>(cur.get());
-      if (i && i->CanBeUsed(attr)) {
-        auto p = i->CreateJitCode(attr);
-        if (p) {
-          auto res = p.get();
-          codes.Insert(key, std::move(p));
-          return res;
-        }
-      }
-    }
-  }
-  return nullptr;
-}
-
-template <typename KernelTuple, typename PlaceType>
-inline typename std::enable_if<
-    !std::is_same<PlaceType, lite::fluid::CPUPlace>::value,
-    const Kernel*>::type
-GetJitCode(const typename KernelTuple::attr_type& attr) {
-  return nullptr;
-}
-
-// Refer code is not related to attr; this overload exists just for the cast.
-// Refer is always on CPUPlace
-template <typename KernelTuple>
-inline const Kernel* GetReferKernel() {
-  auto& ref_pool = ReferKernelPool::Instance().AllKernels();
-  KernelKey kkey(KernelTuple::kernel_type, lite::fluid::CPUPlace());
-  auto ref_iter = ref_pool.find(kkey);
-  PADDLE_ENFORCE(ref_iter != ref_pool.end(),
-                 "Every Kernel should have reference function.");
-  auto& ref_impls = ref_iter->second;
-  for (auto& impl : ref_impls) {
-    auto i = dynamic_cast<const ReferKernel<KernelTuple>*>(impl.get());
-    if (i) {
-      return i;
-    }
-  }
-  return nullptr;
-}
-
-template <typename KernelTuple>
-inline typename KernelTuple::func_type GetReferFunc() {
-  auto ker = GetReferKernel<KernelTuple>();
-  auto p = dynamic_cast<const ReferKernel<KernelTuple>*>(ker);
-  PADDLE_ENFORCE(p, "The Refer kernel should exist");
-  return p->GetFunc();
-}
-
-// Return all Kernels that can be used
-template <typename KernelTuple, typename PlaceType>
-std::vector<const Kernel*> GetAllCandidateKernels(
-    const typename KernelTuple::attr_type& attr) {
-  // the search order should be jitcode > more > refer
-  std::vector<const Kernel*> res;
-  auto jitker = GetJitCode<KernelTuple, PlaceType>(attr);
-  if (jitker) {
-    res.emplace_back(jitker);
-  }
-
-  // more kernelpool: (KernelKey(type, place), vector<KernelPtr>)
-  KernelKey kkey(KernelTuple::kernel_type, PlaceType());
-  auto& pool = KernelPool::Instance().AllKernels();
-  auto iter = pool.find(kkey);
-  if (iter != pool.end()) {
-    auto& impls = iter->second;
-    for (auto& impl : impls) {
-      auto i = dynamic_cast<const KernelMore<KernelTuple>*>(impl.get());
-      if (i && i->CanBeUsed(attr)) {
-        res.emplace_back(i);
-      }
-    }
-  }
-
-  // The last implementation should be the reference function on CPUPlace.
-  auto ref = GetReferKernel<KernelTuple>();
-  PADDLE_ENFORCE(ref != nullptr, "Refer Kernel can not be empty.");
-  res.emplace_back(ref);
-  return res;
-}
-
-template <typename KernelTuple, typename PlaceType>
-std::vector<std::pair<std::string, typename KernelTuple::func_type>>
-GetAllCandidateFuncsWithTypes(const typename KernelTuple::attr_type& attr) {
-  using Func = typename KernelTuple::func_type;
-  auto kers = GetAllCandidateKernels<KernelTuple, PlaceType>(attr);
-  std::vector<std::pair<std::string, Func>> res;
-  for (auto k : kers) {
-    std::string name = k->ImplType();
-    if (name == "JitCode") {
-      auto i = dynamic_cast<const GenBase*>(k);
-      PADDLE_ENFORCE(i, "jitcode kernel cast can not fail.");
-      res.emplace_back(std::make_pair(name, i->template getCode<Func>()));
-    } else {
-      auto i = dynamic_cast<const KernelMore<KernelTuple>*>(k);
-      PADDLE_ENFORCE(i, "kernel cast can not fail.");
-      res.emplace_back(std::make_pair(name, i->GetFunc()));
-    }
-  }
-  return res;
-}
-
-template <typename KernelTuple, typename PlaceType>
-std::vector<typename KernelTuple::func_type> GetAllCandidateFuncs(
-    const typename KernelTuple::attr_type& attr) {
-  auto funcs = GetAllCandidateFuncsWithTypes<KernelTuple, PlaceType>(attr);
-  std::vector<typename KernelTuple::func_type> res;
-  for (auto& i : funcs) {
-    res.emplace_back(i.second);
-  }
-  return res;
-}
-
-template <typename KernelTuple, typename PlaceType>
-typename KernelTuple::func_type GetDefaultBestFunc(
-    const typename KernelTuple::attr_type& attr) {
-  auto funcs = GetAllCandidateFuncs<KernelTuple, PlaceType>(attr);
-  PADDLE_ENFORCE_GE(funcs.size(), 1UL);
-  // Here one could run a runtime benchmark for this attr and return the best.
-  // For now just take the first one, since candidates are searched in order
-  // and tuned offline.
-  return funcs[0];
-}
-
-template <typename KernelTuple, typename PlaceType>
-class KernelFuncs {
- public:
-  KernelFuncs() = default;
-  static KernelFuncs& Cache() {
-    static thread_local KernelFuncs<KernelTuple, PlaceType> g_func_cache;
-    return g_func_cache;
-  }
-
-  // the exposed interface to use
-  typename KernelTuple::func_type At(
-      const typename KernelTuple::attr_type& attr) {
-    // Note: this key may not be ideal, since not all kernels have jitcode.
-    int64_t key = JitCodeKey<typename KernelTuple::attr_type>(attr);
-    if (Has(key)) {
-      return funcs_.at(key);
-    }
-    // If this attr is not in the cache yet, get the default best
-    auto func = GetDefaultBestFunc<KernelTuple, PlaceType>(attr);
-    Insert(key, func);
-    return func;
-  }
-
-  typename KernelTuple::func_type operator[](
-      const typename KernelTuple::attr_type& attr) {
-    return At(attr);
-  }
-
- protected:
-  bool Has(int64_t key) const { return funcs_.find(key) != funcs_.end(); }
-  void Insert(int64_t key, typename KernelTuple::func_type func) {
-    funcs_.emplace(key, func);
-  }
-
- private:
-  std::unordered_map<int64_t, typename KernelTuple::func_type> funcs_;
-};
-
-const char* to_string(KernelType kt);
-const char* to_string(SeqPoolType kt);
-
-KernelType to_kerneltype(const std::string& act);
-
-inline std::ostream& operator<<(std::ostream& os, const lstm_attr_t& attr) {
-  os << "dim_size[" << attr.d << "],act_gate[" << to_string(attr.act_gate)
-     << "],act_cand[" << to_string(attr.act_cand) << "],act_cell["
-     << to_string(attr.act_cell) << "],use_peephole["
-     << (attr.use_peephole ?
"True" : "False") << "]"; - return os; -} - -inline std::ostream& operator<<(std::ostream& os, const gru_attr_t& attr) { - os << "dim_size[" << attr.d << "],act_gate[" << to_string(attr.act_gate) - << "],act_cand[" << to_string(attr.act_cand) << "]"; - return os; -} - -inline std::ostream& operator<<(std::ostream& os, const seq_pool_attr_t& attr) { - os << "height_size[" << attr.h << "],width_size[" << attr.w << "],pool_type[" - << to_string(attr.type) << "]"; - return os; -} - -inline std::ostream& operator<<(std::ostream& os, - const emb_seq_pool_attr_t& attr) { - os << "table_height[" << attr.table_height << "],table_width[" - << attr.table_width << "],index_height[" << attr.index_height - << "],index_width[" << attr.index_width << "],output_width[" - << attr.out_width << "],pool_type[" << to_string(attr.pool_type) << "]"; - return os; -} - -inline std::ostream& operator<<(std::ostream& os, const sgd_attr_t& attr) { - os << "param_height[" << attr.param_height << "],param_width[" - << attr.param_width << "],grad_height[" << attr.grad_height - << "],grad_width[" << attr.grad_width << "],selected_rows_size[" - << attr.selected_rows_size << "]"; - return os; -} - -inline std::ostream& operator<<(std::ostream& os, const matmul_attr_t& attr) { - os << "M[" << attr.m << "],N[" << attr.n << "],K[" << attr.k << "]"; - return os; -} - -// expose the method to pack matmul weight -template -void pack_weights(const T* src, T* dst, int n, int k); - -} // namespace jit -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/jit/kernel_base.h b/lite/backends/x86/jit/kernel_base.h deleted file mode 100644 index dbe44a78ac..0000000000 --- a/lite/backends/x86/jit/kernel_base.h +++ /dev/null @@ -1,365 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/
-
-#pragma once
-#include <cstdint>
-#include "lite/backends/x86/jit/macro.h"
-
-namespace paddle {
-namespace lite {
-namespace jit {
-
-typedef enum {
-  kNone = 0,
-  // sort by alphabet
-  kCRFDecoding = 1,
-  kEmbSeqPool = 2,
-  kGRUH1,
-  kGRUHtPart1,
-  kGRUHtPart2,
-  kHSum,  // horizontal sum
-  kHMax,  // horizontal max
-  kLSTMCtHt,
-  kLSTMC1H1,
-  kLayerNorm,
-  kMatMul,
-  kNCHW16CMulNC,
-  kSeqPool,
-  kSoftmax,
-  kStrideASum,
-  kStrideScal,
-  kVAdd,
-  kVAddBias,
-  kVAddRelu,
-  kVBroadcast,
-  kVCopy,
-  kVExp,
-  kVIdentity,
-  kVMul,
-  kVRelu,
-  kVScal,
-  kSgd,
-  kVSigmoid,
-  kVSquare,
-  kVSub,
-  kVTanh,
-} KernelType;
-
-typedef enum {
-  kNonePoolType = 0,
-  kSum = 1,
-  kAvg,
-  kSqrt,
-} SeqPoolType;
-
-// x, y, z, n
-template <typename T>
-struct XYZNTuple {
-  typedef T data_type;
-  typedef int attr_type;
-  typedef void (*func_type)(const T*, const T*, T*, int);
-};
-
-// a, x, y, n
-template <typename T>
-struct AXYNTuple : public XYZNTuple<T> {};
-
-// a, x, y, n, stride
-template <typename T>
-struct AXYNSTuple {
-  typedef T data_type;
-  typedef int attr_type;
-  typedef void (*func_type)(const T*, const T*, T*, int, int);
-};
-
-// x, y, n
-template <typename T>
-struct XYNTuple {
-  typedef T data_type;
-  typedef int attr_type;
-  typedef void (*func_type)(const T*, T*, int);
-};
-
-// x, returned value, n
-template <typename T>
-struct XRNTuple : public XYNTuple<T> {};
-
-// x, returned value, n, stride
-template <typename T>
-struct XRNSTuple {
-  typedef T data_type;
-  typedef int attr_type;
-  typedef void (*func_type)(const T*, T*, int, int);
-};
-
-#define DECLARE_KERNELTUPLE(kernel_tuple, type)        \
-  template <typename T>                                \
-  struct type##Tuple : public kernel_tuple<T> {        \
-    static constexpr KernelType kernel_type = k##type; \
-  }
-
-// Each Tuple corresponds to a KernelType
-DECLARE_KERNELTUPLE(XYZNTuple, VMul);
-DECLARE_KERNELTUPLE(XYZNTuple, VAdd);
-DECLARE_KERNELTUPLE(XYZNTuple, VAddRelu);
-DECLARE_KERNELTUPLE(XYZNTuple, VSub);
-
-DECLARE_KERNELTUPLE(AXYNTuple, VScal);
-DECLARE_KERNELTUPLE(AXYNTuple, VAddBias);
-
-DECLARE_KERNELTUPLE(AXYNSTuple, StrideScal);
-
-DECLARE_KERNELTUPLE(XYNTuple, VRelu);
-DECLARE_KERNELTUPLE(XYNTuple, VIdentity);
-DECLARE_KERNELTUPLE(XYNTuple, VSquare);
-DECLARE_KERNELTUPLE(XYNTuple, VExp);
-DECLARE_KERNELTUPLE(XYNTuple, VSigmoid);
-DECLARE_KERNELTUPLE(XYNTuple, VTanh);
-DECLARE_KERNELTUPLE(XYNTuple, VCopy);
-
-DECLARE_KERNELTUPLE(XRNTuple, HMax);
-DECLARE_KERNELTUPLE(XRNTuple, HSum);
-
-DECLARE_KERNELTUPLE(XRNSTuple, StrideASum);
-
-typedef struct {
-  void* gates;  // gates: x_ch, x_ih, x_fh, x_oh
-  const void* ct_1;
-  void* ct;
-  void* ht;
-  /* weight_peephole and checked data are only used in peephole*/
-  const void* wp{nullptr};  //  W_ic, W_fc, W_oc
-  void* checked{nullptr};   // size: 2 * d
-} lstm_t;
-
-typedef struct {
-  void* gates;  // gates: {x_update, x_reset; x_state}
-  const void* ht_1;
-  void* ht;
-} gru_t;
-
-struct rnn_attr_s {
-  int d;
-  KernelType act_gate, act_cand;
-  rnn_attr_s() = default;
-  explicit rnn_attr_s(int _d, KernelType _act_gate, KernelType _act_cand)
-      : d(_d), act_gate(_act_gate), act_cand(_act_cand) {}
-};
-
-struct lstm_attr_s : public rnn_attr_s {
-  bool use_peephole;
-  KernelType act_cell;
-  lstm_attr_s() = default;
-  explicit lstm_attr_s(int _d,
-                       KernelType _act_gate,
-                       KernelType _act_cand,
-                       KernelType _act_cell,
-                       bool _use_peephole = false)
-      : rnn_attr_s(_d, _act_gate, _act_cand),
-        use_peephole(_use_peephole),
-        act_cell(_act_cell) {}
-};
-
-typedef struct rnn_attr_s gru_attr_t;
-typedef struct lstm_attr_s lstm_attr_t;
-
-template <typename T>
-struct LSTMTuple {
-  typedef T data_type;
-  typedef lstm_attr_t attr_type;
typedef void (*func_type)(lstm_t*, const lstm_attr_t*); -}; - -template -struct GRUTuple { - typedef T data_type; - typedef gru_attr_t attr_type; - typedef void (*func_type)(gru_t*, const gru_attr_t*); -}; - -DECLARE_KERNELTUPLE(LSTMTuple, LSTMCtHt); -DECLARE_KERNELTUPLE(LSTMTuple, LSTMC1H1); - -DECLARE_KERNELTUPLE(GRUTuple, GRUH1); -DECLARE_KERNELTUPLE(GRUTuple, GRUHtPart1); -DECLARE_KERNELTUPLE(GRUTuple, GRUHtPart2); - -#undef DECLARE_KERNELTUPLE - -template -struct VBroadcastTuple { - static constexpr KernelType kernel_type = kVBroadcast; - typedef T data_type; - typedef int64_t attr_type; - typedef void (*func_type)(const T*, T*, int64_t, int64_t); -}; - -typedef struct seq_pool_attr_s { - int h, w; // h should always be the first one - SeqPoolType type; - seq_pool_attr_s() = default; - explicit seq_pool_attr_s(int width, SeqPoolType pool_type, int height = 1) - : h(height), w(width), type(pool_type) {} -} seq_pool_attr_t; - -template -struct SeqPoolTuple { - static constexpr KernelType kernel_type = kSeqPool; - typedef T data_type; - typedef seq_pool_attr_t attr_type; - typedef void (*func_type)(const T*, T*, const seq_pool_attr_t*); -}; - -typedef struct emb_seq_pool_attr_s { - int64_t table_height, table_width; - int64_t index_height, index_width; - int64_t out_width; - SeqPoolType pool_type; - emb_seq_pool_attr_s() = default; - explicit emb_seq_pool_attr_s(int64_t tbl_height, - int64_t tbl_width, - int64_t idx_height, - int64_t idx_width, - int64_t output_width, - SeqPoolType seqpool_type = SeqPoolType::kSum) - : table_height(tbl_height), - table_width(tbl_width), - index_height(idx_height), - index_width(idx_width), - out_width(output_width), - pool_type(seqpool_type) {} -} emb_seq_pool_attr_t; - -template -struct EmbSeqPoolTuple { - static constexpr KernelType kernel_type = kEmbSeqPool; - typedef T data_type; - typedef emb_seq_pool_attr_t attr_type; - typedef void (*func_type)(const T*, - const int64_t*, - T*, - const emb_seq_pool_attr_t*); -}; - -typedef struct sgd_attr_s { - int64_t param_height, param_width; - int64_t grad_height, grad_width; - int64_t selected_rows_size; - sgd_attr_s() = default; - explicit sgd_attr_s(int64_t param_h, - int64_t param_w, - int64_t grad_h, - int64_t grad_w, - int64_t selected_rows_sz) - : param_height(param_h), - param_width(param_w), - grad_height(grad_h), - grad_width(grad_w), - selected_rows_size(selected_rows_sz) {} -} sgd_attr_t; - -template -struct SgdTuple { - static constexpr KernelType kernel_type = kSgd; - typedef T data_type; - typedef sgd_attr_t attr_type; - typedef void (*func_type)( - const T*, const T*, const T*, const int64_t*, T*, const sgd_attr_t*); -}; - -typedef struct matmul_attr_s { - int m, n, k; - void* packed_weight{nullptr}; - matmul_attr_s() = default; - explicit matmul_attr_s(int m_, int n_, int k_, void* packed_weight_ = nullptr) - : m(m_), n(n_), k(k_), packed_weight(packed_weight_) {} -} matmul_attr_t; - -template -struct MatMulTuple { - static constexpr KernelType kernel_type = kMatMul; - typedef T data_type; - typedef matmul_attr_t attr_type; - typedef void (*func_type)(const T*, const T*, T*, const matmul_attr_t*); -}; - -template -struct CRFDecodingTuple { - static constexpr KernelType kernel_type = kCRFDecoding; - typedef T data_type; - typedef int attr_type; - typedef void (*func_type)(const int, const T*, const T*, T*, int*, int); -}; - -template -struct LayerNormTuple { - static constexpr KernelType kernel_type = kLayerNorm; - typedef T data_type; - typedef int attr_type; - typedef void (*func_type)( - T*, 
T*, T*, T*, const T*, const T*, int, const float, int); -}; - -template -struct SoftmaxTuple { - static constexpr KernelType kernel_type = kSoftmax; - typedef T data_type; - typedef int attr_type; - typedef void (*func_type)(const T*, T*, int, int, int); -}; - -// nChw16c = nChw16c .* NC -template -struct NCHW16CMulNCTuple { - static constexpr KernelType kernel_type = kNCHW16CMulNC; - typedef T data_type; - typedef int attr_type; - typedef void (*func_type)(const T*, const T*, T*, int, int); -}; - -// Just for adding to kernel pool without template -class Kernel { - public: - Kernel() = default; - virtual ~Kernel() = default; - virtual const char* ImplType() const = 0; -}; - -template -class KernelMore : public Kernel { - public: - using T = typename KernelTuple::data_type; - using Func = typename KernelTuple::func_type; - using Attr = typename KernelTuple::attr_type; - virtual Func GetFunc() const { return func; } - // specify this kernel can be used, means it should not fail if use it. - virtual bool CanBeUsed(const Attr& attr) const = 0; - - protected: - Func func{nullptr}; -}; - -template -class ReferKernel : public KernelMore { - public: - // Refer code can always be used - bool CanBeUsed(const typename KernelTuple::attr_type& attr) const override { - return true; - } - const char* ImplType() const override { return "Refer"; } -}; - -} // namespace jit -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/jit/kernel_key.cc b/lite/backends/x86/jit/kernel_key.cc deleted file mode 100644 index a6288fcf19..0000000000 --- a/lite/backends/x86/jit/kernel_key.cc +++ /dev/null @@ -1,71 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
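Given the Kernel / KernelMore / ReferKernel contract above, a refer implementation only has to set func in its constructor; CanBeUsed is always true and ImplType reports "Refer". A hypothetical example for the VAddTuple declared earlier, not one of the actual refer kernels:

template <typename T>
void VAddRefer(const T* x, const T* y, T* z, int n) {
  for (int i = 0; i < n; ++i) z[i] = x[i] + y[i];
}

template <typename T>
class VAddReferKernel : public ReferKernel<VAddTuple<T>> {
 public:
  VAddReferKernel() { this->func = VAddRefer<T>; }
};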
*/ - -#include "lite/backends/x86/jit/kernel_key.h" -#include // XXH64: 13.8 GB/s -#include "lite/utils/paddle_enforce.h" - -namespace paddle { -namespace lite { -namespace jit { - -template <> -int64_t JitCodeKey(const int& d) { - return d; -} - -template <> -int64_t JitCodeKey(const int64_t& d) { - return d; -} - -template <> -int64_t JitCodeKey(const gru_attr_t& attr) { - return XXH64(&attr, sizeof(gru_attr_t), 0); -} - -template <> -int64_t JitCodeKey(const lstm_attr_t& attr) { - int keys[5] = {attr.d, - static_cast(attr.act_gate), - static_cast(attr.act_cand), - static_cast(attr.act_cell), - static_cast(attr.use_peephole)}; - return XXH64(keys, sizeof(int) * 5, 0); -} - -template <> -int64_t JitCodeKey(const seq_pool_attr_t& attr) { - int keys[2] = {attr.w, static_cast(attr.type)}; - return XXH64(keys, sizeof(int) * 2, 0); -} - -template <> -int64_t JitCodeKey(const matmul_attr_t& attr) { - return XXH64(&attr, sizeof(int) * 3, 0); // m, n, k -} - -template <> -int64_t JitCodeKey(const emb_seq_pool_attr_t& attr) { - return attr.table_width; -} - -template <> -int64_t JitCodeKey(const sgd_attr_t& attr) { - return attr.grad_width; -} - -} // namespace jit -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/jit/kernel_key.h b/lite/backends/x86/jit/kernel_key.h deleted file mode 100644 index 6df3a20a4b..0000000000 --- a/lite/backends/x86/jit/kernel_key.h +++ /dev/null @@ -1,55 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ - -#pragma once -#include -#include -#include "lite/backends/x86/jit/kernel_base.h" -#include "lite/backends/x86/legacy_place.h" - -namespace paddle { -namespace lite { -namespace jit { - -struct KernelKey { - struct Hash { - size_t operator()(const KernelKey& key) const { - int place = key.place_.which(); // less than 2^8 - int type = static_cast(key.type_) << 8; // less than 2^(32-8) - std::hash hasher; - return hasher(place + type); - } - }; - - KernelType type_; - lite::fluid::Place place_; - - KernelKey(KernelType type, lite::fluid::Place place) - : type_(type), place_(place) {} - size_t hash_key() const { return Hash()(*this); } - - bool operator==(const KernelKey& o) const { - return /*platform::places_are_same_class(place_, o.place_)*/ true && - type_ == o.type_; - } - bool operator!=(const KernelKey& o) const { return !(*this == o); } -}; - -// Every JitCode should have a method to get the key from attribution -template -int64_t JitCodeKey(const Attr& attr); - -} // namespace jit -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/jit/kernel_pool.cc b/lite/backends/x86/jit/kernel_pool.cc deleted file mode 100644 index 43ad20c90c..0000000000 --- a/lite/backends/x86/jit/kernel_pool.cc +++ /dev/null @@ -1,41 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
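Note which attributes the keys above encode: the seq_pool key hashes only w and the pool type, and the sgd key is just grad_width, so attributes that differ only in runtime height reuse the same generated code. An illustration, assuming the attribute definitions from kernel_base.h:

bool SameSeqPoolKey() {
  seq_pool_attr_t a(/*width=*/32, SeqPoolType::kAvg, /*height=*/1);
  seq_pool_attr_t b(/*width=*/32, SeqPoolType::kAvg, /*height=*/100);
  // height is consumed at run time, not baked into the code,
  // so both attrs map to one cached code blob.
  return JitCodeKey<seq_pool_attr_t>(a) == JitCodeKey<seq_pool_attr_t>(b);
}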
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ - -#include "lite/backends/x86/jit/kernel_pool.h" -#include // for shared_ptr -#include -#include - -namespace paddle { -namespace lite { -namespace jit { - -JitCodeCreatorPool& JitCodeCreatorPool::Instance() { - static JitCodeCreatorPool g_creator_pool; - return g_creator_pool; -} - -KernelPool& KernelPool::Instance() { - static KernelPool g_kernel_pool; - return g_kernel_pool; -} - -ReferKernelPool& ReferKernelPool::Instance() { - static ReferKernelPool g_refer_kernel_pool; - return g_refer_kernel_pool; -} - -} // namespace jit -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/jit/kernel_pool.h b/lite/backends/x86/jit/kernel_pool.h deleted file mode 100644 index dc0b1bbf2e..0000000000 --- a/lite/backends/x86/jit/kernel_pool.h +++ /dev/null @@ -1,116 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
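The three process-wide singletons above, together with the thread_local JitCodePool, back the user-facing cache in helper.h. A hedged sketch of a typical call site; the tuple, place type and size d are illustrative choices, not taken from a specific caller in the deleted sources:

void KernelFuncsExample(const float* x, const float* y, float* z, int d) {
  // Resolves to jitcode if a creator accepts d, else a "more" kernel,
  // else the refer implementation; the result is memoized per thread.
  auto f = KernelFuncs<VMulTuple<float>, lite::fluid::CPUPlace>::Cache().At(d);
  f(x, y, z, d);  // z[i] = x[i] * y[i]
}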
*/ - -#pragma once - -#include // for unique_ptr -#include -#include -#include // for move -#include -#include "lite/backends/x86/jit/gen_base.h" -#include "lite/backends/x86/jit/kernel_base.h" -#include "lite/backends/x86/jit/kernel_key.h" - -namespace paddle { -namespace lite { -namespace jit { - -template -class JitCodePool { - typedef std::unique_ptr GenBasePtr; - typedef std::unordered_map JitCodeMap; - - public: - JitCodePool() = default; - static JitCodePool& Instance() { - static thread_local JitCodePool g_jit_codes; - return g_jit_codes; - } - - const JitCodeMap& AllKernels() { return codes_; } - - bool Has(int64_t key) const { return codes_.find(key) != codes_.end(); } - - void Insert(int64_t key, GenBasePtr value) { - codes_.emplace(key, std::move(value)); - } - - private: - JitCodeMap codes_; -}; - -class JitCodeCreatorPool { - typedef std::unique_ptr GenCreatorPtr; - typedef std::unordered_map, - KernelKey::Hash> - GenCreatorPtrMap; - - public: - JitCodeCreatorPool() = default; - static JitCodeCreatorPool& Instance(); - GenCreatorPtrMap& AllCreators() { return creators_; } - void Insert(const KernelKey& key, GenCreatorPtr value) { - if (creators_.find(key) == creators_.end()) { - creators_.emplace(key, std::vector()); - } - creators_.at(key).emplace_back(std::move(value)); - } - - private: - GenCreatorPtrMap creators_; -}; - -typedef std::unique_ptr KernelPtr; -typedef std::unordered_map, KernelKey::Hash> - KernelMap; - -class KernelPool { - public: - static KernelPool& Instance(); - KernelPool() = default; - KernelMap& AllKernels() { return pool_; } - void Insert(const KernelKey& key, KernelPtr value) { - if (pool_.find(key) == pool_.end()) { - pool_.emplace(key, std::vector()); - } - pool_.at(key).emplace_back(std::move(value)); - } - - private: - KernelMap pool_; -}; - -// Every kernel should have refer code and it should be used in unit tests, -// so refer kernels should have it's independent kernel pool -class ReferKernelPool { - public: - static ReferKernelPool& Instance(); - ReferKernelPool() = default; - KernelMap& AllKernels() { return pool_; } - void Insert(const KernelKey& key, KernelPtr value) { - if (pool_.find(key) == pool_.end()) { - pool_.emplace(key, std::vector()); - } - pool_.at(key).emplace_back(std::move(value)); - } - - private: - KernelMap pool_; -}; - -} // namespace jit -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/jit/macro.h b/lite/backends/x86/jit/macro.h deleted file mode 100644 index 703342252f..0000000000 --- a/lite/backends/x86/jit/macro.h +++ /dev/null @@ -1,32 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/
-
-#pragma once
-#include
-
-namespace paddle {
-namespace lite {
-namespace jit {
-
-#define SIGMOID_THRESHOLD_MIN -40.0
-#define SIGMOID_THRESHOLD_MAX 13.0
-#define EXP_MAX_INPUT 40.0
-
-#define XMM_FLOAT_BLOCK 4
-#define YMM_FLOAT_BLOCK 8
-#define ZMM_FLOAT_BLOCK 16
-
-}  // namespace jit
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/x86/jit/more/CMakeLists.txt b/lite/backends/x86/jit/more/CMakeLists.txt
deleted file mode 100644
index 2ddbbcd16a..0000000000
--- a/lite/backends/x86/jit/more/CMakeLists.txt
+++ /dev/null
@@ -1,18 +0,0 @@
-
-function(USE_JITKERNEL_MORE TARGET TYPE)
-  file(APPEND ${jit_file} "USE_JITKERNEL_MORE(${TARGET} ${TYPE});\n")
-endfunction()
-
-# enable it later
-if(WITH_MKLML)
-  add_subdirectory(mkl)
-endif()
-
-if(WITH_AVX)
-  add_subdirectory(intrinsic)
-endif()
-
-# mix should be last
-add_subdirectory(mix)
-
-set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} PARENT_SCOPE)
diff --git a/lite/backends/x86/jit/more/intrinsic/CMakeLists.txt b/lite/backends/x86/jit/more/intrinsic/CMakeLists.txt
deleted file mode 100644
index 468937a4f6..0000000000
--- a/lite/backends/x86/jit/more/intrinsic/CMakeLists.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-
-file(GLOB jit_kernel_cc_intrinsic RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc")
-cc_library(jit_kernel_intrinsic SRCS ${jit_kernel_cc_intrinsic} DEPS jit_kernel_base)
-
-set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} jit_kernel_intrinsic PARENT_SCOPE)
-
-# use intrinsic kernels by name and type
-USE_JITKERNEL_MORE(kCRFDecoding, intrinsic)
-USE_JITKERNEL_MORE(kLayerNorm, intrinsic)
diff --git a/lite/backends/x86/jit/more/intrinsic/crf_decoding.cc b/lite/backends/x86/jit/more/intrinsic/crf_decoding.cc
deleted file mode 100644
index d9c939f7ef..0000000000
--- a/lite/backends/x86/jit/more/intrinsic/crf_decoding.cc
+++ /dev/null
@@ -1,185 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#include "lite/backends/x86/jit/more/intrinsic/crf_decoding.h"
-#include <limits>
-#include "lite/backends/x86/cpu_info.h"
-#include "lite/backends/x86/jit/registry.h"
-
-namespace paddle {
-namespace lite {
-namespace jit {
-namespace more {
-namespace intrinsic {
-// Note: intrinsic code is not built at runtime. For example, a binary built
-// with AVX can only use AVX even when it runs on an AVX512 machine.
-
-void CRFDecoding(const int seq_len,
-                 const float* x,
-                 const float* w,
-                 float* alpha,
-                 int* track,
-                 int tag_num) {
-#ifdef __AVX512F__
-  const int step_size = ZMM_FLOAT_BLOCK;
-#else
-  const int step_size = YMM_FLOAT_BLOCK;
-#endif
-  const int end = tag_num / step_size;
-  const int rest = tag_num % step_size;
-  /* Set up the initial alpha values. */
-  int i_offset = 0;
-  int last_offset = rest - step_size;
-  for (int i = 0; i <= end; ++i) {
-#ifdef __AVX512F__
-    // Declare the variables for the content of weights, input and alpha
-    // values.
-    __m512 w_content, x_content, alpha_content;
-    // Load the relevant data into the variables from un-aligned address.
-    w_content = _mm512_loadu_ps(w + i_offset);
-    x_content = _mm512_loadu_ps(x + i_offset);
-    alpha_content = _mm512_add_ps(w_content, x_content);
-    // Save the alpha value.
-    _mm512_storeu_ps(alpha + i_offset, alpha_content);
-#else
-    // AVX or AVX2: declare the variables for the content of weights, input
-    // and alpha values.
-    __m256 w_content, x_content, alpha_content;
-    // Load the relevant data into the variables from un-aligned address.
-    w_content = _mm256_loadu_ps(w + i_offset);
-    x_content = _mm256_loadu_ps(x + i_offset);
-    alpha_content = _mm256_add_ps(w_content, x_content);
-    _mm256_storeu_ps(alpha + i_offset, alpha_content);
-#endif
-    i_offset += step_size;
-    if (i == end - 1) {
-      if (rest > 0) {
-        i_offset += last_offset;
-      } else {
-        break;
-      }
-    }
-  }
-  // Use the column-major strategy to get the location of maximum score.
-  int seq_offset = 0;
-  constexpr int state_trans_base_idx = 2;
-  for (int k = 1; k < seq_len; ++k) {
-    int j_offset = 0;
-    for (int j = 0; j <= end; ++j) {
-/* Initialize the variables of maximum score and location.*/
-#ifdef __AVX512F__
-      __m512 max_score = _mm512_set1_ps(-std::numeric_limits<float>::max());
-      __m512i max_j = _mm512_setzero_si512();
-#else
-      __m256 max_score = _mm256_set1_ps(-std::numeric_limits<float>::max());
-      __m256i max_j = _mm256_set1_epi32(0);
-#endif
-      /* Calculate the offset of transition_weights.*/
-      int trans_offset = state_trans_base_idx * tag_num + j_offset;
-      for (int i = 0; i < tag_num; ++i) {
-/* Initialize the content of the alpha variable with the related offset.*/
-#ifdef __AVX512F__
-        __m512 alpha_content = _mm512_set1_ps(*(alpha + seq_offset + i));
-        /* Obtain the content of weights from un-aligned address.*/
-        __m512 w_content = _mm512_loadu_ps(w + trans_offset);
-        __m512 score_v = _mm512_add_ps(alpha_content, w_content);
-        __mmask16 mask = _mm512_cmp_ps_mask(score_v, max_score, _CMP_GT_OS);
-        /* AVX512 instructions.*/
-        max_j = _mm512_mask_set1_epi32(max_j, mask, i);
-        /* Update the max_score value.*/
-        max_score = _mm512_max_ps(max_score, score_v);
-
-#else
-        __m256 alpha_content = _mm256_broadcast_ss(alpha + seq_offset + i);
-        /* Obtain the content of weights from un-aligned address.*/
-        __m256 w_content = _mm256_loadu_ps(w + trans_offset);
-        __m256 score_v = _mm256_add_ps(alpha_content, w_content);
-        __m256 mask = _mm256_cmp_ps(score_v, max_score, _CMP_GT_OS);
-/* According to the mask value, update the index of the max_score.*/
-#ifdef __AVX2__
-        max_j = _mm256_or_si256(
-            _mm256_andnot_si256((__m256i)mask, max_j),
-            _mm256_and_si256((__m256i)mask, _mm256_set1_epi32(i)));
-#else
-        __m128i lo_max_j = _mm256_extractf128_si256(max_j, 0);
-        __m128i hi_max_j = _mm256_extractf128_si256(max_j, 1);
-        __m128i lo_mask =
-            _mm256_extractf128_si256(*(__m256i*)&mask, 0);  // NOLINT
-        __m128i hi_mask =
-            _mm256_extractf128_si256(*(__m256i*)&mask, 1);  // NOLINT
-        lo_max_j = _mm_andnot_si128(lo_mask, lo_max_j);
-        hi_max_j = _mm_andnot_si128(hi_mask, hi_max_j);
-        lo_mask = _mm_and_si128(lo_mask, _mm_set1_epi32(i));
-        hi_mask = _mm_and_si128(hi_mask, _mm_set1_epi32(i));
-        lo_max_j = _mm_or_si128(lo_mask, lo_max_j);
-        hi_max_j = _mm_or_si128(hi_mask, hi_max_j);
-        max_j = _mm256_insertf128_si256(max_j, lo_max_j, 0);
-        max_j = _mm256_insertf128_si256(max_j, hi_max_j, 1);
-#endif
-        /* Update the max_score value.*/
-        max_score = _mm256_max_ps(max_score, score_v);
-
-#endif
-
-        trans_offset += tag_num;
-      }
-/* Update the alpha and track values.
- */
-#ifdef __AVX512F__
-      __m512 x_content =
-          _mm512_loadu_ps(x + seq_offset + tag_num + j_offset);
-      max_score = _mm512_add_ps(max_score, x_content);
-      _mm512_storeu_ps(alpha + seq_offset + tag_num + j_offset, max_score);
-      _mm512_storeu_si512(reinterpret_cast<__m512i*>(track + seq_offset +
-                                                     tag_num + j_offset),
-                          max_j);
-#else
-      __m256 x_content = _mm256_loadu_ps(x + seq_offset + tag_num + j_offset);
-      max_score = _mm256_add_ps(max_score, x_content);
-      _mm256_storeu_ps(alpha + seq_offset + tag_num + j_offset, max_score);
-      _mm256_storeu_si256(
-          reinterpret_cast<__m256i*>(track + seq_offset + tag_num + j_offset),
-          max_j);
-#endif
-
-      /* Calculate the offset of the next step. */
-      j_offset += step_size;
-      if (j == end - 1) {
-        if (rest > 0) {
-          j_offset += last_offset;
-        } else {
-          break;
-        }
-      }
-    }
-    seq_offset += tag_num;
-  }
-}
-
-bool CRFDecodingKernel::CanBeUsed(const int& d) const {
-#ifdef __AVX512F__
-  constexpr int block = ZMM_FLOAT_BLOCK;
-#else
-  constexpr int block = YMM_FLOAT_BLOCK;
-#endif
-  return x86::MayIUse(x86::avx) && d >= block;
-}
-
-} // namespace intrinsic
-} // namespace more
-} // namespace jit
-} // namespace lite
-} // namespace paddle
-
-namespace intrinsic = paddle::lite::jit::more::intrinsic;
-
-REGISTER_JITKERNEL_MORE(kCRFDecoding, intrinsic, intrinsic::CRFDecodingKernel);
diff --git a/lite/backends/x86/jit/more/intrinsic/crf_decoding.h b/lite/backends/x86/jit/more/intrinsic/crf_decoding.h
deleted file mode 100644
index 8a425fb491..0000000000
--- a/lite/backends/x86/jit/more/intrinsic/crf_decoding.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. */
-
-#pragma once
-
-#include
-#include "lite/backends/x86/jit/kernel_base.h"
-
-namespace paddle {
-namespace lite {
-namespace jit {
-namespace more {
-namespace intrinsic {
-
-void CRFDecoding(const int seq_len,
-                 const float* x,
-                 const float* w,
-                 float* alpha,
-                 int* track,
-                 int tag_num);
-
-class CRFDecodingKernel : public KernelMore<CRFDecodingTuple<float>> {
- public:
-  CRFDecodingKernel() { this->func = CRFDecoding; }
-  bool CanBeUsed(
-      const typename CRFDecodingTuple<float>::attr_type&) const override;
-  const char* ImplType() const override { return "Intrinsic"; }
-};
-
-} // namespace intrinsic
-} // namespace more
-} // namespace jit
-} // namespace lite
-} // namespace paddle
diff --git a/lite/backends/x86/jit/more/intrinsic/layer_norm.cc b/lite/backends/x86/jit/more/intrinsic/layer_norm.cc
deleted file mode 100644
index bfd3409e65..0000000000
--- a/lite/backends/x86/jit/more/intrinsic/layer_norm.cc
+++ /dev/null
@@ -1,181 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ - -#include "lite/backends/x86/jit/more/intrinsic/layer_norm.h" -#include -#include "lite/backends/x86/jit/registry.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace more { -namespace intrinsic { - -void LayerNorm(float* x, - float* out, - float* mean, - float* var, - const float* scale, - const float* bias, - int height, - const float epsilon, - int right) { - __m256 sum; - __m256 mean_vec, var_vec; - __m128 hi, lo; - __m256 tmp; - size_t offset; - size_t j; - int block = YMM_FLOAT_BLOCK; - const int rest = right % block; - const int end = right - rest; - - __m256 reverse_num_vec = - _mm256_div_ps(_mm256_set1_ps(1.0), _mm256_set1_ps(right)); - __m256 epsilon_vec = _mm256_set1_ps(epsilon); - int rest_mask = - ((-1) & (~((~0U) >> (sizeof(int) * 8 - (block - rest))))) & 0x0ff; - __m256i mask_vec = _mm256_set_epi32(rest_mask & 0x80 ? 0xffffffff : 0, - rest_mask & 0x40 ? 0xffffffff : 0, - rest_mask & 0x20 ? 0xffffffff : 0, - rest_mask & 0x10 ? 0xffffffff : 0, - rest_mask & 0x8 ? 0xffffffff : 0, - rest_mask & 0x4 ? 0xffffffff : 0, - rest_mask & 0x2 ? 0xffffffff : 0, - rest_mask & 0x1 ? 0xffffffff : 0); - - for (int i = 0; i < height; ++i) { - offset = i * right; - - /* get mean */ - sum = _mm256_setzero_ps(); - for (j = offset; j < end + offset; j += block) { - sum = _mm256_add_ps(sum, _mm256_loadu_ps((const float*)x + j)); - } - if (rest != 0) { - j = offset + right - block; - tmp = _mm256_loadu_ps((const float*)x + j); - tmp = _mm256_blendv_ps(_mm256_setzero_ps(), - tmp, - *(__m256*)&mask_vec); // NOLINT - sum = _mm256_add_ps(sum, tmp); - } - hi = _mm256_extractf128_ps(sum, 1); - lo = _mm256_extractf128_ps(sum, 0); - sum = _mm256_add_ps( - sum, - _mm256_insertf128_ps( - _mm256_insertf128_ps(_mm256_setzero_ps(), hi, 0), lo, 1)); - sum = _mm256_hadd_ps(sum, sum); - sum = _mm256_hadd_ps(sum, sum); - mean_vec = _mm256_mul_ps(sum, reverse_num_vec); - mean[i] = *reinterpret_cast(&mean_vec); - - /* get variance */ - sum = _mm256_setzero_ps(); - for (j = offset; j < end + offset; j += block) { - tmp = _mm256_sub_ps(_mm256_loadu_ps((const float*)x + j), mean_vec); - tmp = _mm256_mul_ps(tmp, tmp); - sum = _mm256_add_ps(sum, tmp); - } - if (rest != 0) { - j = offset + right - block; - tmp = _mm256_sub_ps(_mm256_loadu_ps((const float*)x + j), mean_vec); - tmp = _mm256_mul_ps(tmp, tmp); - tmp = _mm256_blendv_ps(_mm256_setzero_ps(), - tmp, - *(__m256*)&mask_vec); // NOLINT - sum = _mm256_add_ps(sum, tmp); - } - hi = _mm256_extractf128_ps(sum, 1); - lo = _mm256_extractf128_ps(sum, 0); - sum = _mm256_add_ps( - sum, - _mm256_insertf128_ps( - _mm256_insertf128_ps(_mm256_setzero_ps(), hi, 0), lo, 1)); - sum = _mm256_hadd_ps(sum, sum); - sum = _mm256_hadd_ps(sum, sum); - var_vec = _mm256_mul_ps(sum, reverse_num_vec); - var[i] = *reinterpret_cast(&var_vec); - - /* get x_norm and calculate output*/ - for (j = offset; j < end + offset; j += block) { - tmp = _mm256_sub_ps(_mm256_loadu_ps((const float*)x + j), mean_vec); - tmp = _mm256_div_ps(tmp, - _mm256_sqrt_ps(_mm256_add_ps(var_vec, epsilon_vec))); - _mm256_storeu_ps(reinterpret_cast(out) + j, tmp); - } - if (rest != 0) { 
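-      // Tail block: step back to the start of the last full YMM block and
-      // recompute it; the elements that overlap the previous block produce
-      // identical normalized values, so storing over them again is harmless.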
- j = offset + right - block; - tmp = _mm256_sub_ps(_mm256_loadu_ps((const float*)x + j), mean_vec); - tmp = _mm256_div_ps(tmp, - _mm256_sqrt_ps(_mm256_add_ps(var_vec, epsilon_vec))); - _mm256_storeu_ps(reinterpret_cast(out) + j, tmp); - } - - if (scale) { - if (rest != 0) { - j = offset + right - block; - tmp = _mm256_loadu_ps((const float*)out + j); - } - for (j = offset; j < end + offset; j += block) { - _mm256_storeu_ps( - reinterpret_cast(out) + j, - _mm256_mul_ps(_mm256_loadu_ps((const float*)out + j), - _mm256_loadu_ps((const float*)scale + j - offset))); - } - if (rest != 0) { - j = offset + right - block; - _mm256_storeu_ps( - reinterpret_cast(out) + j, - _mm256_mul_ps(tmp, - _mm256_loadu_ps((const float*)scale + j - offset))); - } - } - - if (bias) { - if (rest != 0) { - j = offset + right - block; - tmp = _mm256_loadu_ps((const float*)out + j); - } - for (j = offset; j < end + offset; j += block) { - _mm256_storeu_ps( - reinterpret_cast(out) + j, - _mm256_add_ps(_mm256_loadu_ps((const float*)out + j), - _mm256_loadu_ps((const float*)bias + j - offset))); - } - if (rest != 0) { - j = offset + right - block; - _mm256_storeu_ps( - reinterpret_cast(out) + j, - _mm256_add_ps(tmp, - _mm256_loadu_ps((const float*)bias + j - offset))); - } - } - } -} - -bool LayerNormKernel::CanBeUsed(const int& d) const { - return x86::MayIUse(x86::avx) && d >= YMM_FLOAT_BLOCK; -} - -} // namespace intrinsic -} // namespace more -} // namespace jit -} // namespace lite -} // namespace paddle - -namespace intrinsic = paddle::lite::jit::more::intrinsic; - -REGISTER_JITKERNEL_MORE(kLayerNorm, intrinsic, intrinsic::LayerNormKernel); diff --git a/lite/backends/x86/jit/more/intrinsic/layer_norm.h b/lite/backends/x86/jit/more/intrinsic/layer_norm.h deleted file mode 100644 index d8768d52ed..0000000000 --- a/lite/backends/x86/jit/more/intrinsic/layer_norm.h +++ /dev/null @@ -1,48 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ - -#pragma once - -#include -#include "lite/backends/x86/jit/kernel_base.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace more { -namespace intrinsic { - -void LayerNorm(float* x, - float* out, - float* mean, - float* var, - const float* scale, - const float* bias, - int height, - const float epsilon, - int right); - -class LayerNormKernel : public KernelMore> { - public: - LayerNormKernel() { this->func = LayerNorm; } - bool CanBeUsed( - const typename LayerNormTuple::attr_type&) const override; - const char* ImplType() const override { return "Intrinsic"; } -}; - -} // namespace intrinsic -} // namespace more -} // namespace jit -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/jit/more/mix/CMakeLists.txt b/lite/backends/x86/jit/more/mix/CMakeLists.txt deleted file mode 100644 index dd039d2915..0000000000 --- a/lite/backends/x86/jit/more/mix/CMakeLists.txt +++ /dev/null @@ -1,15 +0,0 @@ - - -file(GLOB jit_kernel_mix_cc RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc") -cc_library(jit_kernel_mix SRCS ${jit_kernel_mix_cc} DEPS jit_kernel_base) - -set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} jit_kernel_mix PARENT_SCOPE) - -USE_JITKERNEL_MORE(kVSigmoid, mix) -USE_JITKERNEL_MORE(kVTanh, mix) -USE_JITKERNEL_MORE(kLSTMCtHt, mix) -USE_JITKERNEL_MORE(kLSTMC1H1, mix) -USE_JITKERNEL_MORE(kGRUH1, mix) -USE_JITKERNEL_MORE(kGRUHtPart1, mix) -USE_JITKERNEL_MORE(kGRUHtPart2, mix) -USE_JITKERNEL_MORE(kSoftmax, mix) diff --git a/lite/backends/x86/jit/more/mix/mix.cc b/lite/backends/x86/jit/more/mix/mix.cc deleted file mode 100644 index b904b8a24c..0000000000 --- a/lite/backends/x86/jit/more/mix/mix.cc +++ /dev/null @@ -1,255 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ - -#include "lite/backends/x86/jit/more/mix/mix.h" -#include "lite/backends/x86/jit/kernels.h" -#include "lite/backends/x86/jit/registry.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace more { -namespace mix { - -using CPUPlace = lite::fluid::CPUPlace; - -void VSigmoid(const T* x, T* y, int n) { - const float min = SIGMOID_THRESHOLD_MIN; - const float max = SIGMOID_THRESHOLD_MAX; - for (int i = 0; i < n; ++i) { - y[i] = (x[i] < min) ? min : ((x[i] > max) ? 
max : x[i]); - y[i] = static_cast(0) - y[i]; - } - auto compute = KernelFuncs, CPUPlace>::Cache().At(n); - compute(y, y, n); - for (int i = 0; i < n; ++i) { - y[i] = static_cast(1) / (static_cast(1) + y[i]); - } -} - -void VTanh(const T* x, T* y, int n) { - const T a = 2, b = -1; - auto compute_scal = KernelFuncs, CPUPlace>::Cache().At(n); - auto compute_addbias = KernelFuncs, CPUPlace>::Cache().At(n); - auto compute_sigmoid = KernelFuncs, CPUPlace>::Cache().At(n); - compute_scal(&a, x, y, n); - compute_sigmoid(y, y, n); - compute_scal(&a, y, y, n); - compute_addbias(&b, y, y, n); -} - -// remain is the product of dimension shapes after the axis dimension -void Softmax(const T* x, T* y, int n, int bs, int remain) { - auto compute_hmax = KernelFuncs, CPUPlace>::Cache().At(n); - auto compute_hsum = KernelFuncs, CPUPlace>::Cache().At(n); - auto compute_vscal = KernelFuncs, CPUPlace>::Cache().At(n); - auto compute_strideasum = - KernelFuncs, CPUPlace>::Cache().At(n); - auto compute_stridescal = - KernelFuncs, CPUPlace>::Cache().At(n); - auto compute_vaddbias = - KernelFuncs, CPUPlace>::Cache().At(n); - auto compute_vexp = KernelFuncs, CPUPlace>::Cache().At(n); - - for (int i = 0; i < bs; ++i) { - T scalar; - compute_hmax(x, &scalar, n); - scalar = static_cast(0) - scalar; - compute_vaddbias(&scalar, x, y, n); // x - max - compute_vexp(y, y, n); - if (remain == 1) { - compute_hsum(y, &scalar, n); - scalar = static_cast(1) / scalar; - compute_vscal(&scalar, y, y, n); - } else { - for (int j = 0; j < remain; ++j) { - compute_strideasum(&y[j], &scalar, n, remain); - scalar = static_cast(1) / scalar; - compute_stridescal(&scalar, &y[j], &y[j], n, remain); - } - } - x += n; - y += n; - } -} - -void (*getActFunc(KernelType type, int d))(const T*, T*, int) { // NOLINT - if (type == kVSigmoid) { - return KernelFuncs, CPUPlace>::Cache().At(d); - } else if (type == kVRelu) { - return KernelFuncs, CPUPlace>::Cache().At(d); - } else if (type == kVTanh) { - return KernelFuncs, CPUPlace>::Cache().At(d); - } else if (type == kVIdentity) { - return KernelFuncs, CPUPlace>::Cache().At(d); - } - LOG(FATAL) << "Not support type: " << type; - return nullptr; -} - -void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr) { - T* gates = reinterpret_cast(step->gates); - const T* ct_1 = reinterpret_cast(step->ct_1); - T* ct = reinterpret_cast(step->ct); - T* ht = reinterpret_cast(step->ht); - const T* wp = reinterpret_cast(step->wp); - T* checked = reinterpret_cast(step->checked); - const int d = attr->d; - const int d2 = d * 2; - const int d3 = d * 3; - auto vmul_d = KernelFuncs, CPUPlace>::Cache().At(d); - auto vadd_d = KernelFuncs, CPUPlace>::Cache().At(d); - auto vadd_d2 = KernelFuncs, CPUPlace>::Cache().At(d2); - auto act_gate_d = getActFunc(attr->act_gate, d); - auto act_gate_d2 = getActFunc(attr->act_gate, d2); - auto act_gate_d3 = getActFunc(attr->act_gate, d3); - auto act_cand_d = getActFunc(attr->act_cand, d); - auto act_cell_d = getActFunc(attr->act_cell, d); - - if (attr->use_peephole) { - vmul_d(wp, ct_1, checked, d); - vmul_d(wp + d, ct_1, checked + d, d); - vadd_d2(checked, gates + d, gates + d, d2); - act_gate_d2(gates + d, gates + d, d2); - } else { - act_gate_d3(gates + d, gates + d, d3); - } - - // C_t = C_t-1 * fgated + cand_gated * igated - act_cand_d(gates, gates, d); - vmul_d(gates, gates + d, gates + d, d); - vmul_d(ct_1, gates + d2, gates + d2, d); - vadd_d(gates + d, gates + d2, ct, d); - - if (attr->use_peephole) { - // get ogated - vmul_d(wp + d2, ct, gates + d, d); - vadd_d(gates + d, gates + 
d3, gates + d3, d); - act_gate_d(gates + d3, gates + d3, d); - } - // H_t = act_cell(C_t) * ogated - act_cell_d(ct, gates + d2, d); - vmul_d(gates + d2, gates + d3, ht, d); -} - -void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr) { - T* gates = reinterpret_cast(step->gates); - T* ct = reinterpret_cast(step->ct); - T* ht = reinterpret_cast(step->ht); - int d = attr->d; - int d2 = d * 2; - int d3 = d * 3; - auto vmul_d = KernelFuncs, CPUPlace>::Cache().At(d); - auto vadd_d = KernelFuncs, CPUPlace>::Cache().At(d); - auto act_gate_d = getActFunc(attr->act_gate, d); - auto act_cand_d = getActFunc(attr->act_cand, d); - auto act_cell_d = getActFunc(attr->act_cell, d); - /* C_t = igated * cgated*/ - act_gate_d(gates + d, gates + d, d); - act_cand_d(gates, gates, d); - vmul_d(gates, gates + d, ct, d); - if (attr->use_peephole) { - // get outgated, put W_oc * C_t on igated - const T* wp = reinterpret_cast(step->wp); - vmul_d(wp + d2, ct, gates + d, d); - vadd_d(gates + d, gates + d3, gates + d3, d); - } - /* H_t = act_cell(C_t) * ogated */ - act_gate_d(gates + d3, gates + d3, d); - act_cell_d(ct, gates + d2, d); - vmul_d(gates + d2, gates + d3, ht, d); -} - -// compute h1 without h0 -void GRUH1(gru_t* step, const gru_attr_t* attr) { - T* gates = reinterpret_cast(step->gates); - T* ht = reinterpret_cast(step->ht); - int d = attr->d; - int d2 = d * 2; - auto act_gate = getActFunc(attr->act_gate, d); - auto act_cand = getActFunc(attr->act_cand, d); - auto vmul_d = KernelFuncs, CPUPlace>::Cache().At(d); - act_gate(gates, gates, d); - act_cand(gates + d2, gates + d2, d); - vmul_d(gates, gates + d2, ht, d); -} - -// compute the first part of GRU: ht = act_gate(r) * ht_1 -void GRUHtPart1(gru_t* step, const gru_attr_t* attr) { - // W: {W_update, W_reset; W_state} - T* gates = reinterpret_cast(step->gates); - T* ht = reinterpret_cast(step->ht); - const T* ht_1 = reinterpret_cast(step->ht_1); - auto act_gate = getActFunc(attr->act_gate, attr->d); - auto vmul_d = KernelFuncs, CPUPlace>::Cache().At(attr->d); - act_gate(gates + attr->d, gates + attr->d, attr->d); - vmul_d(ht_1, gates + attr->d, ht, attr->d); -} - -// compute the second part of GRU: -// ht = act_gate(u) * act_cand(s) + (1-act_gate(u)) * ht_1 -void GRUHtPart2(gru_t* step, const gru_attr_t* attr) { - T* gates = reinterpret_cast(step->gates); - T* ht = reinterpret_cast(step->ht); - const T* ht_1 = reinterpret_cast(step->ht_1); - int d = attr->d; - auto act_gate = getActFunc(attr->act_gate, d); - auto act_cand = getActFunc(attr->act_cand, d); - T* y = gates + d * 2; - act_gate(gates, gates, d); - act_cand(y, y, d); - // out = zt*ht~ + (1-zt)*ht_1 - for (int i = 0; i < d; ++i) { - ht[i] = gates[i] * y[i] + (static_cast(1) - gates[i]) * ht_1[i]; - } -} - -// TODO(TJ): tuning me -bool VSigmoidKernel::CanBeUsed(const int& d) const { return true; } - -bool VTanhKernel::CanBeUsed(const int& d) const { return true; } - -bool SoftmaxKernel::CanBeUsed(const int& d) const { return true; } - -bool LSTMCtHtKernel::CanBeUsed(const lstm_attr_t& attr) const { return true; } - -bool LSTMC1H1Kernel::CanBeUsed(const lstm_attr_t& attr) const { return true; } - -bool GRUH1Kernel::CanBeUsed(const gru_attr_t& attr) const { return true; } - -bool GRUHtPart1Kernel::CanBeUsed(const gru_attr_t& attr) const { return true; } - -bool GRUHtPart2Kernel::CanBeUsed(const gru_attr_t& attr) const { return true; } - -} // namespace mix -} // namespace more -} // namespace jit -} // namespace lite -} // namespace paddle - -namespace mix = paddle::lite::jit::more::mix; - -#define 
REGISTER_MORE_KERNEL(func) \ - REGISTER_JITKERNEL_MORE(k##func, mix, mix::func##Kernel) - -REGISTER_MORE_KERNEL(VSigmoid); -REGISTER_MORE_KERNEL(VTanh); -REGISTER_MORE_KERNEL(Softmax); -REGISTER_MORE_KERNEL(LSTMCtHt); -REGISTER_MORE_KERNEL(LSTMC1H1); -REGISTER_MORE_KERNEL(GRUH1); -REGISTER_MORE_KERNEL(GRUHtPart1); -REGISTER_MORE_KERNEL(GRUHtPart2); - -#undef REGISTER_MORE_KERNEL diff --git a/lite/backends/x86/jit/more/mix/mix.h b/lite/backends/x86/jit/more/mix/mix.h deleted file mode 100644 index 6ade67182c..0000000000 --- a/lite/backends/x86/jit/more/mix/mix.h +++ /dev/null @@ -1,65 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ - -#pragma once - -#include -#include "lite/backends/x86/jit/kernel_base.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace more { -namespace mix { -using T = float; - -void VSigmoid(const T* x, T* y, int n); -void VTanh(const T* x, T* y, int n); -void Softmax(const T* x, T* y, int n, int bs, int remain); - -void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr); -void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr); -void GRUH1(gru_t* step, const gru_attr_t* attr); -void GRUHtPart1(gru_t* step, const gru_attr_t* attr); -void GRUHtPart2(gru_t* step, const gru_attr_t* attr); - -#define DECLARE_MORE_KERNEL(name) \ - class name##Kernel : public KernelMore> { \ - public: \ - name##Kernel() { this->func = name; } \ - bool CanBeUsed(const typename name##Tuple::attr_type&) const override; \ - const char* ImplType() const override { return "Mixed"; } \ - } - -// XYN -DECLARE_MORE_KERNEL(VSigmoid); -DECLARE_MORE_KERNEL(VTanh); - -// XRN -DECLARE_MORE_KERNEL(Softmax); - -DECLARE_MORE_KERNEL(LSTMCtHt); -DECLARE_MORE_KERNEL(LSTMC1H1); - -DECLARE_MORE_KERNEL(GRUH1); -DECLARE_MORE_KERNEL(GRUHtPart1); -DECLARE_MORE_KERNEL(GRUHtPart2); - -#undef DECLARE_MORE_KERNEL - -} // namespace mix -} // namespace more -} // namespace jit -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/jit/more/mkl/CMakeLists.txt b/lite/backends/x86/jit/more/mkl/CMakeLists.txt deleted file mode 100644 index 56f1a62ad4..0000000000 --- a/lite/backends/x86/jit/more/mkl/CMakeLists.txt +++ /dev/null @@ -1,20 +0,0 @@ - -cc_library(jit_kernel_mkl SRCS mkl.cc DEPS jit_kernel_base dynload_mklml) -set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} dynload_mklml jit_kernel_mkl PARENT_SCOPE) - -# use mkl kernels by name and type -USE_JITKERNEL_MORE(kMatMul, mkl) -USE_JITKERNEL_MORE(kVMul, mkl) -USE_JITKERNEL_MORE(kVAdd, mkl) -USE_JITKERNEL_MORE(kVScal, mkl) -USE_JITKERNEL_MORE(kStrideScal, mkl) -USE_JITKERNEL_MORE(kVExp, mkl) -USE_JITKERNEL_MORE(kVSquare, mkl) -USE_JITKERNEL_MORE(kVCopy, mkl) -USE_JITKERNEL_MORE(kVSigmoid, mkl) -USE_JITKERNEL_MORE(kVTanh, mkl) -USE_JITKERNEL_MORE(kSeqPool, mkl) -USE_JITKERNEL_MORE(kSoftmax, mkl) -USE_JITKERNEL_MORE(kEmbSeqPool, mkl) -USE_JITKERNEL_MORE(kSgd, mkl) -USE_JITKERNEL_MORE(kVBroadcast, mkl) diff --git a/lite/backends/x86/jit/more/mkl/mkl.cc 
b/lite/backends/x86/jit/more/mkl/mkl.cc deleted file mode 100644 index 7df930f6c0..0000000000 --- a/lite/backends/x86/jit/more/mkl/mkl.cc +++ /dev/null @@ -1,336 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ - -#include "lite/backends/x86/jit/more/mkl/mkl.h" -#include "lite/backends/x86/cpu_info.h" -#include "lite/backends/x86/jit/refer/refer.h" -#include "lite/backends/x86/jit/registry.h" -#include "lite/backends/x86/mklml.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace more { -namespace mkl { - -template <> -void MatMul(const float* a, - const float* b, - float* c, - const matmul_attr_t* attr) { - lite::x86::cblas_sgemm(CblasRowMajor, - CblasNoTrans, - CblasNoTrans, - attr->m, - attr->n, - attr->k, - 1.f, - a, - attr->k, - b, - attr->n, - 0.f, - c, - attr->n); -} - -template <> -void MatMul(const double* a, - const double* b, - double* c, - const matmul_attr_t* attr) { - lite::x86::cblas_dgemm(CblasRowMajor, - CblasNoTrans, - CblasNoTrans, - attr->m, - attr->n, - attr->k, - 1.0, - a, - attr->k, - b, - attr->n, - 0.0, - c, - attr->n); -} - -template <> -void VMul(const float* x, const float* y, float* z, int n) { - lite::x86::vsMul(n, x, y, z); -} - -template <> -void VMul(const double* x, const double* y, double* z, int n) { - lite::x86::vdMul(n, x, y, z); -} - -template <> -void VAdd(const float* x, const float* y, float* z, int n) { - lite::x86::vsAdd(n, x, y, z); -} - -template <> -void VAdd(const double* x, const double* y, double* z, int n) { - lite::x86::vdAdd(n, x, y, z); -} - -template <> -void VScal(const float* a, const float* x, float* y, int n) { - if (x == y) { - lite::x86::cblas_sscal(n, *a, y, 1); - } else { - refer::VScal(a, x, y, n); - } -} - -template <> -void VScal(const double* a, const double* x, double* y, int n) { - if (x == y) { - lite::x86::cblas_dscal(n, *a, y, 1); - } else { - refer::VScal(a, x, y, n); - } -} - -template <> -void StrideScal( - const float* a, const float* x, float* y, int n, int stride) { - if (x == y) { - lite::x86::cblas_sscal(n / stride, *a, y, stride); - } else { - refer::StrideScal(a, x, y, n, stride); - } -} - -template <> -void StrideScal( - const double* a, const double* x, double* y, int n, int stride) { - if (x == y) { - lite::x86::cblas_dscal(n / stride, *a, y, stride); - } else { - refer::StrideScal(a, x, y, n, stride); - } -} - -template <> -void VExp(const float* x, float* y, int n) { - lite::x86::vsExp(n, x, y); -} - -template <> -void VExp(const double* x, double* y, int n) { - lite::x86::vdExp(n, x, y); -} - -template <> -void VSquare(const float* x, float* y, int n) { - lite::x86::vsSqr(n, x, y); -} - -template <> -void VSquare(const double* x, double* y, int n) { - lite::x86::vdSqr(n, x, y); -} - -template <> -void VCopy(const float* x, float* y, int n) { - lite::x86::cblas_scopy(n, x, 1, y, 1); -} - -template <> -void VCopy(const double* x, double* y, int n) { - lite::x86::cblas_dcopy(n, x, 1, y, 1); -} - -template 
<> -void VAXPY(float a, const float* x, float* y, int n) { - lite::x86::cblas_saxpy(n, a, x, 1, y, 1); -} - -template <> -void VAXPY(double a, const double* x, double* y, int n) { - lite::x86::cblas_daxpy(n, a, x, 1, y, 1); -} - -template <> -void ASum(const float* x, float* res, int n) { - res[0] = lite::x86::cblas_sasum(n, x, 1); -} - -template <> -void ASum(const double* x, double* res, int n) { - res[0] = lite::x86::cblas_dasum(n, x, 1); -} - -template <> -void StrideASum(const float* x, float* res, int n, int stride) { - res[0] = lite::x86::cblas_sasum(n / stride, x, stride); -} - -template <> -void StrideASum(const double* x, double* res, int n, int stride) { - res[0] = lite::x86::cblas_dasum(n / stride, x, stride); -} - -// TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512 -template <> -bool VMulKernel::CanBeUsed(const int& d) const { - return x86::MayIUse(x86::avx512f) && d > 512; -} - -template <> -bool VAddKernel::CanBeUsed(const int& d) const { - return x86::MayIUse(x86::avx) && d > 512; -} - -template <> -bool VScalKernel::CanBeUsed(const int& d) const { - return x86::MayIUse(x86::avx512f) && d > 512; -} - -template <> -bool StrideScalKernel::CanBeUsed(const int& d) const { - return true; -} - -template <> -bool VExpKernel::CanBeUsed(const int& d) const { - return d > 7; -} - -template <> -bool VSquareKernel::CanBeUsed(const int& d) const { - return d > 7; -} - -template <> -bool VCopyKernel::CanBeUsed(const int& d) const { - return d > 15; -} - -template <> -bool VBroadcastKernel::CanBeUsed(const int64_t& d) const { - return d > 127; -} - -template <> -bool VBroadcastKernel::CanBeUsed(const int64_t& attr) const { - return true; -} - -template <> -bool VSigmoidKernel::CanBeUsed(const int& d) const { - return d > 7; -} - -template <> -bool VTanhKernel::CanBeUsed(const int& d) const { - return d > 7; -} - -template <> -bool SeqPoolKernel::CanBeUsed(const seq_pool_attr_t& attr) const { - return true; -} - -template <> -bool SeqPoolKernel::CanBeUsed(const seq_pool_attr_t& attr) const { - return true; -} - -template <> -bool EmbSeqPoolKernel::CanBeUsed(const emb_seq_pool_attr_t& attr) const { - return true; -} - -template <> -bool EmbSeqPoolKernel::CanBeUsed( - const emb_seq_pool_attr_t& attr) const { - return true; -} - -template <> -bool SgdKernel::CanBeUsed(const sgd_attr_t& attr) const { - return true; -} - -template <> -bool SgdKernel::CanBeUsed(const sgd_attr_t& attr) const { - return true; -} - -template <> -bool MatMulKernel::CanBeUsed(const matmul_attr_t& attr) const { - return x86::MayIUse(x86::avx); -} - -template <> -bool MatMulKernel::CanBeUsed(const matmul_attr_t& attr) const { - return true; -} - -template <> -bool SoftmaxKernel::CanBeUsed(const int& d) const { - // tuned on avx2 - return x86::MayIUse(x86::avx) && d < 60; -} - -#define AWALYS_USE_ME_WITH_DOUBLE(func) \ - template <> \ - bool func##Kernel::CanBeUsed(const int& d) const { \ - return true; \ - } - -AWALYS_USE_ME_WITH_DOUBLE(VMul); -AWALYS_USE_ME_WITH_DOUBLE(VAdd); -AWALYS_USE_ME_WITH_DOUBLE(VScal); -AWALYS_USE_ME_WITH_DOUBLE(StrideScal); -AWALYS_USE_ME_WITH_DOUBLE(VExp); -AWALYS_USE_ME_WITH_DOUBLE(VSigmoid); -AWALYS_USE_ME_WITH_DOUBLE(VTanh); -AWALYS_USE_ME_WITH_DOUBLE(VSquare); -AWALYS_USE_ME_WITH_DOUBLE(VCopy); -AWALYS_USE_ME_WITH_DOUBLE(Softmax); - -#undef AWALYS_USE_ME_WITH_DOUBLE -} // namespace mkl -} // namespace more -} // namespace jit -} // namespace lite -} // namespace paddle - -namespace mkl = paddle::lite::jit::more::mkl; - -#define REGISTER_MKL_KERNEL(func) \ - REGISTER_JITKERNEL_MORE( 
\ - k##func, mkl, mkl::func##Kernel, mkl::func##Kernel) - -REGISTER_MKL_KERNEL(MatMul); -REGISTER_MKL_KERNEL(VMul); -REGISTER_MKL_KERNEL(VAdd); -REGISTER_MKL_KERNEL(VScal); -REGISTER_MKL_KERNEL(StrideScal); -REGISTER_MKL_KERNEL(VExp); -REGISTER_MKL_KERNEL(VSquare); -REGISTER_MKL_KERNEL(VCopy); -REGISTER_MKL_KERNEL(VBroadcast); -REGISTER_MKL_KERNEL(VSigmoid); -REGISTER_MKL_KERNEL(VTanh); -REGISTER_MKL_KERNEL(SeqPool); -REGISTER_MKL_KERNEL(EmbSeqPool); -REGISTER_MKL_KERNEL(Softmax); -REGISTER_MKL_KERNEL(Sgd); - -#undef REGISTER_MKL_KERNEL diff --git a/lite/backends/x86/jit/more/mkl/mkl.h b/lite/backends/x86/jit/more/mkl/mkl.h deleted file mode 100644 index 8b713e537e..0000000000 --- a/lite/backends/x86/jit/more/mkl/mkl.h +++ /dev/null @@ -1,244 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ - -#pragma once - -#include -#include -#include -#include "lite/backends/x86/jit/kernel_base.h" -#include "lite/utils/paddle_enforce.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace more { -namespace mkl { - -template -void MatMul(const T* a, const T* b, T* c, const matmul_attr_t* attr); - -template -void VMul(const T* x, const T* y, T* z, int n); - -template -void VAdd(const T* x, const T* y, T* z, int n); - -template -void VScal(const T* a, const T* x, T* y, int n); - -template -void VExp(const T* x, T* y, int n); - -template -void VSquare(const T* x, T* y, int n); - -template -void VCopy(const T* x, T* y, int n); - -template -void VAXPY(T a, const T* x, T* y, int n); - -template -void VBroadcast(const T* x, T* y, int64_t y_h, int64_t x_len) { - for (int64_t h = 0; h < y_h; ++h) { - VCopy(x, y + h * x_len, x_len); - } -} - -template -void VSigmoid(const T* x, T* y, int n) { - const T min = SIGMOID_THRESHOLD_MIN; - const T max = SIGMOID_THRESHOLD_MAX; - for (int i = 0; i < n; ++i) { - y[i] = (x[i] < min) ? min : ((x[i] > max) ? 
max : x[i]); - y[i] = static_cast(0) - y[i]; - } - VExp(y, y, n); - for (int i = 0; i < n; ++i) { - y[i] = static_cast(1) / (static_cast(1) + y[i]); - } -} - -template -void VTanh(const T* x, T* y, int n) { - for (int i = 0; i < n; ++i) { - y[i] = static_cast(2) * x[i]; - } - VSigmoid(y, y, n); - for (int i = 0; i < n; ++i) { - y[i] = static_cast(2) * y[i] - static_cast(1); - } -} - -template -void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) { - VCopy(x, y, attr->w); - for (int h = 1; h != attr->h; ++h) { - VAXPY(static_cast(1), x + h * attr->w, y, attr->w); - } - if (attr->type == SeqPoolType::kAvg || attr->type == SeqPoolType::kSqrt) { - T scalar = static_cast(1); - if (attr->type == SeqPoolType::kAvg) { - scalar = scalar / static_cast(attr->h); - } else { - scalar = scalar / std::sqrt(static_cast(attr->h)); - } - VScal(&scalar, y, y, attr->w); - } -} - -template -void EmbSeqPool(const T* table, - const int64_t* idx, - T* out, - const emb_seq_pool_attr_t* attr) { - PADDLE_ENFORCE_EQ(attr->table_width * attr->index_width, attr->out_width); - auto check_idx_value_valid = [&](int64_t i) { - PADDLE_ENFORCE_LT( - idx[i], attr->table_height, "idx value: %d, i: %d", idx[i], i); - PADDLE_ENFORCE_GE(idx[i], 0, "idx value: %d, i: %d", idx[i], i); - }; - - for (int64_t w = 0; w != attr->index_width; ++w) { - check_idx_value_valid(w); - VCopy(table + idx[w] * attr->table_width, - out + w * attr->table_width, - attr->table_width); - } - - for (int64_t h = 1; h < attr->index_height; ++h) { - for (int64_t w = 0; w < attr->index_width; ++w) { - int64_t i = h * attr->index_width + w; - check_idx_value_valid(i); - VAXPY(static_cast(1), - table + idx[i] * attr->table_width, - out + w * attr->table_width, - attr->table_width); - } - } -} - -template -void ASum(const T* x, T* res, int n); - -template -void StrideASum(const T* x, T* res, int n, int stride); - -template -void StrideScal(const T* a, const T* x, T* y, int n, int stride); - -// remain is the product of dimension shapes after the axis dimension -template -void Softmax(const T* x, T* y, int n, int bs, int remain = 1) { - std::vector entities(bs); - for (int i = 0; i < bs; ++i) { - entities[i] = x[i * n]; - for (int c = 1; c < n; ++c) { - entities[i] = x[i * n + c] > entities[i] ? 
x[i * n + c] : entities[i]; - } - for (int c = 0; c < n; ++c) { - y[i * n + c] = x[i * n + c] - entities[i]; - } - } - VExp(y, y, n * bs); - for (int i = 0; i < bs; ++i) { - T sum; - if (remain == 1) { - ASum(&y[i * n], &sum, n); - sum = static_cast(1) / sum; - VScal(&sum, &y[i * n], &y[i * n], n); - } else { - for (int j = 0; j < remain; ++j) { - StrideASum(&y[i * n + j], &sum, n, remain); - sum = static_cast(1) / sum; - StrideScal(&sum, &y[i * n + j], &y[i * n + j], n, remain); - } - } - } -} - -template -void Sgd(const T* lr, - const T* param, - const T* grad, - const int64_t* rows, - T* out, - const sgd_attr_t* attr) { - PADDLE_ENFORCE_EQ(attr->param_width, attr->grad_width); - PADDLE_ENFORCE_LE(attr->selected_rows_size, attr->grad_height); - T scalar = -lr[0]; - int width = attr->grad_width; - if (out == param) { - for (int64_t i = 0; i < attr->selected_rows_size; ++i) { - auto h_idx = rows[i]; - PADDLE_ENFORCE_LT(h_idx, attr->param_height); - PADDLE_ENFORCE_GE(h_idx, 0); - VAXPY(scalar, grad + i * width, out + h_idx * width, width); - } - } else { - for (int64_t i = 0; i < attr->selected_rows_size; ++i) { - auto h_idx = rows[i]; - PADDLE_ENFORCE_LT(h_idx, attr->param_height); - PADDLE_ENFORCE_GE(h_idx, 0); - VScal(&scalar, grad + i * width, out + h_idx * width, width); - VAdd(param + h_idx * width, - out + h_idx * width, - out + h_idx * width, - width); - } - } -} - -#define DECLARE_MKL_KERNEL(name) \ - template \ - class name##Kernel : public KernelMore> { \ - public: \ - name##Kernel() { this->func = name; } \ - bool CanBeUsed(const typename name##Tuple::attr_type&) const override; \ - const char* ImplType() const override { return "MKL"; } \ - } - -// ABCMNK -DECLARE_MKL_KERNEL(MatMul); - -// XYZN -DECLARE_MKL_KERNEL(VMul); -DECLARE_MKL_KERNEL(VAdd); - -// AXYN -DECLARE_MKL_KERNEL(VScal); -DECLARE_MKL_KERNEL(StrideScal); - -// XYN -DECLARE_MKL_KERNEL(VExp); -DECLARE_MKL_KERNEL(VSigmoid); -DECLARE_MKL_KERNEL(VTanh); -DECLARE_MKL_KERNEL(VSquare); -DECLARE_MKL_KERNEL(VCopy); - -// others -DECLARE_MKL_KERNEL(SeqPool); -DECLARE_MKL_KERNEL(EmbSeqPool); -DECLARE_MKL_KERNEL(Softmax); -DECLARE_MKL_KERNEL(Sgd); -DECLARE_MKL_KERNEL(VBroadcast); - -#undef DECLARE_MKL_KERNEL - -} // namespace mkl -} // namespace more -} // namespace jit -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/jit/refer/CMakeLists.txt b/lite/backends/x86/jit/refer/CMakeLists.txt deleted file mode 100644 index 7133f59662..0000000000 --- a/lite/backends/x86/jit/refer/CMakeLists.txt +++ /dev/null @@ -1,40 +0,0 @@ - -cc_library(jit_kernel_refer SRCS refer.cc DEPS jit_kernel_base) -set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} jit_kernel_refer PARENT_SCOPE) - -function(USE_JITKERNEL_REFER TARGET) - file(APPEND ${jit_file} "USE_JITKERNEL_REFER(${TARGET});\n") -endfunction() - -# use refer kernel by name -USE_JITKERNEL_REFER(kVMul) -USE_JITKERNEL_REFER(kVAdd) -USE_JITKERNEL_REFER(kVAddRelu) -USE_JITKERNEL_REFER(kVSub) -USE_JITKERNEL_REFER(kVScal) -USE_JITKERNEL_REFER(kStrideScal) -USE_JITKERNEL_REFER(kVAddBias) -USE_JITKERNEL_REFER(kVCopy) -USE_JITKERNEL_REFER(kVRelu) -USE_JITKERNEL_REFER(kVIdentity) -USE_JITKERNEL_REFER(kVExp) -USE_JITKERNEL_REFER(kVSigmoid) -USE_JITKERNEL_REFER(kVTanh) -USE_JITKERNEL_REFER(kLSTMCtHt) -USE_JITKERNEL_REFER(kLSTMC1H1) -USE_JITKERNEL_REFER(kGRUH1) -USE_JITKERNEL_REFER(kGRUHtPart1) -USE_JITKERNEL_REFER(kGRUHtPart2) -USE_JITKERNEL_REFER(kCRFDecoding) -USE_JITKERNEL_REFER(kLayerNorm) -USE_JITKERNEL_REFER(kNCHW16CMulNC) -USE_JITKERNEL_REFER(kSeqPool) -USE_JITKERNEL_REFER(kMatMul) 
-USE_JITKERNEL_REFER(kVSquare) -USE_JITKERNEL_REFER(kHSum) -USE_JITKERNEL_REFER(kHMax) -USE_JITKERNEL_REFER(kStrideASum) -USE_JITKERNEL_REFER(kSoftmax) -USE_JITKERNEL_REFER(kEmbSeqPool) -USE_JITKERNEL_REFER(kSgd) -USE_JITKERNEL_REFER(kVBroadcast) diff --git a/lite/backends/x86/jit/refer/refer.cc b/lite/backends/x86/jit/refer/refer.cc deleted file mode 100644 index e1b1240c5d..0000000000 --- a/lite/backends/x86/jit/refer/refer.cc +++ /dev/null @@ -1,61 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ - -#include "lite/backends/x86/jit/refer/refer.h" -#include "lite/backends/x86/jit/registry.h" - -namespace refer = paddle::lite::jit::refer; - -#define REGISTER_REFER_KERNEL(func) \ - REGISTER_JITKERNEL_REFER( \ - k##func, refer::func##Kernel, refer::func##Kernel) - -REGISTER_REFER_KERNEL(VMul); -REGISTER_REFER_KERNEL(VAdd); -REGISTER_REFER_KERNEL(VAddRelu); -REGISTER_REFER_KERNEL(VSub); - -REGISTER_REFER_KERNEL(VScal); -REGISTER_REFER_KERNEL(StrideScal); -REGISTER_REFER_KERNEL(VAddBias); - -REGISTER_REFER_KERNEL(VRelu); -REGISTER_REFER_KERNEL(VCopy); -REGISTER_REFER_KERNEL(VIdentity); -REGISTER_REFER_KERNEL(VSquare); -REGISTER_REFER_KERNEL(VExp); -REGISTER_REFER_KERNEL(VSigmoid); -REGISTER_REFER_KERNEL(VTanh); - -REGISTER_REFER_KERNEL(LSTMCtHt); -REGISTER_REFER_KERNEL(LSTMC1H1); - -REGISTER_REFER_KERNEL(GRUH1); -REGISTER_REFER_KERNEL(GRUHtPart1); -REGISTER_REFER_KERNEL(GRUHtPart2); - -REGISTER_REFER_KERNEL(CRFDecoding); -REGISTER_REFER_KERNEL(LayerNorm); -REGISTER_REFER_KERNEL(NCHW16CMulNC); -REGISTER_REFER_KERNEL(SeqPool); -REGISTER_REFER_KERNEL(MatMul); -REGISTER_REFER_KERNEL(HMax); -REGISTER_REFER_KERNEL(HSum); -REGISTER_REFER_KERNEL(StrideASum); -REGISTER_REFER_KERNEL(Softmax); -REGISTER_REFER_KERNEL(EmbSeqPool); -REGISTER_REFER_KERNEL(Sgd); -REGISTER_REFER_KERNEL(VBroadcast); - -#undef REGISTER_REFER_KERNEL diff --git a/lite/backends/x86/jit/refer/refer.h b/lite/backends/x86/jit/refer/refer.h deleted file mode 100644 index 119ec7469e..0000000000 --- a/lite/backends/x86/jit/refer/refer.h +++ /dev/null @@ -1,603 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include -#include -#include "lite/backends/x86/jit/helper.h" -#include "lite/backends/x86/jit/kernel_base.h" -#include "lite/backends/x86/jit/macro.h" -#include "lite/utils/paddle_enforce.h" - -namespace paddle { -namespace lite { -namespace jit { -namespace refer { - -// Refer code only focus on correctness -template -void VMul(const T* x, const T* y, T* z, int n) { - for (int i = 0; i < n; ++i) { - z[i] = x[i] * y[i]; - } -} - -template -void VAdd(const T* x, const T* y, T* z, int n) { - for (int i = 0; i < n; ++i) { - z[i] = x[i] + y[i]; - } -} - -template -void VAddRelu(const T* x, const T* y, T* z, int n) { - for (int i = 0; i < n; ++i) { - z[i] = x[i] + y[i]; - z[i] = z[i] > 0 ? z[i] : 0; - } -} - -template -void VSub(const T* x, const T* y, T* z, int n) { - for (int i = 0; i < n; ++i) { - z[i] = x[i] - y[i]; - } -} - -template -void VScal(const T* a, const T* x, T* y, int n) { - for (int i = 0; i < n; ++i) { - y[i] = a[0] * x[i]; - } -} - -template -void VAddBias(const T* a, const T* x, T* y, int n) { - for (int i = 0; i < n; ++i) { - y[i] = a[0] + x[i]; - } -} - -template -void VCopy(const T* x, T* y, int n) { - std::memcpy(y, x, n * sizeof(T)); -} - -// x shape: (x_len) -// y shape: (h, x_len) -template -void VBroadcast(const T* x, T* y, int64_t y_h, int64_t x_len) { - for (int64_t h = 0; h < y_h; ++h) { - VCopy(x, y + h * x_len, x_len); - } -} - -template -void VRelu(const T* x, T* y, int n) { - for (int i = 0; i < n; ++i) { - y[i] = x[i] > 0 ? x[i] : 0; - } -} - -template -inline void VIdentity(const T* x, T* y, int n) { - for (int i = 0; i < n; ++i) { - y[i] = x[i]; - } -} - -template -inline void VSquare(const T* x, T* y, int n) { - for (int i = 0; i < n; ++i) { - y[i] = x[i] * x[i]; - } -} - -template -void VExp(const T* x, T* y, int n) { - for (int i = 0; i < n; ++i) { - y[i] = std::exp(x[i]); - } -} - -template -void VSigmoid(const T* x, T* y, int n) { - // y = 1 / (1 + e^-x) - const T min = SIGMOID_THRESHOLD_MIN; - const T max = SIGMOID_THRESHOLD_MAX; - for (int i = 0; i < n; ++i) { - T tmp = (x[i] < min) ? min : ((x[i] > max) ? 
max : x[i]); - y[i] = static_cast(1) / (static_cast(1) + std::exp(-tmp)); - } -} - -template -void VTanh(const T* x, T* y, int n) { - // y = 2 * sigmoid(2x) - 1 - for (int i = 0; i < n; ++i) { - y[i] = static_cast(2) * x[i]; - } - VSigmoid(y, y, n); - for (int i = 0; i < n; ++i) { - y[i] = static_cast(2) * y[i] - static_cast(1); - } -} - -template -void (*getActFunc(KernelType type))(const T*, T*, int) { // NOLINT - if (type == kVSigmoid) { - return VSigmoid; - } else if (type == kVRelu) { - return VRelu; - } else if (type == kVTanh) { - return VTanh; - } else if (type == kVIdentity) { - return VIdentity; - } - LOG(FATAL) << "Not support type: " << type; - return nullptr; -} - -// TODO(TJ): add refer gemm and make LSTM kernels combine as same GRU kernels - -// compute ct and ht -template -void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr) { - T* gates = reinterpret_cast(step->gates); - const T* ct_1 = reinterpret_cast(step->ct_1); - T* ct = reinterpret_cast(step->ct); - T* ht = reinterpret_cast(step->ht); - const T* wp = reinterpret_cast(step->wp); - T* checked = reinterpret_cast(step->checked); - auto act_gate = getActFunc(attr->act_gate); - auto act_cand = getActFunc(attr->act_cand); - auto act_cell = getActFunc(attr->act_cell); - int d = attr->d; - int d2 = d * 2; - int d3 = d * 3; - // gates: W_ch, W_ih, W_fh, W_oh - if (attr->use_peephole) { - VMul(wp, ct_1, checked, d); - VMul(wp + d, ct_1, checked + d, d); - VAdd(checked, gates + d, gates + d, d2); - act_gate(gates + d, gates + d, d2); - } else { - act_gate(gates + d, gates + d, d3); - } - - // C_t = C_t-1 * fgated + cand_gated * igated - act_cand(gates, gates, d); - VMul(gates, gates + d, gates + d, d); - VMul(ct_1, gates + d2, gates + d2, d); - VAdd(gates + d, gates + d2, ct, d); - - if (attr->use_peephole) { - // get ogated - VMul(wp + d2, ct, gates + d, d); - VAdd(gates + d, gates + d3, gates + d3, d); - act_gate(gates + d3, gates + d3, d); - } - // H_t = act_cell(C_t) * ogated - act_cell(ct, gates + d2, d); - VMul(gates + d2, gates + d3, ht, d); -} - -// compute c1 and h1 without c0 or h0 -template -void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr) { - T* gates = reinterpret_cast(step->gates); - T* ct = reinterpret_cast(step->ct); - T* ht = reinterpret_cast(step->ht); - auto act_gate = getActFunc(attr->act_gate); - auto act_cand = getActFunc(attr->act_cand); - auto act_cell = getActFunc(attr->act_cell); - int d = attr->d; - int d2 = d * 2; - int d3 = d * 3; - /* C_t = igated * cgated*/ - act_gate(gates + d, gates + d, d); - act_cand(gates, gates, d); - VMul(gates, gates + d, ct, d); - if (attr->use_peephole) { - // get outgated, put W_oc * C_t on igated - const T* wp = reinterpret_cast(step->wp); - VMul(wp + d2, ct, gates + d, d); - VAdd(gates + d, gates + d3, gates + d3, d); - } - /* H_t = act_cell(C_t) * ogated */ - act_gate(gates + d3, gates + d3, d); - act_cell(ct, gates + d2, d); - VMul(gates + d2, gates + d3, ht, d); -} - -// compute h1 without h0 -template -void GRUH1(gru_t* step, const gru_attr_t* attr) { - T* gates = reinterpret_cast(step->gates); - T* ht = reinterpret_cast(step->ht); - auto act_gate = getActFunc(attr->act_gate); - auto act_cand = getActFunc(attr->act_cand); - int d = attr->d; - int d2 = d * 2; - act_gate(gates, gates, d); - act_cand(gates + d2, gates + d2, d); - VMul(gates, gates + d2, ht, d); -} - -// compute the first part of GRU: ht = act_gate(r) * ht_1 -template -void GRUHtPart1(gru_t* step, const gru_attr_t* attr) { - // W: {W_update, W_reset; W_state} - T* gates = 
reinterpret_cast(step->gates); - T* ht = reinterpret_cast(step->ht); - const T* ht_1 = reinterpret_cast(step->ht_1); - auto act_gate = getActFunc(attr->act_gate); - act_gate(gates + attr->d, gates + attr->d, attr->d); - VMul(ht_1, gates + attr->d, ht, attr->d); -} - -// compute the second part of GRU: -// ht = act_gate(u) * act_cand(s) + (1-act_gate(u)) * ht_1 -template -void GRUHtPart2(gru_t* step, const gru_attr_t* attr) { - T* gates = reinterpret_cast(step->gates); - T* ht = reinterpret_cast(step->ht); - const T* ht_1 = reinterpret_cast(step->ht_1); - auto act_gate = getActFunc(attr->act_gate); - auto act_cand = getActFunc(attr->act_cand); - int d = attr->d; - T* y = gates + d * 2; - act_gate(gates, gates, d); - act_cand(y, y, d); - // out = zt*ht~ + (1-zt)*ht_1 - for (int i = 0; i < d; ++i) { - ht[i] = gates[i] * y[i] + (static_cast(1) - gates[i]) * ht_1[i]; - } -} - -template -void CRFDecoding(const int seq_len, - const T* x, - const T* w, - T* alpha, - int* track, - int right) { - constexpr int state_trans_base_idx = 2; - for (int i = 0; i < right; ++i) { - alpha[i] = w[i] + x[i]; - } - for (int k = 1; k < seq_len; ++k) { - for (int i = 0; i < right; ++i) { - T max_score = -std::numeric_limits::max(); - int max_j = 0; - for (int j = 0; j < right; ++j) { - T score = alpha[(k - 1) * right + j] + - w[(j + state_trans_base_idx) * right + i]; - if (score > max_score) { - max_score = score; - max_j = j; - } - } - alpha[k * right + i] = max_score + x[k * right + i]; - track[k * right + i] = max_j; - } - } -} - -template -void LayerNorm(T* x, - T* out, - T* mean, - T* var, - const T* scale, - const T* bias, - int height, - const float epsilon, - int right) { - // get mean - for (int i = 0; i < height; i++) { - T sum = 0.0; - int offset = i * right; - for (int j = 0; j < right; j++) { - sum += x[offset + j]; - } - mean[i] = sum / right; - } - - // get variance - for (int i = 0; i < height; i++) { - T sum = 0.0; - int offset = i * right; - for (int j = 0; j < right; j++) { - sum += (x[offset + j] - mean[i]) * (x[offset + j] - mean[i]); - } - var[i] = sum / right; - } - - for (int i = 0; i < height; i++) { - int offset = i * right; - T sqrt_var = std::sqrt(var[i] + (T)epsilon); - for (int j = 0; j < right; j++) { - out[offset + j] = (x[offset + j] - mean[i]) / sqrt_var; - } - } - if (scale) { - for (int i = 0; i < height; i++) { - int offset = i * right; - for (int j = 0; j < right; j++) { - out[offset + j] *= scale[j]; - } - } - } - - if (bias) { - for (int i = 0; i < height; i++) { - int offset = i * right; - for (int j = 0; j < right; j++) { - out[offset + j] += bias[j]; - } - } - } -} - -template -void NCHW16CMulNC(const T* x, const T* y, T* z, int height, int width) { - int offset = 0; - for (int h = 0; h < height; ++h) { - for (int w = 0; w < width; ++w) { - for (int i = 0; i < 16; ++i) { - z[i + offset] = y[i] * x[i + offset]; - } - offset += ZMM_FLOAT_BLOCK; - } - } -} - -template -void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) { - for (int w = 0; w < attr->w; ++w) { - const T* src = x + w; - T* dst = y + w; - *dst = static_cast(0); - for (int h = 0; h < attr->h; ++h) { - *dst = *dst + *src; - src += attr->w; - } - } - if (attr->type == SeqPoolType::kAvg || attr->type == SeqPoolType::kSqrt) { - T scalar = static_cast(1); - if (attr->type == SeqPoolType::kAvg) { - scalar = scalar / static_cast(attr->h); - } else { - scalar = scalar / std::sqrt(static_cast(attr->h)); - } - VScal(&scalar, y, y, attr->w); - } -} - -// A(M,K) * B(K,N) = C(M,N) -template -void MatMul(const T* A, 
const T* B, T* C, const matmul_attr_t* attr) { - int M = attr->m; - int N = attr->n; - int K = attr->k; - for (int m = 0; m < M; ++m) { - const T* pa = A + m * K; - T* pc = C + m * N; - for (int n = 0; n < N; ++n) { - const T* pb = B + n; - pc[n] = pa[0] * pb[0]; - for (int k = 1; k < K; ++k) { - pc[n] += pa[k] * pb[k * N]; - } - } - } -} - -template -void HMax(const T* x, T* res, int n) { - res[0] = x[0]; - for (int i = 1; i < n; ++i) { - res[0] = res[0] < x[i] ? x[i] : res[0]; - } -} - -template -void HSum(const T* x, T* res, int n) { - res[0] = x[0]; - for (int i = 1; i < n; ++i) { - res[0] += x[i]; - } -} - -template -void StrideASum(const T* x, T* res, int n, int stride) { - res[0] = x[0]; - for (int i = stride; i < n; i += stride) { - res[0] += std::abs(x[i]); - } -} - -template -void StrideScal(const T* a, const T* x, T* y, int n, int stride) { - for (int i = 0; i < n; ++i) { - if (i % stride == 0) { - y[i] = x[i] * a[0]; - } else { - y[i] = x[i]; - } - } -} - -// y = e^(x - max(x)) -// y = y / sum(y) -// remain is the product of dimension shapes after the axis dimension -template -void Softmax(const T* x, T* y, int n, int bs = 1, int remain = 1) { - for (int i = 0; i < bs; ++i) { - T scalar; - HMax(x, &scalar, n); - scalar = static_cast(0) - scalar; - VAddBias(&scalar, x, y, n); // x - max - VExp(y, y, n); - if (remain == 1) { - HSum(y, &scalar, n); - scalar = static_cast(1) / scalar; - VScal(&scalar, y, y, n); - } else { - for (int j = 0; j < remain; j++) { - StrideASum(&y[j], &scalar, n, remain); - scalar = static_cast(1) / scalar; - StrideScal(&scalar, &y[j], &y[j], n, remain); - } - } - x += n; - y += n; - } -} - -// embedding seq pool -// table is a matrix with (tbl_h, tbl_w) -// idx is a matrix with (idx_h, idx_w) -// output is a vector with length tbl_w * idx_w -template -void EmbSeqPool(const T* table, - const int64_t* idx, - T* out, - const emb_seq_pool_attr_t* attr) { - PADDLE_ENFORCE_EQ(attr->table_width * attr->index_width, attr->out_width); - - auto check_idx_value_valid = [&](int64_t i) { - PADDLE_ENFORCE_LT( - idx[i], attr->table_height, "idx value: %d, i: %d", idx[i], i); - PADDLE_ENFORCE_GE(idx[i], 0, "idx value: %d, i: %d", idx[i], i); - }; - - for (int64_t w = 0; w != attr->index_width; ++w) { - check_idx_value_valid(w); - std::memcpy(out + w * attr->table_width, - table + idx[w] * attr->table_width, - attr->table_width * sizeof(T)); - } - - for (int64_t h = 1; h < attr->index_height; ++h) { - for (int64_t w = 0; w < attr->index_width; ++w) { - int64_t i = h * attr->index_width + w; - check_idx_value_valid(i); - VAdd(table + idx[i] * attr->table_width, - out + w * attr->table_width, - out + w * attr->table_width, - attr->table_width); - } - } -} - -// SGD algorithm: -// lr is pointor of learning rate scalar -// param is an input matrix with (param_h, param_w) -// grad is an input matrix with (grad_h, grad_w), here grad_w == param_w -// selected_rows is a vectot with size selected_rows_size( <= grad_h ) -// out is an output matrix with (param_h, param_w) -// -// support both regular and sparse grad -// regular SGD: out[:] = param[:] - lr[0] * grad[:]; -// sparse SGD: out[rows[i]][:] = param[rows[i]][:] - lr[0] * grad[i][:] -// -// Note: when use sparse SGD, and if out != param, -// the out rows which are not selected have not beed changed, which maybe empty -template -void Sgd(const T* lr, - const T* param, - const T* grad, - const int64_t* rows, - T* out, - const lite::jit::sgd_attr_t* attr) { - PADDLE_ENFORCE_EQ(attr->param_width, attr->grad_width); - 
PADDLE_ENFORCE_LE(attr->selected_rows_size, attr->grad_height); - for (int64_t i = 0; i < attr->selected_rows_size; ++i) { - auto h_idx = rows[i]; - PADDLE_ENFORCE_LT(h_idx, attr->param_height); - PADDLE_ENFORCE_GE(h_idx, 0); - for (int64_t j = 0; j < attr->grad_width; ++j) { - out[h_idx * attr->grad_width + j] = - param[h_idx * attr->grad_width + j] - - lr[0] * grad[i * attr->grad_width + j]; - } - } -} - -#define DECLARE_REFER_KERNEL(name) \ - template \ - class name##Kernel : public lite::jit::ReferKernel> { \ - public: \ - name##Kernel() { this->func = name; } \ - } - -// const T* x, const T* y, T* z, int n -DECLARE_REFER_KERNEL(VMul); -DECLARE_REFER_KERNEL(VAdd); -DECLARE_REFER_KERNEL(VAddRelu); -DECLARE_REFER_KERNEL(VSub); - -// const T* a, const T* x, T* y, int n -DECLARE_REFER_KERNEL(VScal); -DECLARE_REFER_KERNEL(VAddBias); - -// const T* a, const T* x, T* y, int n, int stride -DECLARE_REFER_KERNEL(StrideScal); - -// const T* x, T* y, int n -DECLARE_REFER_KERNEL(VRelu); -DECLARE_REFER_KERNEL(VIdentity); -DECLARE_REFER_KERNEL(VExp); -DECLARE_REFER_KERNEL(VSigmoid); -DECLARE_REFER_KERNEL(VTanh); -DECLARE_REFER_KERNEL(VSquare); -DECLARE_REFER_KERNEL(VCopy); - -// lstm_t*, const lstm_attr_t* -DECLARE_REFER_KERNEL(LSTMCtHt); -DECLARE_REFER_KERNEL(LSTMC1H1); - -// gru_t*, const gru_attr_t* -DECLARE_REFER_KERNEL(GRUH1); -DECLARE_REFER_KERNEL(GRUHtPart1); -DECLARE_REFER_KERNEL(GRUHtPart2); - -DECLARE_REFER_KERNEL(HMax); -DECLARE_REFER_KERNEL(HSum); - -DECLARE_REFER_KERNEL(StrideASum); - -// others -DECLARE_REFER_KERNEL(CRFDecoding); -DECLARE_REFER_KERNEL(LayerNorm); -DECLARE_REFER_KERNEL(NCHW16CMulNC); -DECLARE_REFER_KERNEL(SeqPool); -DECLARE_REFER_KERNEL(MatMul); -DECLARE_REFER_KERNEL(Softmax); -DECLARE_REFER_KERNEL(EmbSeqPool); -DECLARE_REFER_KERNEL(Sgd); -DECLARE_REFER_KERNEL(VBroadcast); - -#undef DECLARE_REFER_KERNEL - -} // namespace refer -} // namespace jit -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/jit/registry.h b/lite/backends/x86/jit/registry.h deleted file mode 100644 index 7613a8dd43..0000000000 --- a/lite/backends/x86/jit/registry.h +++ /dev/null @@ -1,178 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ - -#pragma once - -#include -#include -#include -#include // for std::move -#include "lite/backends/x86/cpu_info.h" -#include "lite/backends/x86/jit/kernel_base.h" -#include "lite/backends/x86/jit/kernel_pool.h" -#include "lite/backends/x86/legacy_place.h" -#include "lite/utils/macros.h" - -namespace paddle { -namespace lite { -namespace jit { - -// make_unique is supported since c++14 -template -inline std::unique_ptr make_unique(Args&&... 
args) { - static_assert(!std::is_array::value, "T must not be array"); - return std::unique_ptr(new T(std::forward(args)...)); -} - -template -struct JitKernelRegistrarFunctor; - -template -struct JitKernelRegistrarFunctor { - void operator()(KernelType kt) const {} -}; - -template -struct JitKernelRegistrarFunctor { - using KERNEL_IMPL_TYPE = - typename std::tuple_element>::type; - - void operator()(KernelType kt) const { - KernelKey kkey(kt, PlaceType()); - Pool::Instance().Insert(kkey, - std::move(make_unique())); - constexpr auto size = std::tuple_size>::value; - JitKernelRegistrarFunctor - func; - func(kt); - } -}; - -template -class JitKernelRegistrar { - public: - explicit JitKernelRegistrar(KernelType kt) { - JitKernelRegistrarFunctor func; - func(kt); - } - void Touch() {} -}; - -#define STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE(uniq_name, msg) \ - struct __test_global_namespace_##uniq_name##__ {}; \ - static_assert(std::is_same<::__test_global_namespace_##uniq_name##__, \ - __test_global_namespace_##uniq_name##__>::value, \ - msg) - -// Refer always on CPUPlace -#define REGISTER_JITKERNEL_REFER(kernel_type, ...) \ - STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ - __reg_jitkernel_##kernel_type##_refer_CPUPlace, \ - "REGISTER_KERNEL_REFER must be called in global namespace"); \ - static ::paddle::lite::jit::JitKernelRegistrar< \ - ::paddle::lite::jit::ReferKernelPool, \ - ::paddle::lite::fluid::CPUPlace, \ - __VA_ARGS__> \ - __jit_kernel_registrar_##kernel_type##_refer_CPUPlace_( \ - ::paddle::lite::jit::KernelType::kernel_type); \ - int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_() { \ - __jit_kernel_registrar_##kernel_type##_refer_CPUPlace_.Touch(); \ - return 0; \ - } - -// kernel_type: should be in paddle::lite::jit::KernelType -// place_type: should be one of CPUPlace and GPUPlace in paddle::platform -#define REGISTER_KERNEL_MORE(kernel_type, impl_type, place_type, ...) \ - STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ - __reg_jitkernel_##kernel_type##_##impl_type##_##place_type, \ - "REGISTER_KERNEL_MORE must be called in global namespace"); \ - extern int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ - static int __assert_##kernel_type##_##impl_type##_##place_type##_has_refer_ \ - UNUSED = TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ - static ::paddle::lite::jit::JitKernelRegistrar< \ - ::paddle::lite::jit::KernelPool, \ - ::paddle::lite::fluid::place_type, \ - __VA_ARGS__> \ - __jit_kernel_registrar_##kernel_type##_##impl_type##_##place_type##_( \ - ::paddle::lite::jit::KernelType::kernel_type); \ - int TouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_() { \ - __jit_kernel_registrar_##kernel_type##_##impl_type##_##place_type##_ \ - .Touch(); \ - return 0; \ - } - -#define REGISTER_JITKERNEL_MORE(kernel_type, impl_type, ...) \ - REGISTER_KERNEL_MORE(kernel_type, impl_type, CPUPlace, __VA_ARGS__) - -#define REGISTER_GPUKERNEL_MORE(kernel_type, impl_type, ...) \ - REGISTER_KERNEL_MORE(kernel_type, impl_type, GPUPlace, __VA_ARGS__) - -#define REGISTER_JITKERNEL_GEN(kernel_type, ...) 
\ - STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ - __reg_jitkernel_gen_##kernel_type##_CPUPlace_, \ - "REGISTER_JITKERNEL_GEN must be called in global namespace"); \ - extern int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ - static int __assert_gen_##kernel_type##_has_refer_ UNUSED = \ - TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ - static ::paddle::lite::jit::JitKernelRegistrar< \ - ::paddle::lite::jit::JitCodeCreatorPool, \ - ::paddle::lite::fluid::CPUPlace, \ - __VA_ARGS__> \ - __jit_kernel_registrar_gen_##kernel_type##_CPUPlace_( \ - ::paddle::lite::jit::KernelType::kernel_type); \ - int TouchJitKernelReg_gen_##kernel_type##_CPUPlace_() { \ - __jit_kernel_registrar_gen_##kernel_type##_CPUPlace_.Touch(); \ - return 0; \ - } - -#define USE_JITKERNEL_GEN(kernel_type) \ - STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ - __reg_jitkernel_gen_##kernel_type##_CPUPlace_, \ - "USE_JITKERNEL_GEN must be called in global namespace"); \ - extern int TouchJitKernelReg_gen_##kernel_type##_CPUPlace_(); \ - static int use_jitkernel_gen_##kernel_type##_CPUPlace_ UNUSED = \ - TouchJitKernelReg_gen_##kernel_type##_CPUPlace_() - -#define USE_JITKERNEL_REFER(kernel_type) \ - STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ - __reg_jitkernel_##kernel_type##_refer_CPUPlace_, \ - "USE_JITKERNEL_REFER must be called in global namespace"); \ - extern int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ - static int use_jitkernel_##kernel_type##_refer_CPUPlace_ UNUSED = \ - TouchJitKernelReg_##kernel_type##_refer_CPUPlace_() - -#define USE_KERNEL_MORE(kernel_type, impl_type, place_type) \ - STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ - __reg_jitkernel_##kernel_type##_##impl_type##_##place_type##_, \ - "USE_JITKERNEL_MORE must be called in global namespace"); \ - extern int \ - TouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_(); \ - static int use_jitkernel_##kernel_type##_##impl_type##_##place_type##_ \ - UNUSED = \ - TouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_() - -#define USE_JITKERNEL_MORE(kernel_type, impl_type) \ - USE_KERNEL_MORE(kernel_type, impl_type, CPUPlace) - -} // namespace jit -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/jit/test.cc b/lite/backends/x86/jit/test.cc deleted file mode 100644 index aafcad579f..0000000000 --- a/lite/backends/x86/jit/test.cc +++ /dev/null @@ -1,1447 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
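Editorial aside before the deleted test file below (not part of the patch): the registration macros above read more easily next to a typical invocation. The following sketch is illustrative only; kVMul and refer::VMulKernel match names visible elsewhere in this diff, while the mkl lines are an assumed example of how an extra impl_type would be wired in.

// In the refer implementation's .cc, invoked from the global namespace:
REGISTER_JITKERNEL_REFER(kVMul,
                         refer::VMulKernel<float>,
                         refer::VMulKernel<double>);
// An optional faster implementation of the same KernelType; "mkl" is an
// assumed impl_type here:
REGISTER_JITKERNEL_MORE(kVMul, mkl,
                        mkl::VMulKernel<float>,
                        mkl::VMulKernel<double>);
// Any object file that needs the kernel forces the registrars to link in:
USE_JITKERNEL_REFER(kVMul);
USE_JITKERNEL_MORE(kVMul, mkl);

Note the "more" and "gen" macros deliberately touch the refer registrar first, so registering a non-refer kernel without its refer fallback fails at link time.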
*/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include "lite/backends/x86/cpu_info.h" -#include "lite/backends/x86/jit/kernels.h" -#include "lite/backends/x86/legacy_place.h" - -DEFINE_double(acc, 1e-5, "Test accuracy threshold."); - -template -void RandomVec(const int n, - T* a, - const T lower = static_cast(-2.f), - const T upper = static_cast(2.f)) { - static unsigned int seed = 100; - std::mt19937 rng(seed++); - std::uniform_real_distribution uniform_dist(0, 1); - for (int i = 0; i < n; ++i) { - a[i] = static_cast(uniform_dist(rng) * (upper - lower) + lower); - } -} - -template -void ExpectEQ(const T* target, const T* refer, size_t n) { - if (std::is_floating_point::value) { - for (size_t i = 0; i < n; ++i) { - EXPECT_NEAR(target[i], refer[i], FLAGS_acc) << " at index : " << i; - } - } else { - for (size_t i = 0; i < n; ++i) { - EXPECT_EQ(target[i], refer[i]) << " at index : " << i; - } - } -} - -std::vector TestSizes() { - std::vector s; - for (int i = 1; i < 32; ++i) { - s.push_back(i); - } - // test some large size - s.push_back(100); - s.push_back(1000); - s.push_back(2000); - return s; -} - -namespace jit = paddle::lite::jit; -using CPUPlace = paddle::lite::fluid::CPUPlace; - -template -void TestAllImpls(const typename KernelTuple::attr_type& attr, - const Tester& verifier, - const Args&... args) { - auto funcs = jit::GetAllCandidateFuncsWithTypes(attr); - for (auto f : funcs) { - VLOG(10) << "Test Kernel " << f.first; - verifier(f.second, args...); - } -} - -template -void TestKernelXYZN() { - using T = typename KernelTuple::data_type; - VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); - for (int d : TestSizes()) { - auto ref = jit::GetReferFunc(); - EXPECT_TRUE(ref != nullptr); - - std::vector x(d), y(d), zref(d); - RandomVec(d, x.data()); - RandomVec(d, y.data()); - - std::vector xinp(d), yinp(d); // inplace test - std::copy(x.begin(), x.end(), xinp.begin()); - std::copy(y.begin(), y.end(), yinp.begin()); - - const T* x_data = x.data(); - const T* y_data = y.data(); - T* zref_data = zref.data(); - T* xinp_data = xinp.data(); - T* yinp_data = yinp.data(); - - // test refer code inplace - ref(x_data, y_data, zref_data, d); - ref(x_data, yinp_data, yinp_data, d); - ref(xinp_data, y_data, xinp_data, d); - ExpectEQ(xinp_data, zref_data, d); - ExpectEQ(yinp_data, zref_data, d); - - auto verifier = [](const typename KernelTuple::func_type tgt, - const std::vector& x, - const std::vector& y, - const std::vector& zref) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(zref.size(), x.size()); - EXPECT_EQ(zref.size(), y.size()); - const T* x_data = x.data(); - const T* y_data = y.data(); - const T* zref_data = zref.data(); - const int d = zref.size(); - - std::vector ztgt(d); - T* ztgt_data = ztgt.data(); - // test normal - tgt(x_data, y_data, ztgt_data, d); - ExpectEQ(ztgt_data, zref_data, d); - // test inplace x - std::copy(x.begin(), x.end(), ztgt.begin()); - tgt(ztgt_data, y_data, ztgt_data, d); - ExpectEQ(ztgt_data, zref_data, d); - // test inplace y - std::copy(y.begin(), y.end(), ztgt.begin()); - tgt(x_data, ztgt_data, ztgt_data, d); - ExpectEQ(ztgt_data, zref_data, d); - }; - - TestAllImpls(d, verifier, x, y, zref); - } -} - -template -void TestKernelAXYN() { - using T = typename KernelTuple::data_type; - VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); - for (int d : TestSizes()) { - auto ref = jit::GetReferFunc(); - EXPECT_TRUE(ref != nullptr); - - const T a = static_cast(3); - std::vector 
x(d), yref(d); - std::vector xinp(d); // inplace test - RandomVec(d, x.data()); - std::copy(x.begin(), x.end(), xinp.begin()); - - const T* x_data = x.data(); - T* yref_data = yref.data(); - T* xinp_data = xinp.data(); - // test refer code inplace - ref(&a, x_data, yref_data, d); - ref(&a, xinp_data, xinp_data, d); - ExpectEQ(xinp_data, yref_data, d); - - auto verifier = [](const typename KernelTuple::func_type tgt, - const T a, - const std::vector& x, - const std::vector& yref) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(yref.size(), x.size()); - const T* x_data = x.data(); - const T* yref_data = yref.data(); - const int d = yref.size(); - std::vector ytgt(d); - T* ytgt_data = ytgt.data(); - // test normal - tgt(&a, x_data, ytgt_data, d); - ExpectEQ(ytgt_data, yref_data, d); - // test inplace x - std::copy(x.begin(), x.end(), ytgt.begin()); - tgt(&a, ytgt_data, ytgt_data, d); - ExpectEQ(ytgt_data, yref_data, d); - }; - TestAllImpls(d, verifier, a, x, yref); - } -} - -template -void TestKernelXYN() { - using T = typename KernelTuple::data_type; - VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); - for (int d : TestSizes()) { - auto ref = jit::GetReferFunc(); - EXPECT_TRUE(ref != nullptr); - - std::vector x(d), yref(d); - std::vector xinp(d); // inplace test - RandomVec(d, x.data()); - std::copy(x.begin(), x.end(), xinp.begin()); - - const T* x_data = x.data(); - T* yref_data = yref.data(); - T* xinp_data = xinp.data(); - // test refer code inplace - ref(x_data, yref_data, d); - ref(xinp_data, xinp_data, d); - ExpectEQ(xinp_data, yref_data, d); - auto verifier = [](const typename KernelTuple::func_type tgt, - const std::vector& x, - const std::vector& yref) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(yref.size(), x.size()); - const T* x_data = x.data(); - const T* yref_data = yref.data(); - const int d = yref.size(); - std::vector ytgt(d); - T* ytgt_data = ytgt.data(); - // test normal - tgt(x_data, ytgt_data, d); - ExpectEQ(ytgt_data, yref_data, d); - // test inplace x - std::copy(x.begin(), x.end(), ytgt.begin()); - tgt(ytgt_data, ytgt_data, d); - ExpectEQ(ytgt_data, yref_data, d); - }; - TestAllImpls(d, verifier, x, yref); - } -} - -template -void TestKernelXRN() { - using T = typename KernelTuple::data_type; - VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); - auto last_acc = FLAGS_acc; - FLAGS_acc = 1e-4; - for (int d : TestSizes()) { - auto ref = jit::GetReferFunc(); - EXPECT_TRUE(ref != nullptr); - std::vector x(d); - RandomVec(d, x.data()); - T ref_res; - ref(x.data(), &ref_res, d); - - auto verifier = [](const typename KernelTuple::func_type tgt, - const std::vector& x, - const T ref_res) { - EXPECT_TRUE(tgt != nullptr); - T tgt_res; - tgt(x.data(), &tgt_res, x.size()); - ExpectEQ(&tgt_res, &ref_res, 1); - }; - TestAllImpls(d, verifier, x, ref_res); - } - FLAGS_acc = last_acc; -} - -template -void TestKernelLSTM() { - using T = typename KernelTuple::data_type; - VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); - std::vector all_acts = {"sigmoid", "tanh", "relu", "identity"}; - auto test_sizes = TestSizes(); - test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000)); - for (int d : test_sizes) { - for (bool use_peephole : {true, false}) { - for (auto& act_gate : all_acts) { - for (auto& act_cand : all_acts) { - for (auto& act_cell : all_acts) { - const jit::lstm_attr_t attr(d, - jit::to_kerneltype(act_gate), - jit::to_kerneltype(act_cand), - jit::to_kerneltype(act_cell), - 
use_peephole); - auto ref = jit::GetReferFunc(); - EXPECT_TRUE(ref != nullptr); - std::vector xsrc(4 * d), wp(3 * d), ct_1(d); - std::vector ct_ref(d), ht_ref(d), checked(2 * d); - RandomVec(4 * d, xsrc.data()); - RandomVec(3 * d, wp.data(), -1.f, 1.f); - RandomVec(d, ct_1.data(), -1.f, 1.f); - // x could be changed after compute, so copy to save src - std::vector x(xsrc.size()); - std::copy(xsrc.begin(), xsrc.end(), x.begin()); - const T* ct_1_data = ct_1.data(); - const T* wp_data = wp.data(); - T* x_data = x.data(); - T* checked_data = checked.data(); - T* ct_ref_data = ct_ref.data(); - T* ht_ref_data = ht_ref.data(); - jit::lstm_t step; - step.gates = x_data; - step.ct_1 = ct_1_data; - step.ct = ct_ref_data; - step.ht = ht_ref_data; - if (use_peephole) { - step.wp = wp_data; - step.checked = checked_data; - } - ref(&step, &attr); - VLOG(10) << attr; - - auto verifier = [](const typename KernelTuple::func_type tgt, - const std::vector& xsrc, - const std::vector& wp, - const std::vector& ct_1, - const std::vector& ct_ref, - const std::vector& ht_ref, - const typename KernelTuple::attr_type& attr) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(ct_ref.size(), ht_ref.size()); - EXPECT_EQ(ct_1.size(), ht_ref.size()); - EXPECT_EQ(xsrc.size(), 4 * ht_ref.size()); - EXPECT_EQ(wp.size(), 3 * ht_ref.size()); - - // x could be changed after compute, so copy to save src - int d = ht_ref.size(); - std::vector x(xsrc.size()), ct(ct_ref.size()), - ht(ht_ref.size()); - std::vector checked(2 * d); - std::copy(xsrc.begin(), xsrc.end(), x.begin()); - - const T* ct_1_data = ct_1.data(); - const T* wp_data = wp.data(); - const T* ct_ref_data = ct_ref.data(); - const T* ht_ref_data = ht_ref.data(); - T* x_data = x.data(); - T* ct_data = ct.data(); - T* ht_data = ht.data(); - T* checked_data = checked.data(); - - jit::lstm_t step; - step.gates = x_data; - step.ct_1 = ct_1_data; - step.ct = ct_data; - step.ht = ht_data; - if (attr.use_peephole) { - step.wp = wp_data; - step.checked = checked_data; - } - - tgt(&step, &attr); - ExpectEQ(ct_data, ct_ref_data, d); - ExpectEQ(ht_data, ht_ref_data, d); - }; - TestAllImpls( - attr, verifier, xsrc, wp, ct_1, ct_ref, ht_ref, attr); - } - } - } - } - } -} - -template -void TestKernelGRU() { - using T = typename KernelTuple::data_type; - VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); - std::vector all_acts = {"sigmoid", "tanh", "relu", "identity"}; - auto test_sizes = TestSizes(); - test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000)); - for (int d : test_sizes) { - for (auto& act_gate : all_acts) { - for (auto& act_cand : all_acts) { - const jit::gru_attr_t attr( - d, jit::to_kerneltype(act_gate), jit::to_kerneltype(act_cand)); - auto ref = jit::GetReferFunc(); - EXPECT_TRUE(ref != nullptr); - std::vector xsrc(3 * d), ht_1(d), ht_ref(d); - RandomVec(3 * d, xsrc.data()); - RandomVec(d, ht_1.data()); - // x could be changed after compute, so copy to save src - std::vector x(xsrc.size()); - std::copy(xsrc.begin(), xsrc.end(), x.begin()); - const T* ht_1_data = ht_1.data(); - T* x_data = x.data(); - T* ht_ref_data = ht_ref.data(); - jit::gru_t step; - step.gates = x_data; - step.ht_1 = ht_1_data; - step.ht = ht_ref_data; - ref(&step, &attr); - VLOG(10) << attr; - auto verifier = [](const typename KernelTuple::func_type tgt, - const std::vector& xsrc, - const std::vector& ht_1, - const std::vector& ht_ref, - const typename KernelTuple::attr_type& attr) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(ht_1.size(), ht_ref.size()); - 
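// (Editorial note, not original code: ht_1 and ht hold d elements while
// the gates buffer holds 3 * d, one d-wide block each for the update
// gate, the reset gate and the candidate state; the size check on xsrc
// just below encodes that layout.)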
EXPECT_EQ(xsrc.size(), 3 * ht_ref.size()); - - // x could be changed after compute, so copy to save src - int d = ht_ref.size(); - std::vector x(xsrc.size()), ht(ht_ref.size()); - std::copy(xsrc.begin(), xsrc.end(), x.begin()); - const T* ht_1_data = ht_1.data(); - const T* ht_ref_data = ht_ref.data(); - T* x_data = x.data(); - T* ht_data = ht.data(); - jit::gru_t step; - step.gates = x_data; - step.ht_1 = ht_1_data; - step.ht = ht_data; - tgt(&step, &attr); - ExpectEQ(ht_data, ht_ref_data, d); - }; - TestAllImpls( - attr, verifier, xsrc, ht_1, ht_ref, attr); - } - } - } -} - -template -void TestKernelNCHW16CMulNC() { - using T = typename KernelTuple::data_type; - VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); - const int n = 3, c = 16 * 4, h = 10, w = 10; - auto ref = jit::GetReferFunc(); - EXPECT_TRUE(ref != nullptr); - int sz = n * c * h * w; - std::vector x(sz), y(n * c), zref(sz); - std::vector ztgt(sz), zjit(sz); - RandomVec(sz, x.data()); - RandomVec(n * c, y.data()); - - const T* x_data = x.data(); - const T* y_data = y.data(); - T* zref_data = zref.data(); - T* ztgt_data = ztgt.data(); - T* zjit_data = zjit.data(); - constexpr int simd_width = ZMM_FLOAT_BLOCK; - int C = c / simd_width; - auto tgt = jit::KernelFuncs::Cache().At(0); - auto funcs = jit::GetAllCandidateFuncs(0); - EXPECT_GT(funcs.size(), 0UL); - auto jitcode = funcs[0]; - EXPECT_TRUE(tgt != nullptr); - - if (std::is_same::value && - paddle::lite::x86::MayIUse(paddle::lite::x86::avx512f)) { - EXPECT_TRUE(jitcode != nullptr); - } - for (int ni = 0; ni < n; ni++) { - for (int ci = 0; ci < C; ci++) { - auto ptr_x = - x_data + ni * C * h * w * simd_width + ci * h * w * simd_width; - auto ptr_y = y_data + ni * C * simd_width + ci * simd_width; - auto ptr_zref = - zref_data + ni * C * h * w * simd_width + ci * h * w * simd_width; - auto ptr_ztgt = - ztgt_data + ni * C * h * w * simd_width + ci * h * w * simd_width; - - ref(ptr_x, ptr_y, ptr_zref, h, w); - tgt(ptr_x, ptr_y, ptr_ztgt, h, w); - - if (jitcode) { - auto ptr_zjit = - zjit_data + ni * C * h * w * simd_width + ci * h * w * simd_width; - jitcode(ptr_x, ptr_y, ptr_zjit, h, w); - } - } - } - ExpectEQ(ztgt_data, zref_data, sz); - if (jitcode) { - ExpectEQ(zjit_data, zref_data, sz); - } -} - -template -void TestKernelLayerNorm() { - using T = typename KernelTuple::data_type; - VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); - const T epsilon = 9.99999975e-06; - for (int n : {1, 2, 10}) { - for (int x_dim_0 : {1, 9, 17, 50}) { - int left = n * x_dim_0; - for (int x_dim_1 : TestSizes()) { - int right = x_dim_1; - auto ref = jit::GetReferFunc(); - EXPECT_TRUE(ref != nullptr); - int sz = left * right; - std::vector x(sz), mean(left), var(left), scale(right), bias(right), - outref(sz); - RandomVec(sz, x.data()); - RandomVec(left, mean.data()); - RandomVec(left, var.data()); - RandomVec(right, scale.data()); - RandomVec(right, bias.data()); - - const T* scale_data = scale.data(); - const T* bias_data = bias.data(); - T* x_data = x.data(); - T* mean_data = mean.data(); - T* var_data = var.data(); - T* outref_data = outref.data(); - - ref(x_data, - outref_data, - mean_data, - var_data, - scale_data, - bias_data, - left, - epsilon, - right); - - auto verifier = [](const typename KernelTuple::func_type tgt, - const std::vector& x_, - const std::vector& outref_, - const std::vector& mean_, - const std::vector& var_, - const std::vector& scale, - const std::vector& bias, - const int& left, - const float& epsilon, - const 
typename KernelTuple::attr_type& right) { - EXPECT_TRUE(tgt != nullptr); - std::vector outtgt(outref_.size()); - std::vector x(x_.size()); - std::vector mean(mean_.size()); - std::vector var(var_.size()); - std::vector outref(outref_.size()); - std::copy(x_.begin(), x_.end(), x.begin()); - std::copy(mean_.begin(), mean_.end(), mean.begin()); - std::copy(var_.begin(), var_.end(), var.begin()); - std::copy(outref_.begin(), outref_.end(), outref.begin()); - - EXPECT_EQ(x.size(), static_cast(left * right)); - EXPECT_EQ(outref.size(), static_cast(left * right)); - EXPECT_EQ(mean.size(), static_cast(left)); - EXPECT_EQ(var.size(), static_cast(left)); - EXPECT_EQ(scale.size(), static_cast(right)); - EXPECT_EQ(bias.size(), static_cast(right)); - - const T* scale_data = scale.data(); - const T* bias_data = bias.data(); - T* x_data = x.data(); - T* mean_data = mean.data(); - T* var_data = var.data(); - T* outref_data = outref.data(); - T* outtgt_data = outtgt.data(); - tgt(x_data, - outtgt_data, - mean_data, - var_data, - scale_data, - bias_data, - left, - epsilon, - right); - ExpectEQ(outtgt_data, outref_data, left * right); - }; - TestAllImpls(right, - verifier, - x, - outref, - mean, - var, - scale, - bias, - left, - epsilon, - right); - } - } - } -} - -template -void TestKernelCRFDecoding() { - using T = typename KernelTuple::data_type; - VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); - constexpr int state_trans_base_idx = 2; - auto test_sizes = TestSizes(); - test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 2000)); - for (int seq_len : {1, 11, 17, 50}) { - for (int tag_num : test_sizes) { - auto ref = jit::GetReferFunc(); - EXPECT_TRUE(ref != nullptr); - int x_sz = seq_len * tag_num; - int w_sz = (tag_num + state_trans_base_idx) * tag_num; - std::vector x(x_sz), w(w_sz), alpharef(x_sz); - std::vector trackref(x_sz); - RandomVec(x_sz, x.data()); - RandomVec(w_sz, w.data()); - - ref(seq_len, - (const T*)x.data(), - (const T*)w.data(), - alpharef.data(), - trackref.data(), - tag_num); - - auto verifier = [](const typename KernelTuple::func_type tgt, - const int& seq_len, - const std::vector& x, - const std::vector& w, - const std::vector& alpharef, - const std::vector& trackref, - const typename KernelTuple::attr_type& tag_num) { - constexpr int state_trans_base_idx = 2; - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(x.size(), static_cast(seq_len * tag_num)); - EXPECT_EQ( - w.size(), - static_cast((tag_num + state_trans_base_idx) * tag_num)); - EXPECT_EQ(alpharef.size(), static_cast(seq_len * tag_num)); - EXPECT_EQ(trackref.size(), static_cast(seq_len * tag_num)); - std::vector alphatgt(alpharef.size()); - std::vector tracktgt(trackref.size()); - memcpy(tracktgt.data(), trackref.data(), tag_num * sizeof(int)); - tgt(seq_len, - (const T*)x.data(), - (const T*)w.data(), - alphatgt.data(), - tracktgt.data(), - tag_num); - ExpectEQ(alpharef.data(), alphatgt.data(), seq_len * tag_num); - ExpectEQ(trackref.data(), tracktgt.data(), seq_len * tag_num); - }; - TestAllImpls( - tag_num, verifier, seq_len, x, w, alpharef, trackref, tag_num); - } - } -} - -template -void TestKernelSeqPool() { - using T = typename KernelTuple::data_type; - VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); - std::vector pool_types = { - jit::SeqPoolType::kSum, jit::SeqPoolType::kAvg, jit::SeqPoolType::kSqrt}; - auto test_sizes = TestSizes(); - test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000)); - for (auto type : pool_types) { - for (int 
w : test_sizes) { - jit::seq_pool_attr_t attr(w, type); - for (int h : test_sizes) { - attr.h = h; - auto ref = jit::GetReferFunc(); - EXPECT_TRUE(ref != nullptr); - std::vector x(h * w), yref(w); - RandomVec(h * w, x.data()); - const T* x_data = x.data(); - T* yref_data = yref.data(); - ref(x_data, yref_data, &attr); - VLOG(10) << attr; - auto verifier = [](const typename KernelTuple::func_type tgt, - const std::vector& x, - const std::vector& yref, - const typename KernelTuple::attr_type& attr) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(x.size() % yref.size(), static_cast(0)); - int w = yref.size(); - std::vector y(w); - const T* x_data = x.data(); - const T* yref_data = yref.data(); - T* y_data = y.data(); - tgt(x_data, y_data, &attr); - ExpectEQ(y_data, yref_data, w); - }; - TestAllImpls(attr, verifier, x, yref, attr); - } - } - } -} - -template -void TestKernelEmbSeqPool() { - using T = typename KernelTuple::data_type; - VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); - int64_t tbl_h = 1e4; - std::vector pool_types = { - jit::SeqPoolType::kSum}; // only support sum yet - auto test_sizes = TestSizes(); - test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000)); - for (int tbl_w : test_sizes) { - std::vector table(tbl_h * tbl_w); - RandomVec(tbl_h * tbl_w, table.data()); - const T* table_data = table.data(); - for (auto type : pool_types) { - for (int idx_w : {1, 2, 10, 16}) { - for (int idx_h : {1, 2, 9, 13, 16}) { - auto ref = jit::GetReferFunc(); - EXPECT_TRUE(ref != nullptr); - std::vector idx(idx_h * idx_w); - RandomVec(idx_h * idx_w, idx.data(), 0, tbl_h - 1); - int64_t out_w = tbl_w * idx_w; - std::vector oref(out_w); - const int64_t* idx_data = idx.data(); - T* o_data = oref.data(); - jit::emb_seq_pool_attr_t attr( - tbl_h, tbl_w, idx_h, idx_w, out_w, type); - ref(table_data, idx_data, o_data, &attr); - - auto verifier = [](const typename KernelTuple::func_type tgt, - const std::vector& table, - const std::vector& idx, - const std::vector& oref, - const typename KernelTuple::attr_type& attr) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ( - table.size(), - static_cast(attr.table_height * attr.table_width)); - EXPECT_EQ( - idx.size(), - static_cast(attr.index_height * attr.index_width)); - EXPECT_EQ(oref.size(), - static_cast(attr.table_width * attr.index_width)); - const T* table_data = table.data(); - const int64_t* idx_data = idx.data(); - const T* oref_data = oref.data(); - int o_w = oref.size(); - std::vector out(o_w); - T* o_data = out.data(); - tgt(table_data, idx_data, o_data, &attr); - ExpectEQ(o_data, oref_data, o_w); - }; - TestAllImpls( - attr, verifier, table, idx, oref, attr); - } - } - } - } -} - -template -void TestKernelMatMul() { - using T = typename KernelTuple::data_type; - VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); - auto last_acc = FLAGS_acc; - // export MKL_CBWR=AVX would make MKL force to use AVX - // export KMP_DETERMINISTIC_REDUCTION=yes would make the result deterministic - FLAGS_acc = 1e-3; - for (int m : {1, 2, 3, 4}) { - for (int n : {1, 2, 3, 4}) { - for (int k : TestSizes()) { - auto ref = jit::GetReferFunc(); - EXPECT_TRUE(ref != nullptr); - std::vector a(m * k), b(k * n), c(m * n); - RandomVec(m * k, a.data()); - RandomVec(k * n, b.data()); - const T* a_data = a.data(); - const T* b_data = b.data(); - T* c_data = c.data(); - const jit::matmul_attr_t attr{m, n, k}; - ref(a_data, b_data, c_data, &attr); - auto verifier = [](const typename KernelTuple::func_type tgt, - 
const std::vector& a, - const std::vector& b, - const std::vector& cref, - const typename KernelTuple::attr_type& attr) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(a.size(), static_cast(attr.m * attr.k)); - EXPECT_EQ(b.size(), static_cast(attr.k * attr.n)); - EXPECT_EQ(cref.size(), static_cast(attr.m * attr.n)); - std::vector c(cref.size()); - const T* a_data = a.data(); - const T* b_data = b.data(); - const T* cref_data = cref.data(); - T* c_data = c.data(); - tgt(a_data, b_data, c_data, &attr); - ExpectEQ(c_data, cref_data, attr.m * attr.n); - }; - TestAllImpls(attr, verifier, a, b, c, attr); - } - } - } - FLAGS_acc = last_acc; -} - -template -void TestKernelSoftmax() { - using T = typename KernelTuple::data_type; - VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); - for (int bs : {1, 2, 10}) { - for (int n : TestSizes()) { - for (int m : {1, 2, 3}) { // remain - if (m > n || n % m != 0) { - continue; - } - auto ref = jit::GetReferFunc(); - EXPECT_TRUE(ref != nullptr); - std::vector x(bs * n), y(bs * n); - RandomVec(bs * n, x.data()); - const T* x_data = x.data(); - T* y_data = y.data(); - - std::vector xinp(x.size()); // inplace test - std::copy(x.begin(), x.end(), xinp.begin()); - ref(x_data, y_data, n, bs, m); - T* xinp_data = xinp.data(); - ref(xinp_data, xinp_data, n, bs, m); - ExpectEQ(xinp_data, y_data, n * bs); - - auto verifier = [](const typename KernelTuple::func_type tgt, - const std::vector& x, - const std::vector& yref, - int n, - int bs, - int m) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(yref.size(), x.size()); - EXPECT_EQ(x.size(), static_cast(n * bs)); - const T* x_data = x.data(); - const T* yref_data = yref.data(); - std::vector ytgt(n * bs); - T* ytgt_data = ytgt.data(); - // test normal - tgt(x_data, ytgt_data, n, bs, m); - ExpectEQ(ytgt_data, yref_data, n * bs); - // test inplace x - std::copy(x.begin(), x.end(), ytgt.begin()); - tgt(ytgt_data, ytgt_data, n, bs, m); - ExpectEQ(ytgt_data, yref_data, n * bs); - }; - TestAllImpls(n, verifier, x, y, n, bs, m); - } - } - } -} - -template -void TestKernelStrideASum() { - using T = typename KernelTuple::data_type; - VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); - for (int d : TestSizes()) { - for (int m : {1, 2, 3}) { // stride - if (m > d || d % m != 0) { - continue; - } - auto ref = jit::GetReferFunc(); - EXPECT_TRUE(ref != nullptr); - std::vector x(d); - RandomVec(d, x.data()); - T ref_res; - ref(x.data(), &ref_res, d, m); - - auto verifier = [](const typename KernelTuple::func_type tgt, - const std::vector& x, - const T ref_res, - const int m) { - EXPECT_TRUE(tgt != nullptr); - T tgt_res; - tgt(x.data(), &tgt_res, x.size(), m); - ExpectEQ(&tgt_res, &ref_res, 1); - }; - TestAllImpls(d, verifier, x, ref_res, m); - } - } -} - -template -void TestKernelStrideScal() { - using T = typename KernelTuple::data_type; - VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); - for (int d : TestSizes()) { - for (int m : {1, 2, 3}) { // stride - if (m > d || d % m != 0) { - continue; - } - auto ref = jit::GetReferFunc(); - EXPECT_TRUE(ref != nullptr); - - const T a = static_cast(3); - std::vector x(d), yref(d); - std::vector xinp(d); // inplace test - RandomVec(d, x.data()); - std::copy(x.begin(), x.end(), xinp.begin()); - - const T* x_data = x.data(); - T* yref_data = yref.data(); - T* xinp_data = xinp.data(); - // test refer code inplace - ref(&a, x_data, yref_data, d, m); - ref(&a, xinp_data, xinp_data, d, m); - ExpectEQ(xinp_data, yref_data, d); - - 
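// (Editorial illustration of the refer StrideScal behaviour tested here:
// with d = 4, stride m = 2 and a = 3, only indices 0 and 2 are scaled,
// so {1, 1, 1, 1} becomes {3, 1, 3, 1}; the verifier below then checks
// every candidate implementation against this, out of place and in place.)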
auto verifier = [](const typename KernelTuple::func_type tgt, - const T a, - const std::vector& x, - const std::vector& yref, - const int m) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(yref.size(), x.size()); - const T* x_data = x.data(); - const T* yref_data = yref.data(); - const int d = yref.size(); - std::vector ytgt(d); - T* ytgt_data = ytgt.data(); - // test normal - tgt(&a, x_data, ytgt_data, d, m); - ExpectEQ(ytgt_data, yref_data, d); - // test inplace x - std::copy(x.begin(), x.end(), ytgt.begin()); - tgt(&a, ytgt_data, ytgt_data, d, m); - ExpectEQ(ytgt_data, yref_data, d); - }; - TestAllImpls(d, verifier, a, x, yref, m); - } - } -} - -template -void TestKernelSgd() { - using T = typename KernelTuple::data_type; - VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); - const T lr = 0.1; - auto UnDuplicatedRandomVec = []( - int n, const int64_t lower, const int64_t upper) -> std::vector { - PADDLE_ENFORCE_LE(static_cast(upper - lower), n - 1); - PADDLE_ENFORCE_GT(n, 0); - std::vector all, out; - for (int i = 0; i < n; ++i) { - all.push_back(i); - } - std::random_shuffle(all.begin(), all.end()); - out.insert(out.begin(), all.begin(), all.begin() + n); - return out; - }; - for (int param_h : {1, 10}) { - for (int grad_w : TestSizes()) { - std::vector param(param_h * grad_w); - std::vector param_out(param_h * grad_w); - RandomVec(param_h * grad_w, param.data()); - const T* param_data = param.data(); - T* out_data = param_out.data(); - for (int rows_size = 1; rows_size <= param_h; ++rows_size) { - std::vector grad(rows_size * grad_w); - std::vector rows = - UnDuplicatedRandomVec(rows_size, 0, rows_size - 1); - RandomVec(rows_size * grad_w, grad.data()); - const int64_t* rows_data = rows.data(); - const T* grad_data = grad.data(); - auto ref = jit::GetReferFunc(); - EXPECT_TRUE(ref != nullptr); - jit::sgd_attr_t attr(param_h, grad_w, rows_size, grad_w, rows_size); - ref(&lr, param_data, grad_data, rows_data, out_data, &attr); - - // inplace test - std::vector inp(param.size()); - std::copy(param.begin(), param.end(), inp.begin()); - T* inp_data = inp.data(); - ref(&lr, inp_data, grad_data, rows_data, inp_data, &attr); - // only the selected rows should be equal - for (int i = 0; i < rows_size; ++i) { - ExpectEQ( - inp_data + rows[i] * grad_w, out_data + rows[i] * grad_w, grad_w); - } - - auto verifier = [](const typename KernelTuple::func_type tgt, - const T lr, - const std::vector& param, - const std::vector& grad, - const std::vector& rows, - const std::vector& oref, - const typename KernelTuple::attr_type& attr) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(param.size(), - static_cast(attr.param_height * attr.param_width)); - EXPECT_EQ(grad.size(), - static_cast(attr.grad_height * attr.grad_width)); - EXPECT_EQ(rows.size(), static_cast(attr.selected_rows_size)); - EXPECT_EQ(param.size(), oref.size()); - const T* param_data = param.data(); - const T* grad_data = grad.data(); - const int64_t* rows_data = rows.data(); - const T* oref_data = oref.data(); - - std::vector out(oref.size()); - T* o_data = out.data(); - tgt(&lr, param_data, grad_data, rows_data, o_data, &attr); - // only the selected rows should be equal - for (size_t i = 0; i < rows.size(); ++i) { - ExpectEQ(o_data + rows[i] * attr.grad_width, - oref_data + rows[i] * attr.grad_width, - attr.grad_width); - } - - // inplace - std::copy(param.begin(), param.end(), out.begin()); - tgt(&lr, o_data, grad_data, rows_data, o_data, &attr); - for (size_t i = 0; i < rows.size(); ++i) { - ExpectEQ(o_data + rows[i] 
* attr.grad_width, - oref_data + rows[i] * attr.grad_width, - attr.grad_width); - } - }; - TestAllImpls( - attr, verifier, lr, param, grad, rows, param_out, attr); - } - } - } -} - -template -void TestKernelVBroadcast() { - using T = typename KernelTuple::data_type; - VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); - for (int w : TestSizes()) { - std::vector x(w); - RandomVec(w, x.data()); - const T* x_data = x.data(); - for (int64_t h : {1, 2, 6}) { - auto ref = jit::GetReferFunc(); - EXPECT_TRUE(ref != nullptr); - std::vector y(w * h); - T* y_data = y.data(); - ref(x_data, y_data, h, w); - - auto verifier = [](const typename KernelTuple::func_type tgt, - const std::vector& x, - const std::vector& yref, - const int64_t& h, - const typename KernelTuple::attr_type& attr) { - EXPECT_TRUE(tgt != nullptr); - EXPECT_EQ(x.size(), static_cast(attr)); - EXPECT_EQ(yref.size(), x.size() * h); - std::vector y(yref.size()); - const T* x_data = x.data(); - const T* yref_data = yref.data(); - T* y_data = y.data(); - tgt(x_data, y_data, h, attr); - ExpectEQ(y_data, yref_data, yref.size()); - }; - TestAllImpls( - static_cast(w), verifier, x, y, h, static_cast(w)); - } - } -} - -// test pool -TEST(JITKernel_pool, jitcreator) { - const auto& jitcreators = jit::JitCodeCreatorPool::Instance().AllCreators(); -#if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__) - EXPECT_EQ(jitcreators.size(), 0UL); -#else - EXPECT_EQ(jitcreators.size(), 25UL); -#endif -} - -TEST(JITKernel_pool, jitpool) { - // jitpool is related with attr - const auto& kers = jit::JitCodePool().Instance().AllKernels(); - EXPECT_EQ(kers.size(), 0UL); - jit::GetAllCandidateKernels, CPUPlace>(3); -// after call GetAllCandidateKernels, it will create jitcode Automatically -#if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__) - EXPECT_EQ(kers.size(), 0UL); -#else - EXPECT_EQ(kers.size(), 1UL); -#endif -} - -TEST(JITKernel_pool, more) { - const auto& kers = jit::KernelPool::Instance().AllKernels(); - size_t target_num = 8; - -#ifdef __AVX__ - target_num += 2; -#endif - -#ifdef PADDLE_WITH_MKLML - target_num += 12; -#endif - - EXPECT_EQ(kers.size(), target_num); -} - -TEST(JITKernel_pool, refer) { - const auto& kers = jit::ReferKernelPool::Instance().AllKernels(); - EXPECT_EQ(kers.size(), 31UL); -} - -// test helper -TEST(JITKernel_helper, GetAllCandidateKernels) { - auto fp_kers = - jit::GetAllCandidateKernels, CPUPlace>(10); -#if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__) - EXPECT_GE(fp_kers.size(), 1UL); // refer -#else -#ifdef PADDLE_WITH_MKLML - EXPECT_GE(fp_kers.size(), 3UL); // jitcode, mkl, refer -#else - EXPECT_GE(fp_kers.size(), 2UL); // jitcode, refer -#endif -#endif - - auto db_kers = - jit::GetAllCandidateKernels, CPUPlace>(10); -#if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__) - EXPECT_GE(db_kers.size(), 1UL); // refer -#else -#ifdef PADDLE_WITH_MKLML - EXPECT_GE(db_kers.size(), 2UL); // mkl, refer -#else - EXPECT_GE(db_kers.size(), 1UL); // refer -#endif -#endif -} - -TEST(JITKernel_helper, GetAllCandidateFuncsWithTypes) { - auto fp_kers = - jit::GetAllCandidateFuncsWithTypes, CPUPlace>(10); -#if defined(__APPLE__) || defined(__OSX__) - EXPECT_GE(fp_kers.size(), 1UL); // refer -#else -#if !defined(PADDLE_WITH_MKLML) || defined(_WIN32) - EXPECT_GE(fp_kers.size(), 2UL); // jitcode/mkl, refer -#else - EXPECT_GE(fp_kers.size(), 3UL); // jitcode, mkl, refer -#endif -#endif - - auto db_kers = - jit::GetAllCandidateFuncsWithTypes, CPUPlace>(10); -#if defined(__APPLE__) 
|| defined(__OSX__) || !defined(PADDLE_WITH_MKLML) - EXPECT_GE(db_kers.size(), 1UL); // refer -#else - EXPECT_GE(db_kers.size(), 2UL); // mkl, refer -#endif -} - -TEST(JITKernel_helper, KernelFuncs) { - auto f1 = jit::KernelFuncs, CPUPlace>::Cache().At(3); - auto f2 = jit::KernelFuncs, CPUPlace>::Cache()[3]; - EXPECT_TRUE(f1 != nullptr); - EXPECT_TRUE(f1 == f2); - - auto f3 = jit::KernelFuncs, CPUPlace>::Cache()[5]; -#if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__) - EXPECT_TRUE(f2 == f3); -#else - EXPECT_TRUE(f2 != f3); -#endif -} - -TEST(JITKernel_helper, GetAllCandidateFuncs) { - auto funcs = jit::GetAllCandidateFuncs, CPUPlace>(10); - auto kers = jit::GetAllCandidateKernels, CPUPlace>(10); - EXPECT_EQ(funcs.size(), kers.size()); - - std::vector x(10), tgt(10); - RandomVec(10, x.data()); - auto best = jit::GetDefaultBestFunc, CPUPlace>(10); - best(x.data(), tgt.data(), 10); - for (auto f : funcs) { - std::vector y(10); - f(x.data(), y.data(), 10); - ExpectEQ(y.data(), tgt.data(), 10); - } -} - -TEST(JITKernel_helper, pack_weights) { - const int N = 8 * 60, K = 2; - float src[K][N], yref[K][N], y[K * N]; - float* x = &(src[0][0]); - float* ref = &(yref[0][0]); - for (int i = 0; i < N * K; ++i) { - *(x + i) = static_cast(i); - } - int block = 0; - std::vector groups; - if (paddle::lite::x86::MayIUse(paddle::lite::x86::avx512f)) { - block = ZMM_FLOAT_BLOCK; - groups.push_back(30); - } else { - block = YMM_FLOAT_BLOCK; - groups.insert(groups.end(), {14, 14, 14, 14, 4}); - } - - int offset = 0; - int acc = 0; - for (int g : groups) { - g = g * block; - for (int k = 0; k < K; ++k) { - for (int i = 0; i < g; ++i) { - *(ref + offset) = src[k][i + acc]; - offset++; - } - } - acc += g; - } - - jit::pack_weights(x, y, N, K); - ExpectEQ(y, ref, N * K); -} - -TEST(JITKernel_helper, attr) { - std::ostringstream out; - // KernelTypes - out << jit::to_string(jit::kNone) << jit::to_string(jit::kCRFDecoding) - << jit::to_string(jit::kEmbSeqPool) << jit::to_string(jit::kGRUH1) - << jit::to_string(jit::kGRUHtPart1) << jit::to_string(jit::kGRUHtPart2) - << jit::to_string(jit::kHSum) << jit::to_string(jit::kHMax) - << jit::to_string(jit::kLSTMCtHt) << jit::to_string(jit::kLSTMC1H1) - << jit::to_string(jit::kLayerNorm) << jit::to_string(jit::kMatMul) - << jit::to_string(jit::kNCHW16CMulNC) << jit::to_string(jit::kSeqPool) - << jit::to_string(jit::kSoftmax) << jit::to_string(jit::kVAdd) - << jit::to_string(jit::kVAddBias) << jit::to_string(jit::kVAddRelu) - << jit::to_string(jit::kVBroadcast) << jit::to_string(jit::kVCopy) - << jit::to_string(jit::kVExp) << jit::to_string(jit::kVIdentity) - << jit::to_string(jit::kVMul) << jit::to_string(jit::kVRelu) - << jit::to_string(jit::kVScal) << jit::to_string(jit::kSgd) - << jit::to_string(jit::kVSigmoid) << jit::to_string(jit::kVSquare) - << jit::to_string(jit::kVSub) << jit::to_string(jit::kVTanh); - EXPECT_EQ(out.str().size(), 234); - - // SeqPoolTypes - out.str(""); - out << jit::to_string(jit::kSum) << jit::to_string(jit::kAvg) - << jit::to_string(jit::kSqrt); - EXPECT_EQ(out.str().size(), 13); - - EXPECT_EQ(jit::to_kerneltype("relu"), jit::kVRelu); - EXPECT_EQ(jit::to_kerneltype("Identity"), jit::kVIdentity); - EXPECT_EQ(jit::to_kerneltype("VEXP"), jit::kVExp); - EXPECT_EQ(jit::to_kerneltype("SigmoiD"), jit::kVSigmoid); - EXPECT_EQ(jit::to_kerneltype("VTanh"), jit::kVTanh); - - out.str(""); - out << jit::lstm_attr_t(8, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh); - EXPECT_EQ(out.str().size(), 89); - - out.str(""); - out << jit::gru_attr_t(8, 
jit::kVIdentity, jit::kVSigmoid); - EXPECT_EQ(out.str().size(), 52); - - out.str(""); - out << jit::seq_pool_attr_t(8, jit::SeqPoolType::kSum); - EXPECT_EQ(out.str().size(), 44); - - out.str(""); - out << jit::emb_seq_pool_attr_t(1, 2, 3, 4, 5, jit::SeqPoolType::kAvg); - EXPECT_EQ(out.str().size(), 93); - - out.str(""); - out << jit::sgd_attr_t(1, 2, 3, 4, 5); - EXPECT_EQ(out.str().size(), 81); - - out.str(""); - out << jit::matmul_attr_t(1, 2, 3); - EXPECT_EQ(out.str().size(), 14); -} - -// test keys -TEST(JITKernel_key, int) { - EXPECT_TRUE(jit::JitCodeKey(2) == jit::JitCodeKey(2)); - EXPECT_TRUE(jit::JitCodeKey(2) == jit::JitCodeKey(2)); - EXPECT_TRUE(jit::JitCodeKey(2) != jit::JitCodeKey(3)); -} - -TEST(JITKernel_key, gru) { - jit::gru_attr_t attr1(8, jit::kVSigmoid, jit::kVTanh); - jit::gru_attr_t attr2(8, jit::kVSigmoid, jit::kVTanh); - jit::gru_attr_t attr3(9, jit::kVSigmoid, jit::kVTanh); - jit::gru_attr_t attr4(9, jit::kVSigmoid, jit::kVIdentity); - jit::gru_attr_t attr5(9, jit::kVTanh, jit::kVIdentity); - - auto key1 = jit::JitCodeKey(attr1); - auto key2 = jit::JitCodeKey(attr2); - auto key3 = jit::JitCodeKey(attr3); - auto key4 = jit::JitCodeKey(attr4); - auto key5 = jit::JitCodeKey(attr5); - - EXPECT_TRUE(key1 == key2); - EXPECT_TRUE(key2 != key3); - EXPECT_TRUE(key2 != key4); - EXPECT_TRUE(key2 != key5); - EXPECT_TRUE(key3 != key4); - EXPECT_TRUE(key3 != key5); - EXPECT_TRUE(key4 != key5); -} - -TEST(JITKernel_key, lstm) { - jit::lstm_attr_t attr1(8, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh); - jit::lstm_attr_t attr2(8, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh); - jit::lstm_attr_t attr3(9, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh); - jit::lstm_attr_t attr4(9, jit::kVRelu, jit::kVSigmoid, jit::kVTanh); - jit::lstm_attr_t attr5(9, jit::kVRelu, jit::kVSigmoid, jit::kVTanh, true); - jit::lstm_attr_t attr6(9, jit::kVRelu, jit::kVSigmoid, jit::kVTanh, true); - - auto key1 = jit::JitCodeKey(attr1); - auto key2 = jit::JitCodeKey(attr2); - auto key3 = jit::JitCodeKey(attr3); - auto key4 = jit::JitCodeKey(attr4); - auto key5 = jit::JitCodeKey(attr5); - auto key6 = jit::JitCodeKey(attr6); - - EXPECT_TRUE(key1 == key2); - EXPECT_TRUE(key2 != key3); - EXPECT_TRUE(key2 != key4); - EXPECT_TRUE(key2 != key5); - EXPECT_TRUE(key3 != key4); - EXPECT_TRUE(key3 != key5); - EXPECT_TRUE(key4 != key5); - EXPECT_TRUE(key5 == key6); -} - -TEST(JITKernel_key, seq_pool) { - jit::seq_pool_attr_t attr1(2, jit::SeqPoolType::kSum, 1); - jit::seq_pool_attr_t attr2(2, jit::SeqPoolType::kSum, 3); - jit::seq_pool_attr_t attr3(3, jit::SeqPoolType::kSum, 3); - jit::seq_pool_attr_t attr4(3, jit::SeqPoolType::kAvg, 3); - - auto key1 = jit::JitCodeKey(attr1); - auto key2 = jit::JitCodeKey(attr2); - auto key3 = jit::JitCodeKey(attr3); - auto key4 = jit::JitCodeKey(attr4); - - EXPECT_TRUE(key1 == key2); - EXPECT_TRUE(key2 != key3); - EXPECT_TRUE(key2 != key4); - EXPECT_TRUE(key3 != key4); -} - -TEST(JITKernel_key, matmul) { - jit::matmul_attr_t attr1(1, 2, 3); - jit::matmul_attr_t attr2(1, 2, 3); - jit::matmul_attr_t attr3(1, 3, 3); - jit::matmul_attr_t attr4(2, 3, 4); - - auto key1 = jit::JitCodeKey(attr1); - auto key2 = jit::JitCodeKey(attr2); - auto key3 = jit::JitCodeKey(attr3); - auto key4 = jit::JitCodeKey(attr4); - - EXPECT_TRUE(key1 == key2); - EXPECT_TRUE(key2 != key3); - EXPECT_TRUE(key2 != key4); - EXPECT_TRUE(key3 != key4); -} - -TEST(JITKernel_key, emb_seq_pool) { - jit::emb_seq_pool_attr_t attr1(1, 2, 3, 4, 5, jit::SeqPoolType::kSum); - jit::emb_seq_pool_attr_t attr2(1, 2, 3, 4, 5, 
jit::SeqPoolType::kSum); - jit::emb_seq_pool_attr_t attr3(10, 2, 9, 8, 7, jit::SeqPoolType::kAvg); - jit::emb_seq_pool_attr_t attr4(10, 3, 9, 8, 7, jit::SeqPoolType::kSum); - jit::emb_seq_pool_attr_t attr5(1, 6, 3, 4, 5, jit::SeqPoolType::kSum); - - auto key1 = jit::JitCodeKey(attr1); - auto key2 = jit::JitCodeKey(attr2); - auto key3 = jit::JitCodeKey(attr3); - auto key4 = jit::JitCodeKey(attr4); - auto key5 = jit::JitCodeKey(attr5); - - EXPECT_TRUE(key1 == key2); - EXPECT_TRUE(key2 == key3); - EXPECT_TRUE(key2 != key4); - EXPECT_TRUE(key2 != key5); - EXPECT_TRUE(key4 != key5); -} - -TEST(JITKernel_key, sgd) { - jit::sgd_attr_t attr1(1, 2, 3, 4, 5); - jit::sgd_attr_t attr2(1, 2, 3, 4, 5); - jit::sgd_attr_t attr3(9, 8, 7, 4, 6); - jit::sgd_attr_t attr4(1, 2, 3, 6, 5); - jit::sgd_attr_t attr5(10, 9, 8, 7, 6); - - auto key1 = jit::JitCodeKey(attr1); - auto key2 = jit::JitCodeKey(attr2); - auto key3 = jit::JitCodeKey(attr3); - auto key4 = jit::JitCodeKey(attr4); - auto key5 = jit::JitCodeKey(attr5); - - EXPECT_TRUE(key1 == key2); - EXPECT_TRUE(key2 == key3); - EXPECT_TRUE(key3 != key4); - EXPECT_TRUE(key3 != key5); - EXPECT_TRUE(key4 != key5); -} - -// test kernerls -#define TestKernelVMul TestKernelXYZN -#define TestKernelVAdd TestKernelXYZN -#define TestKernelVAddRelu TestKernelXYZN -#define TestKernelVSub TestKernelXYZN - -#define TestKernelVScal TestKernelAXYN -#define TestKernelVAddBias TestKernelAXYN - -#define TestKernelVRelu TestKernelXYN -#define TestKernelVIdentity TestKernelXYN -#define TestKernelVSquare TestKernelXYN -#define TestKernelVExp TestKernelXYN -#define TestKernelVSigmoid TestKernelXYN -#define TestKernelVTanh TestKernelXYN -#define TestKernelVCopy TestKernelXYN - -#define TestKernelHMax TestKernelXRN -#define TestKernelHSum TestKernelXRN - -#define TestKernelLSTMCtHt TestKernelLSTM -#define TestKernelLSTMC1H1 TestKernelLSTM - -#define TestKernelGRUH1 TestKernelGRU -#define TestKernelGRUHtPart1 TestKernelGRU -#define TestKernelGRUHtPart2 TestKernelGRU - -#define TEST_CPU_KERNEL(kernel_type) \ - TEST(JITKernel, kernel_type) { \ - TestKernel##kernel_type, CPUPlace>(); \ - TestKernel##kernel_type, CPUPlace>(); \ - } - -TEST_CPU_KERNEL(VMul); -TEST_CPU_KERNEL(VAdd); -TEST_CPU_KERNEL(VAddRelu); -TEST_CPU_KERNEL(VSub); - -TEST_CPU_KERNEL(VScal); -TEST_CPU_KERNEL(VAddBias); - -TEST_CPU_KERNEL(VRelu); -TEST_CPU_KERNEL(VIdentity); -TEST_CPU_KERNEL(VSquare); -TEST_CPU_KERNEL(VExp); -TEST_CPU_KERNEL(VSigmoid); -TEST_CPU_KERNEL(VTanh); -TEST_CPU_KERNEL(VCopy); - -TEST_CPU_KERNEL(HMax); -TEST_CPU_KERNEL(HSum); - -TEST_CPU_KERNEL(LSTMCtHt); -TEST_CPU_KERNEL(LSTMC1H1); - -TEST_CPU_KERNEL(GRUH1); -TEST_CPU_KERNEL(GRUHtPart1); -TEST_CPU_KERNEL(GRUHtPart2); - -TEST_CPU_KERNEL(NCHW16CMulNC); -TEST_CPU_KERNEL(LayerNorm); -TEST_CPU_KERNEL(CRFDecoding); - -TEST_CPU_KERNEL(SeqPool); -TEST_CPU_KERNEL(EmbSeqPool); -TEST_CPU_KERNEL(MatMul); -TEST_CPU_KERNEL(Softmax); -TEST_CPU_KERNEL(Sgd); -TEST_CPU_KERNEL(VBroadcast); - -TEST_CPU_KERNEL(StrideASum); -TEST_CPU_KERNEL(StrideScal); diff --git a/lite/backends/x86/legacy_place.h b/lite/backends/x86/legacy_place.h deleted file mode 100644 index 8f96bbd7da..0000000000 --- a/lite/backends/x86/legacy_place.h +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -namespace paddle { -namespace lite { -namespace fluid { - -// Fake the legacy Place. -struct Place { - int which() const { return 1; } // fake -}; - -struct CPUPlace : Place {}; - -} // namespace fluid -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/CMakeLists.txt b/lite/backends/x86/math/CMakeLists.txt deleted file mode 100644 index 5f440947fe..0000000000 --- a/lite/backends/x86/math/CMakeLists.txt +++ /dev/null @@ -1,62 +0,0 @@ -add_subdirectory(detail) - -function(math_library TARGET) - # math_library is a function to create math library. - # The interface is the same as lite_cc_library. - # But it handle split GPU/CPU code and link some common library. - set(cc_srcs) - set(hip_srcs) - set(math_common_deps context framework_proto) - set(multiValueArgs DEPS) - cmake_parse_arguments(math_library "${options}" "${oneValueArgs}" - "${multiValueArgs}" ${ARGN}) - - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc) - list(APPEND cc_srcs ${TARGET}.cc) - endif() - - list(LENGTH cc_srcs cc_srcs_len) - lite_cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${math_library_DEPS} ${math_common_deps} eigen3 dynload_mklml) -endfunction() - -# please add new math_library in alphabetical order -math_library(concat_and_split) -math_library(context_project DEPS im2col math_function) -math_library(cross_entropy) -math_library(cos_sim_functor) -## math_library(depthwise_conv DEPS cub) -math_library(im2col) -math_library(sample_prob) -math_library(sampler) - -math_library(gru_compute DEPS activation_functions math_function) -## math_library(lstm_compute DEPS activation_functions) - -lite_cc_library(blas SRCS blas.cc DEPS cblas framework_proto eigen3) -math_library(math_function DEPS blas) -math_library(maxouting) -math_library(pooling) -# math_library(selected_rows_functor DEPS selected_rows math_function blas) -math_library(sequence2batch) -math_library(sequence_padding) -math_library(sequence_pooling DEPS math_function jit_kernel_helper) -math_library(sequence_scale) -math_library(softmax DEPS math_function jit_kernel_helper) -math_library(beam_search DEPS math_function) -# -## math_library(matrix_bit_code) -# -math_library(unpooling) -math_library(vol2col) -## math_library(prelu) -math_library(tree2col DEPS math_function) - -# cc_test(math_function_test SRCS math_function_test.cc DEPS math_function) -# cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor) -# cc_test(im2col_test SRCS im2col_test.cc DEPS im2col) -# cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col) -# cc_test(sequence_padding_test SRCS sequence_padding_test.cc DEPS sequence_padding) -# cc_test(sequence_pooling_test SRCS sequence_pooling_test.cc DEPS sequence_pooling) -# cc_test(beam_search_test SRCS beam_search_test.cc DEPS beam_search) -# cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split) -# cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info) diff --git a/lite/backends/x86/math/beam_search.cc b/lite/backends/x86/math/beam_search.cc deleted file mode 100644 index 93726afcc2..0000000000 --- 
a/lite/backends/x86/math/beam_search.cc +++ /dev/null @@ -1,322 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "lite/backends/x86/math/beam_search.h" -#include -#include -#include "lite/fluid/lod.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -template -class BeamSearchFunctor { - public: - void operator()(const lite::X86Context &context, - const lite::Tensor *pre_ids, - const lite::Tensor *pre_scores, - const lite::Tensor *ids, - const lite::Tensor *scores, - lite::Tensor *selected_ids, - lite::Tensor *selected_scores, - lite::Tensor *parent_idx, - size_t level, - size_t beam_size, - int end_id, - bool is_accumulated) { - auto abs_lod = lite::fluid::ToAbsOffset(scores->lod()); - auto &high_level = abs_lod[level]; - - auto items = SelectTopBeamSizeItems(pre_ids, - pre_scores, - ids, - scores, - level, - beam_size, - end_id, - is_accumulated); - auto selected_items = ToMap(items, high_level.back()); - if (FLAGS_v == 3) { - VLOG(3) << "selected_items:"; - for (size_t i = 0; i < selected_items.size(); ++i) { - VLOG(3) << "offset: " << i; - for (auto &item : selected_items[i]) { - VLOG(3) << item.ToString(); - } - } - } - - PruneEndBeams(pre_ids, abs_lod, &selected_items, level, end_id); - // calculate the output tensor's height - size_t num_instances = std::accumulate( - std::begin(selected_items), - std::end(selected_items), - 0, - [](size_t a, std::vector &b) { return a + b.size(); }); - // the output tensor shape should be [num_instances, 1] - // auto dims = framework::make_ddim( - // std::vector({static_cast(num_instances), 1})); - lite::DDim dims(std::vector({num_instances, 1L})); - - selected_ids->Resize(dims); - auto *selected_ids_data = selected_ids->mutable_data(TARGET(kX86)); - - selected_scores->Resize(dims); - auto *selected_scores_data = - selected_scores->mutable_data(TARGET(kX86)); - - // auto *selected_ids_data = - // selected_ids->mutable_data(dims, platform::CPUPlace()); - // auto *selected_scores_data = - // selected_scores->mutable_data(dims, platform::CPUPlace()); - parent_idx->Resize({static_cast(num_instances)}); - auto *parent_idx_data = - parent_idx ? parent_idx->mutable_data(TARGET(kX86)) : nullptr; - // auto *parent_idx_data = - // parent_idx - // ? 
parent_idx->mutable_data(
-    //               {static_cast(num_instances)}, platform::CPUPlace())
-    //         : nullptr;
-
-    // fill in data
-    std::vector<size_t> low_level;
-    size_t low_offset = 0;
-    for (auto &items : selected_items) {
-      low_level.push_back(low_offset);
-      for (auto &item : items) {
-        if (parent_idx) {
-          parent_idx_data[low_offset] = static_cast<int>(low_level.size() - 1);
-        }
-        selected_ids_data[low_offset] = item.id;
-        selected_scores_data[low_offset] = item.score;
-        low_offset++;
-      }
-    }
-    low_level.push_back(low_offset);
-
-    // fill lod
-    lite::LoD lod(2);
-    lod[0].assign(high_level.begin(), high_level.end());
-    lod[1].assign(low_level.begin(), low_level.end());
-    // if (!lite::fluid::CheckLoD(lod)) {
-    //   // PADDLE_THROW("lod %s is not right", framework::LoDToString(lod));
-    // }
-    selected_ids->set_lod(lod);
-    selected_scores->set_lod(lod);
-  }
-
-  /*
-   * The basic items used for sorting.
-   */
-  struct Item {
-    Item() {}
-    Item(size_t offset, size_t id, float score)
-        : offset(offset), id(id), score(score) {}
-    // offset in the higher lod level.
-    size_t offset;
-    // prefix id in the lower lod level.
-    // size_t prefix;
-    // the candidate id
-    size_t id;
-    // the corresponding score
-    float score;
-
-    inline bool operator<(const Item &in) const {
-      return (score < in.score) ||
-             ((score == in.score) && (offset < in.offset));
-    }
-
-    inline void operator=(const Item &in) {
-      offset = in.offset;
-      id = in.id;
-      score = in.score;
-    }
-
-    std::string ToString() {
-      std::ostringstream os;
-      os << "{";
-      os << "offset: " << offset << ", ";
-      os << "id: " << id << ", ";
-      os << "score: " << score << "";
-      os << "}";
-      return os.str();
-    }
-  };
-
- protected:
-  /*
-   * Prune the source sentences whose branches have all finished; this step is
-   * optional. Pruning must happen one step later than finishing (thus pre_ids
-   * is needed here), since the end tokens must be written out.
-   */
-  void PruneEndBeams(const lite::Tensor *pre_ids,
-                     const lite::LoD &abs_lod,
-                     std::vector<std::vector<Item>> *items,
-                     size_t lod_level,
-                     int end_id) {
-    auto *pre_ids_data = pre_ids->data<int64_t>();
-    auto &high_level = abs_lod[lod_level];
-    for (size_t src_idx = 0; src_idx < high_level.size() - 1; ++src_idx) {
-      size_t src_prefix_start = high_level[src_idx];
-      size_t src_prefix_end = high_level[src_idx + 1];
-      bool finish_flag = true;
-      for (size_t offset = src_prefix_start; offset < src_prefix_end;
-           offset++) {
-        for (auto &item : items->at(offset)) {
-          if (item.id != static_cast<size_t>(end_id) ||
-              pre_ids_data[offset] != end_id) {
-            finish_flag = false;
-            break;
-          }
-        }
-        if (!finish_flag) break;
-      }
-      if (finish_flag) {  // all branches of the beam (source sentence) end;
-                          // prune this beam
-        for (size_t offset = src_prefix_start; offset < src_prefix_end;
-             offset++)
-          items->at(offset).clear();
-      }
-    }
-  }
-
-  /*
-   * Transform the items into a map whose key is the offset and whose value is
-   * the items. NOTE low performance.
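 * (Editorial gloss, not original text: the cost noted above comes from
 * copying every Item into a per-offset bucket; operator() then walks the
 * buckets in offset order so each prefix's selected candidates stay
 * grouped together.)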
-   */
-  std::vector<std::vector<Item>> ToMap(
-      const std::vector<std::vector<Item>> &items, size_t element_num) {
-    std::vector<std::vector<Item>> result;
-    result.resize(element_num);
-    for (auto &entries : items) {
-      for (const auto &item : entries) {
-        result[item.offset].push_back(item);
-      }
-    }
-    return result;
-  }
-
-  void Insert(std::vector<Item> *top_beam_ptr,
-              const Item &item,
-              size_t beam_size) {
-    std::vector<Item> &top_beam = *top_beam_ptr;
-
-    size_t num_beams = top_beam.size();
-    if (num_beams < beam_size) {
-      top_beam.resize(num_beams + 1);
-      num_beams++;
-    } else {
-      if (item < top_beam[beam_size - 1]) {
-        return;
-      }
-    }
-
-    for (int k = static_cast<int>(num_beams) - 2; k >= 0; --k) {
-      if (top_beam[k] < item) {
-        top_beam[k + 1] = top_beam[k];
-      } else {
-        top_beam[k + 1] = item;
-        return;
-      }
-    }
-    top_beam[0] = item;
-  }
-
-  /*
-   * For each source, select the top beam_size records.
-   */
-  std::vector<std::vector<Item>> SelectTopBeamSizeItems(
-      const lite::Tensor *pre_ids,
-      const lite::Tensor *pre_scores,
-      const lite::Tensor *ids,
-      const lite::Tensor *scores,
-      size_t lod_level,
-      size_t beam_size,
-      int end_id,
-      bool is_accumulated) {
-    std::vector<std::vector<Item>> result;
-
-    // find the current candidates
-    auto abs_lod = lite::fluid::ToAbsOffset(scores->lod());
-
-    auto *pre_ids_data = pre_ids->data<int64_t>();
-    auto *pre_scores_data = pre_scores->data<float>();
-
-    auto *ids_data = ids ? ids->data<int64_t>() : nullptr;
-    auto *scores_data = scores->data<float>();
-
-    // size_t num_seqs = scores->NumElements(lod_level);
-    size_t num_seqs = scores->lod()[lod_level].size() - 1;
-    size_t seq_width = 1;
-    for (int i = 1; i < scores->dims().size(); i++) {
-      seq_width *= scores->dims()[i];
-    }
-
-    for (size_t seq_id = 0; seq_id < num_seqs; ++seq_id) {
-      size_t seq_offset_start = abs_lod[lod_level][seq_id];
-      size_t seq_offset_end = abs_lod[lod_level][seq_id + 1];
-
-      std::vector<Item> top_beam;
-      top_beam.reserve(beam_size);
-
-      for (size_t offset = seq_offset_start; offset < seq_offset_end;
-           ++offset) {
-        auto pre_id = pre_ids_data[offset];
-        auto pre_score = pre_scores_data[offset];
-        if (pre_id == end_id) {
-          // Allocate all probability mass to end_id for finished branches;
-          // the other candidate ids can be ignored.
-          Item item(offset, end_id, pre_score);
-          Insert(&top_beam, item, beam_size);
-        } else {
-          size_t index = offset * seq_width;
-          for (size_t d = 0; d < seq_width; d++, index++) {
-            int64_t id = ids_data ? ids_data[index] : static_cast<int64_t>(d);
-            float score = is_accumulated
-                              ? scores_data[index]
-                              : pre_score + std::log(scores_data[index]);
-            Item item(offset, id, score);
-            Insert(&top_beam, item, beam_size);
-          }
-        }
-      }
-
-      result.emplace_back(top_beam);
-    }
-
-    if (FLAGS_v == 3) {
-      VLOG(3) << "SelectTopBeamSizeItems result size " << result.size();
-      for (auto &items : result) {
-        VLOG(3) << "item set:";
-        for (auto &item : items) {
-          VLOG(3) << item.ToString();
-        }
-      }
-    }
-
-    return result;
-  }
-};
-
-template class BeamSearchFunctor<TARGET(kX86), int>;
-template class BeamSearchFunctor<TARGET(kX86), int64_t>;
-template class BeamSearchFunctor<TARGET(kX86), float>;
-template class BeamSearchFunctor<TARGET(kX86), double>;
-
-}  // namespace math
-}  // namespace x86
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/x86/math/beam_search.h b/lite/backends/x86/math/beam_search.h
deleted file mode 100644
index 40998c89f9..0000000000
--- a/lite/backends/x86/math/beam_search.h
+++ /dev/null
@@ -1,125 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include "lite/core/context.h" -#include "lite/core/tensor.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -/* - * This is an implementation of beam search. - * - * To explain the details, lets take machine translation task for example, in - * this task, one source sentence is translated to multiple target sentences, - * during this period, one sentence will be translated to multiple translation - * prefixes(target sentence that have not ended), in each time step a prefix - * will have some candidates, input the candidate ids and their corresponding - * scores (probabilities), it will sort and select the top beam_size candidates - * for each source sentence, and store the selected candidates's score and their - * corresponding ids to LoDTensors. - * - * A detailed example: - * - * Input - * - * ids: - * - LoD (should have 2 levels) - * - first level: [0, 1, 4] - * - second level: [0, 1, 2, 3, 4] - * - tensor's data: - * [[4, 2, 5] - * [2, 1, 3] - * [3, 5, 2] - * [8, 2, 1]] - * - * scores: - * - LoD same as `ids` - * - tensor's data - * [[0.5, 0.3, 0.2] - * [0.6, 0.3, 0.1] - * [0.9, 0.5, 0.1] - * [0.7, 0.5, 0.1]] - * - * The inputs means that there are 2 source sentences to translate, and the - * first source has 1 prefix, the second source has 2 prefix. - * - * Lets assume beam size is 2, and the beam search's output should be - * - LoD - * - first level: [0, 1, 2] - * - second level: [0, 2, 4] - * - id tensor's data - * [[4, - * 1, - * 3, - * 8]] - * - score tensor's data - * [[0.5, - * 0.3, - * 0.9, - * 0.7]] - * - * TODO all the prune operations should be in the beam search, so it is better - * to split the beam search algorithm into a sequence of smaller operators, and - * the prune operators can be inserted in this sequence. - */ -template -class BeamSearchFunctor { - public: - /* - * The main function of beam search. - * - * @selected_ids: a [None, 1]-shaped tensor with LoD. - * In a machine translation model, it might be the candidate term id sets, - * each set stored as a varience-length sequence. - * The format might be described with a two-level LoD - * - [[0 1], - * [0 1 2]] - * - [[] - * [0 1]] - * the first level of LoD tells that there are two source sentences. The - * second level describes the details of the candidate id set's offsets in - * the source sentences. - * - * @selected_scores: a LoD tensor with the same shape and LoD with - * selected_ids. - * It stores the corresponding scores of candidate ids in selected_ids. - * - * Return false if all the input tensor is empty, in machine translation task - * that means no candidates is provided, and the task will stop running. 
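- *
- * @parent_idx: an optional output. When it is non-null, the implementation
- * additionally records, for every selected candidate, the index of the
- * source prefix (beam) it was selected from within the flattened beams of
- * the previous step; passing nullptr skips this bookkeeping.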
- */ - void operator()(const lite::Context& context, - const lite::Tensor* pre_ids, - const lite::Tensor* pre_scores, - const lite::Tensor* ids, - const lite::Tensor* scores, - lite::Tensor* selected_ids, - lite::Tensor* selected_scores, - lite::Tensor* parent_idx, - size_t level, - size_t beam_size, - int end_id, - bool is_accumulated); -}; - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/beam_search_test.cc b/lite/backends/x86/math/beam_search_test.cc deleted file mode 100644 index 904870207b..0000000000 --- a/lite/backends/x86/math/beam_search_test.cc +++ /dev/null @@ -1,152 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/math/beam_search.h" -#include -#include - -void PrepareCPUTensors(paddle::framework::LoDTensor* ids, - paddle::framework::LoDTensor* scores, - paddle::framework::LoDTensor* pre_ids, - paddle::framework::LoDTensor* pre_scores) { - // lod - paddle::framework::LoD lod; - std::vector level0({0, 2, 4}); - std::vector level1({0, 1, 2, 3, 4}); - lod.push_back(level0); - lod.push_back(level1); - ids->set_lod(lod); - scores->set_lod(lod); - - auto dims = paddle::framework::make_ddim({4, 3}); - ids->Resize(dims); - scores->Resize(dims); - - paddle::platform::CPUPlace place; - auto* ids_data = ids->mutable_data(place); - auto* scores_data = scores->mutable_data(place); - std::vector ids_vec_data({4, 2, 5, 2, 1, 3, 3, 5, 2, 8, 2, 1}); - std::vector scores_vec_data( - {0.6f, 0.3f, 0.5f, 0.2f, 0.3f, 0.1f, 0.9f, 0.5f, 0.1f, 0.7f, 0.5f, 0.1f}); - - CHECK_EQ(static_cast(ids->numel()), ids_vec_data.size()); - CHECK_EQ(static_cast(ids->numel()), scores_vec_data.size()); - - for (int i = 0; i < ids->numel(); i++) { - ids_data[i] = ids_vec_data[i]; - scores_data[i] = scores_vec_data[i]; - } - - // pre_ids - pre_ids->Resize(paddle::framework::make_ddim({4, 1})); - for (int i = 0; i < 4; i++) { - pre_ids->mutable_data(place)[i] = i + 1; - } - - // pre_scores - pre_scores->Resize(paddle::framework::make_ddim({4, 1})); - for (int i = 0; i < 4; i++) { - pre_scores->mutable_data(place)[i] = 0.1 * (i + 1); - } -} - -template -void TestBeamSearch() { - paddle::framework::LoDTensor ids; - paddle::framework::LoDTensor scores; - paddle::framework::LoDTensor pre_ids; - paddle::framework::LoDTensor pre_scores; - - auto* place = new Place(); - DeviceContext* context = new DeviceContext(*place); - if (paddle::platform::is_cpu_place(*place)) { - PrepareCPUTensors(&ids, &scores, &pre_ids, &pre_scores); - } else { - paddle::framework::LoDTensor cpu_ids; - paddle::framework::LoDTensor cpu_scores; - paddle::framework::LoDTensor cpu_pre_ids; - paddle::framework::LoDTensor cpu_pre_scores; - - PrepareCPUTensors(&cpu_ids, &cpu_scores, &cpu_pre_ids, &cpu_pre_scores); - - TensorCopySync(cpu_ids, *place, &ids); - TensorCopySync(cpu_scores, *place, &scores); - TensorCopySync(cpu_pre_ids, *place, &pre_ids); - TensorCopySync(cpu_pre_scores, *place, 
&pre_scores); - - ids.set_lod(cpu_ids.lod()); - scores.set_lod(cpu_scores.lod()); - pre_ids.set_lod(cpu_pre_ids.lod()); - pre_scores.set_lod(cpu_pre_scores.lod()); - } - - paddle::framework::LoDTensor selected_ids; - paddle::framework::LoDTensor selected_scores; - paddle::framework::LoDTensor parent_idx; - - size_t level = 0; - size_t beam_size = 2; - int end_id = 0; - paddle::operators::math::BeamSearchFunctor beamsearch; - beamsearch(*context, - &pre_ids, - &pre_scores, - &ids, - &scores, - &selected_ids, - &selected_scores, - &parent_idx, - level, - beam_size, - end_id, - true); - - ASSERT_EQ(selected_ids.lod(), selected_scores.lod()); - - paddle::framework::LoDTensor cpu_selected_ids; - paddle::framework::LoDTensor cpu_selected_scores; - if (paddle::platform::is_cpu_place(*place)) { - cpu_selected_ids = selected_ids; - cpu_selected_scores = selected_scores; - } else { - TensorCopySync( - selected_ids, paddle::platform::CPUPlace(), &cpu_selected_ids); - TensorCopySync( - selected_scores, paddle::platform::CPUPlace(), &cpu_selected_scores); - cpu_selected_ids.set_lod(selected_ids.lod()); - cpu_selected_scores.set_lod(selected_scores.lod()); - } - - std::vector expected_ids({4, 5, 3, 8}); - std::vector expected_scores({0.6f, 0.5f, 0.9f, 0.7f}); - for (int i = 0; i < 4; i++) { - ASSERT_EQ(expected_ids[i], cpu_selected_ids.data()[i]); - ASSERT_EQ(expected_scores[i], cpu_selected_scores.data()[i]); - } - - delete place; - delete context; -} - -TEST(BeamSearch, CPU) { - TestBeamSearch(); -} - -#ifdef PADDLE_WITH_CUDA -TEST(BeamSearch, GPU) { - TestBeamSearch(); -} -#endif diff --git a/lite/backends/x86/math/blas.cc b/lite/backends/x86/math/blas.cc deleted file mode 100644 index 2d21adaf5d..0000000000 --- a/lite/backends/x86/math/blas.cc +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
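-
-// A worked example of the flattening rule implemented below (the dims value
-// is illustrative only): for a tensor with dims [6, 4, 3, 2],
-//
-//   CreateMatrixDescriptor(dims, 2, false);  // height_ = 6 * 4 = 24,
-//                                            // width_ = 3 * 2 = 6, batch 0
-//   CreateMatrixDescriptor(dims, 0, false);  // batch_size_ = 6 * 4 = 24,
-//                                            // height_ = 3, width_ = 2,
-//                                            // stride_ = 3 * 2 = 6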
-
-#include "lite/backends/x86/math/blas.h"
-
-#include <utility>
-
-namespace paddle {
-namespace lite {
-namespace x86 {
-namespace math {
-MatDescriptor CreateMatrixDescriptor(const lite::DDimLite &tensor_dim,
-                                     int num_flatten_cols,
-                                     bool trans) {
-  PADDLE_ENFORCE_GT(tensor_dim.size(), 1);
-  MatDescriptor retv;
-  if (num_flatten_cols > 1) {
-    auto flatten_dim = tensor_dim.Flatten2D(num_flatten_cols);
-    retv.height_ = flatten_dim[0];
-    retv.width_ = flatten_dim[1];
-  } else {
-    if (tensor_dim.size() == 2) {
-      retv.height_ = tensor_dim[0];
-      retv.width_ = tensor_dim[1];
-    } else {
-      auto dim_vec = tensor_dim.Vectorize();
-      retv.batch_size_ = 1;
-      for (size_t i = 0; i < dim_vec.size() - 2; ++i) {
-        retv.batch_size_ *= dim_vec[i];
-      }
-      retv.height_ = dim_vec[dim_vec.size() - 2];
-      retv.width_ = dim_vec[dim_vec.size() - 1];
-      retv.stride_ = retv.height_ * retv.width_;
-    }
-  }
-  if (trans) {
-    std::swap(retv.width_, retv.height_);
-  }
-  retv.trans_ = trans;
-  return retv;
-}
-
-}  // namespace math
-}  // namespace x86
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/x86/math/blas.h b/lite/backends/x86/math/blas.h
deleted file mode 100644
index c7d5abfce3..0000000000
--- a/lite/backends/x86/math/blas.h
+++ /dev/null
@@ -1,408 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "lite/core/op_lite.h"
-#include "lite/core/tensor.h"
-
-#ifdef PADDLE_WITH_MKLML
-#include "lite/backends/x86/mklml.h"
-#endif
-
-#ifdef PADDLE_WITH_LIBXSMM
-#include <libxsmm.h>
-#endif
-
-#ifdef PADDLE_USE_OPENBLAS
-#include <cblas.h>
-#endif
-
-namespace paddle {
-namespace lite {
-namespace x86 {
-namespace math {
-
-/**
- * Matrix Descriptor of a memory buffer.
- *
- * It is used for Blas::MatMul. The MatMul operator can be batched: if Mat A
- * is [BatchSize, H, W] and Mat B is [BatchSize, H, W], it will be a
- * `batch_size` times of GEMM. The batched GEMM could be faster based on the
- * implementation of the blas library. The batch size could be zero. If any
- * matrix of `matmul` has a batch size, there will be a batched GEMM, too.
- * e.g., if Mat A is [BatchSize, H1, W1] and Mat B is [W1, W2], the result
- * matrix will be [BatchSize, H1, W2].
- *
- * The boolean flag, `trans`, describes whether the memory is the transpose
- * of the matrix or not. If trans is true, the last two dims of the matrix
- * are transposed. The memory layout of the matrix is [Width, Height] or
- * [BatchSize, Width, Height].
- *
- * The MatDescriptor is not only the dimension or shape of a matrix; it also
- * contains the layout and stride of the matrix. It is clearer to have a
- * structure than to reuse `DDim`.
- */
-struct MatDescriptor {
-  int64_t height_;
-  int64_t width_;
-  int64_t stride_{0};
-  int64_t batch_size_{0};
-  bool trans_;
-};
-
-/**
- * Create a Matrix Descriptor from a tensor dim, num_flatten_cols, and
- * transpose flag
- *
- * @param tensor_dim: The dimension of the tensor. The rank of this dimension
- * must be larger than 1.
- * - * @param num_flatten_cols: Reshape a tensor to a matrix. The matrix's first - * dimension(column length) will be the product of tensor's first `num_col_dims` - * dimensions. If num_flatten_cols is zero, the first N-2 dimension will be the - * batch_size of descriptor. - * - * @param trans: True if the matrix is transposed. - */ -extern MatDescriptor CreateMatrixDescriptor(const lite::DDimLite& tensor_dim, - int num_flatten_cols, - bool trans); - -template -class Blas { - public: - explicit Blas(const lite::Context& context) : context_(context) {} - - template - void GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - T alpha, - const T* A, - const T* B, - T beta, - T* C) const; - - template - void GEMM(bool transA, - bool transB, - int M, - int N, - int K, - T alpha, - const T* A, - int lda, - const T* B, - int ldb, - T beta, - T* C, - int ldc) const; - - template - void GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - T alpha, - const T* A, - int lda, - const T* B, - int ldb, - T beta, - T* C, - int ldc) const; - -#ifdef PADDLE_WITH_MKLML - template - T* GEMM_ALLOC(const CBLAS_IDENTIFIER id, - const int M, - const int N, - const int K) const; - - template - void GEMM_PACK(const CBLAS_IDENTIFIER id, - const CBLAS_TRANSPOSE trans, - int M, - int N, - int K, - const T alpha, - const T* src, - const int ld, - T* dst) const; - - template - void GEMM_COMPUTE(int transA, - int transB, - int M, - int N, - int K, - const T* A, - const int lda, - const T* B, - const int ldb, - T beta, - T* C, - const int ldc) const; - - template - void GEMM_FREE(T* data) const; -#endif - - template - void MatMul(const int M, - const int N, - const int K, - const T* A, - const T* B, - T* C) const; - - template - void MatMul(const lite::TensorLite& mat_a, - bool trans_a, - const lite::TensorLite& mat_b, - bool trans_b, - T alpha, - lite::TensorLite* mat_out, - T beta) const; - - template - void MatMul(const lite::TensorLite& mat_a, - bool trans_a, - const lite::TensorLite& mat_b, - bool trans_b, - lite::TensorLite* mat_out) const { - MatMul(mat_a, - trans_a, - mat_b, - trans_b, - static_cast(1.0), - mat_out, - static_cast(0.0)); - } - - template - void MatMul(const lite::TensorLite& mat_a, - const lite::TensorLite& mat_b, - lite::TensorLite* mat_out) const { - this->template MatMul(mat_a, false, mat_b, false, mat_out); - } - - template - void AXPY(int n, T alpha, const T* x, T* y) const; - - template - void VADD(int n, const T* x, const T* y, T* z) const; - - template - void VMUL(int n, const T* x, const T* y, T* z) const; - - template - void VCOPY(int n, const T* x, T* y) const; - - template - void VEXP(int n, const T* x, T* y) const; - - template - void VSQUARE(int n, const T* x, T* y) const; - - template - void VPOW(int n, const T* x, T alpha, T* y) const; - - template - void GEMV(bool trans_a, - int M, - int N, - T alpha, - const T* A, - const T* B, - T beta, - T* C) const; - - template - T DOT(int n, const T* x, const T* y) const; - - template - void SCAL(int n, const T a, T* x) const; - - template - T ASUM(int n, T* x, int inc) const; - - template - void BatchedGEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - T alpha, - const T* A, - const T* B, - T beta, - T* C, - int batchCount, - int64_t strideA, - int64_t strideB) const; - - template - void MatMul(const lite::TensorLite& mat_a, - const MatDescriptor& dim_a, - const lite::TensorLite& mat_b, - const MatDescriptor& dim_b, - T alpha, - lite::TensorLite* 
mat_out, - T beta) const; - - template - void VINV(int n, const T* a, T* y) const; - - template - void VMERF(int n, const T* a, T* y, int64_t mode) const; - - private: - const lite::Context& context_; -}; - -template -class BlasT : private Blas { - public: - using Blas::Blas; - - template - void GEMM(ARGS... args) const { - Base()->template GEMM(args...); - } - -#ifdef PADDLE_WITH_MKLML - template - T* GEMM_ALLOC(ARGS... args) const { - return Base()->template GEMM_ALLOC(args...); - } - - template - void GEMM_PACK(ARGS... args) const { - Base()->template GEMM_PACK(args...); - } - - template - void GEMM_COMPUTE(ARGS... args) const { - Base()->template GEMM_COMPUTE(args...); - } - - template - void GEMM_FREE(ARGS... args) const { - Base()->template GEMM_FREE(args...); - } -#endif - - template - void MatMul(ARGS... args) const { - Base()->template MatMul(args...); - } - - template - void AXPY(ARGS... args) const { - Base()->template AXPY(args...); - } - - template - void VADD(ARGS... args) const { - Base()->template VADD(args...); - } - - template - void VMUL(ARGS... args) const { - Base()->template VMUL(args...); - } - - template - void VCOPY(ARGS... args) const { - Base()->template VCOPY(args...); - } - - template - void VEXP(ARGS... args) const { - Base()->template VEXP(args...); - } - - template - void VSQUARE(ARGS... args) const { - Base()->template VSQUARE(args...); - } - - template - void VPOW(ARGS... args) const { - Base()->template VPOW(args...); - } - - template - void GEMV(ARGS... args) const { - Base()->template GEMV(args...); - } - - template - T DOT(ARGS... args) const { - return Base()->template DOT(args...); - } - - template - void SCAL(ARGS... args) const { - Base()->template SCAL(args...); - } - - template - T ASUM(ARGS... args) const { - return Base()->template ASUM(args...); - } - - template - void BatchedGEMM(ARGS... args) const { - Base()->template BatchedGEMM(args...); - } - - template - void VINV(ARGS... args) const { - Base()->template VINV(args...); - } - - template - void VMERF(ARGS... args) const { - Base()->template VMERF(args...); - } - - private: - const Blas* Base() const { - return static_cast*>(this); - } -}; - -// template -// inline BlasT GetBlas( -// const framework::ExecutionContext& exe_ctx) { -// return BlasT( -// exe_ctx.template device_context()); -//} - -template -inline BlasT GetBlas(const lite::Context& dev_ctx) { - return BlasT(dev_ctx); -} - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle - -#include "lite/backends/x86/math/blas_impl.h" diff --git a/lite/backends/x86/math/blas_impl.h b/lite/backends/x86/math/blas_impl.h deleted file mode 100644 index c4844a4df3..0000000000 --- a/lite/backends/x86/math/blas_impl.h +++ /dev/null @@ -1,812 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
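-
-// A minimal usage sketch for the Blas wrapper declared in blas.h (the
-// context and tensor names here are illustrative only): given an x86
-// context `ctx` and float tensors `a` [M, K], `b` [K, N], `out` [M, N],
-//
-//   auto blas = math::GetBlas<TARGET(kX86), float>(ctx);
-//   blas.MatMul(a, b, &out);  // out = a * b, i.e. alpha = 1 and beta = 0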
-#pragma once -#include -#include -#include -#include "lite/backends/x86/math/math_function.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -template -struct CBlas; - -#ifdef PADDLE_WITH_MKLML -template <> -struct CBlas { - template - static void GEMM(ARGS... args) { - lite::x86::cblas_sgemm(args...); - } - - template - static float *GEMM_ALLOC(ARGS... args) { - return lite::x86::cblas_sgemm_alloc(args...); - } - - template - static void GEMM_PACK(ARGS... args) { - lite::x86::cblas_sgemm_pack(args...); - } - - template - static void GEMM_COMPUTE(ARGS... args) { - lite::x86::cblas_sgemm_compute(args...); - } - - template - static void GEMM_FREE(ARGS... args) { - lite::x86::cblas_sgemm_free(args...); - } - -#ifdef PADDLE_WITH_LIBXSMM - template - static void SMM_GEMM(ARGS... args) { - libxsmm_sgemm(args...); - } -#endif - - template - static void AXPY(ARGS... args) { - lite::x86::cblas_saxpy(args...); - } - - template - static void VCOPY(ARGS... args) { - lite::x86::cblas_scopy(args...); - } - - template - static void GEMV(ARGS... args) { - lite::x86::cblas_sgemv(args...); - } - - template - static float DOT(ARGS... args) { - return lite::x86::cblas_sdot(args...); - } - - template - static void SCAL(ARGS... args) { - lite::x86::cblas_sscal(args...); - } - - template - static float ASUM(ARGS... args) { - return lite::x86::cblas_sasum(args...); - } - - template - static void GEMM_BATCH(ARGS... args) { - lite::x86::cblas_sgemm_batch(args...); - } - - template - static void VADD(ARGS... args) { - lite::x86::vsAdd(args...); - } - - template - static void VMUL(ARGS... args) { - lite::x86::vsMul(args...); - } - - template - static void VEXP(ARGS... args) { - lite::x86::vsExp(args...); - } - - template - static void VSQUARE(ARGS... args) { - lite::x86::vsSqr(args...); - } - - template - static void VPOW(ARGS... args) { - lite::x86::vsPowx(args...); - } - - template - static void VINV(ARGS... args) { - lite::x86::vsInv(args...); - } - - template - static void VMERF(ARGS... args) { - lite::x86::vmsErf(args...); - } -}; - -template <> -struct CBlas { - template - static void GEMM(ARGS... args) { - lite::x86::cblas_dgemm(args...); - } - - template - static double *GEMM_ALLOC(ARGS... args) { - return lite::x86::cblas_dgemm_alloc(args...); - } - - template - static void GEMM_PACK(ARGS... args) { - lite::x86::cblas_dgemm_pack(args...); - } - - template - static void GEMM_COMPUTE(ARGS... args) { - lite::x86::cblas_dgemm_compute(args...); - } - - template - static void GEMM_FREE(ARGS... args) { - lite::x86::cblas_dgemm_free(args...); - } - -#ifdef PADDLE_WITH_LIBXSMM - template - static void SMM_GEMM(ARGS... args) { - libxsmm_dgemm(args...); - } -#endif - - template - static void AXPY(ARGS... args) { - lite::x86::cblas_daxpy(args...); - } - - template - static void VCOPY(ARGS... args) { - lite::x86::cblas_dcopy(args...); - } - - template - static void GEMV(ARGS... args) { - lite::x86::cblas_dgemv(args...); - } - - template - static double DOT(ARGS... args) { - return lite::x86::cblas_ddot(args...); - } - - template - static void SCAL(ARGS... args) { - lite::x86::cblas_dscal(args...); - } - - template - static double ASUM(ARGS... args) { - return lite::x86::cblas_dasum(args...); - } - - template - static void GEMM_BATCH(ARGS... args) { - lite::x86::cblas_dgemm_batch(args...); - } - - template - static void VADD(ARGS... args) { - lite::x86::vdAdd(args...); - } - - template - static void VMUL(ARGS... args) { - lite::x86::vdMul(args...); - } - - template - static void VEXP(ARGS... 
args) { - lite::x86::vdExp(args...); - } - - template - static void VSQUARE(ARGS... args) { - lite::x86::vdSqr(args...); - } - - template - static void VPOW(ARGS... args) { - lite::x86::vdPowx(args...); - } - - template - static void VINV(ARGS... args) { - lite::x86::vdInv(args...); - } - - template - static void VMERF(ARGS... args) { - lite::x86::vmdErf(args...); - } -}; - -#else - -template <> -struct CBlas { - template - static void GEMM(ARGS... args) { - cblas_sgemm(args...); - } - - template - static void AXPY(ARGS... args) { - cblas_saxpy(args...); - } - - template - static void VCOPY(ARGS... args) { - cblas_scopy(args...); - } - - template - static void GEMV(ARGS... args) { - cblas_sgemv(args...); - } -}; - -template <> -struct CBlas { - template - static void GEMM(ARGS... args) { - cblas_dgemm(args...); - } - - template - static void AXPY(ARGS... args) { - cblas_daxpy(args...); - } - - template - static void VCOPY(ARGS... args) { - cblas_dcopy(args...); - } - - template - static void GEMV(ARGS... args) { - cblas_dgemv(args...); - } -}; -#endif - -template <> -struct CBlas { - static void GEMM(...) { PADDLE_THROW("float16 GEMM not supported on CPU"); } - static void SMM_GEMM(...) { - PADDLE_THROW("float16 SMM_GEMM not supported on CPU"); - } - static void VMUL(...) { PADDLE_THROW("float16 VMUL not supported on CPU"); } - static void VEXP(...) { PADDLE_THROW("float16 VEXP not supported on CPU"); } - static void VSQUARE(...) { - PADDLE_THROW("float16 VSQUARE not supported on CPU"); - } - static void VPOW(...) { PADDLE_THROW("float16 VPOW not supported on CPU"); } - static void DOT(...) { PADDLE_THROW("float16 DOT not supported on CPU"); }; - static void SCAL(...) { PADDLE_THROW("float16 SCAL not supported on CPU"); }; - static void ASUM(...) { PADDLE_THROW("float16 ASUM not supported on CPU"); }; -#ifdef PADDLE_WITH_MKLML - static void GEMM_BATCH(...) { - PADDLE_THROW("float16 GEMM_BATCH not supported on CPU"); - } -#endif -}; - -#ifdef PADDLE_WITH_MKLML -template <> -template -T *Blas::GEMM_ALLOC(const CBLAS_IDENTIFIER id, - const int M, - const int N, - const int K) const { - return CBlas::GEMM_ALLOC(id, M, N, K); -} - -template <> -template -void Blas::GEMM_PACK(const CBLAS_IDENTIFIER id, - const CBLAS_TRANSPOSE trans, - int M, - int N, - int K, - const T alpha, - const T *src, - const int ld, - T *dst) const { - CBlas::GEMM_PACK(CblasRowMajor, id, trans, M, N, K, alpha, src, ld, dst); -} - -template <> -template -void Blas::GEMM_COMPUTE(int transA, - int transB, - int M, - int N, - int K, - const T *A, - const int lda, - const T *B, - const int ldb, - T beta, - T *C, - const int ldc) const { - CBlas::GEMM_COMPUTE( - CblasRowMajor, transA, transB, M, N, K, A, lda, B, ldb, beta, C, ldc); -} - -template <> -template -void Blas::GEMM_FREE(T *data) const { - CBlas::GEMM_FREE(data); -} -#endif - -template <> -template -void Blas::GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - T alpha, - const T *A, - const T *B, - T beta, - T *C) const { - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - int ldc = N; - CBlas::GEMM(CblasRowMajor, - transA, - transB, - M, - N, - K, - alpha, - A, - lda, - B, - ldb, - beta, - C, - ldc); -} - -template <> -template -void Blas::GEMM(bool transA, - bool transB, - int M, - int N, - int K, - T alpha, - const T *A, - int lda, - const T *B, - int ldb, - T beta, - T *C, - int ldc) const { - CBlas::GEMM(CblasRowMajor, - transA == false ? CblasNoTrans : CblasTrans, - transB == false ? 
CblasNoTrans : CblasTrans, - M, - N, - K, - alpha, - A, - lda, - B, - ldb, - beta, - C, - ldc); -} - -template <> -template -void Blas::GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - T alpha, - const T *A, - int lda, - const T *B, - int ldb, - T beta, - T *C, - int ldc) const { - CBlas::GEMM(CblasRowMajor, - transA, - transB, - M, - N, - K, - alpha, - A, - lda, - B, - ldb, - beta, - C, - ldc); -} - -template -template -void Blas::MatMul(const lite::Tensor &mat_a, - bool trans_a, - const lite::Tensor &mat_b, - bool trans_b, - T alpha, - lite::Tensor *mat_out, - T beta) const { - auto dim_a = mat_a.dims(); - auto dim_b = mat_b.dims(); - auto dim_out = mat_out->dims(); - PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, - "The input and output of matmul be matrix"); - PADDLE_ENFORCE( - mat_a.target() == mat_b.target() && mat_a.target() == mat_out->target(), - "The targets of matrices must be same"); - - int M = dim_out[0]; - int N = dim_out[1]; - int K = !trans_a ? dim_a[1] : dim_a[0]; - - CBLAS_TRANSPOSE transA = !trans_a ? CblasNoTrans : CblasTrans; - CBLAS_TRANSPOSE transB = !trans_b ? CblasNoTrans : CblasTrans; - - this->GEMM(transA, - transB, - M, - N, - K, - alpha, - mat_a.data(), - mat_b.data(), - beta, - mat_out->mutable_data()); -} - -template <> -template -void Blas::AXPY(int n, - T alpha, - const T *x, - T *y) const { - CBlas::AXPY(n, alpha, x, 1, y, 1); -} - -template <> -template -void Blas::VCOPY(int n, const T *x, T *y) const { - CBlas::VCOPY(n, x, 1, y, 1); -} - -template <> -template -void Blas::VADD(int n, - const T *x, - const T *y, - T *z) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VADD(n, x, y, z); -#else - this->template VCOPY(n, y, z); - this->template AXPY(n, 1., x, z); -#endif -} - -template <> -template -void Blas::VMUL(int n, - const T *x, - const T *y, - T *z) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VMUL(n, x, y, z); -#else - // try to find if openblas support vmul - for (int i = 0; i < n; ++i) { - z[i] = x[i] * y[i]; - } -#endif -} - -template <> -template -void Blas::VEXP(int n, const T *x, T *y) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VEXP(n, x, y); -#else - // try to find if openblas support vexp - for (int i = 0; i < n; ++i) { - y[i] = std::exp(x[i]); - } -#endif -} - -template <> -template -void Blas::VSQUARE(int n, const T *x, T *y) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VSQUARE(n, x, y); -#else - for (int i = 0; i < n; ++i) { - y[i] = x[i] * x[i]; - } -#endif -} - -template <> -template -void Blas::VPOW(int n, const T *x, T a, T *y) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VPOW(n, x, a, y); -#else - for (int i = 0; i < n; ++i) { - y[i] = std::pow(x[i], a); - } -#endif -} - -template <> -template -T Blas::DOT(int n, const T *x, const T *y) const { -#ifdef PADDLE_WITH_MKLML - return CBlas::DOT(n, x, 1, y, 1); -#else - // try to find if openblas support cblas_dot - T sum = 0; - for (int i = 0; i < n; ++i) { - sum += x[i] * y[i]; - } - return sum; -#endif -} - -template <> -template -void Blas::SCAL(int n, const T a, T *x) const { -#ifdef PADDLE_WITH_MKLML - CBlas::SCAL(n, a, x, 1); -#else - // try to find if openblas support cblas_scal - for (int i = 0; i < n; ++i) { - x[i] = a * x[i]; - } -#endif -} - -template <> -template -T Blas::ASUM(int n, T *x, int inc) const { - auto sum = static_cast(0.0); -#ifdef PADDLE_WITH_MKLML - sum = CBlas::ASUM(n, x, inc); -#else - // TODO(jczaja): check if openblas does provide cblas_sasum/cblas_dasum - for (int c = 0; c < n; ++c) { - sum += x[c]; - } 
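-  // NOTE: unlike the cblas call above, this fallback reads x[c] rather than
-  // x[c * inc], i.e. it silently assumes inc == 1.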
-#endif - return sum; -} - -template <> -template -void Blas::GEMV(bool trans_a, - int M, - int N, - T alpha, - const T *A, - const T *B, - T beta, - T *C) const { - CBLAS_TRANSPOSE transA = !trans_a ? CblasNoTrans : CblasTrans; - CBlas::GEMV(CblasRowMajor, transA, M, N, alpha, A, N, B, 1, beta, C, 1); -} - -template <> -template -void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - T alpha, - const T *A, - const T *B, - T beta, - T *C, - int batchCount, - int64_t strideA, - int64_t strideB) const { -#ifdef PADDLE_WITH_MKLML - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - int ldc = N; - auto a_array = std::vector(batchCount); - auto b_array = std::vector(batchCount); - auto c_array = std::vector(batchCount); - for (int k = 0; k < batchCount; ++k) { - a_array[k] = &A[k * strideA]; - b_array[k] = &B[k * strideB]; - c_array[k] = &C[k * M * N]; - } - - CBlas::GEMM_BATCH(CblasRowMajor, - &transA, - &transB, - &M, - &N, - &K, - &alpha, - a_array.data(), - &lda, - b_array.data(), - &ldb, - &beta, - c_array.data(), - &ldc, - 1 /* group_count */, - &batchCount); -#else - for (int k = 0; k < batchCount; ++k) { - auto *Ak = &A[k * strideA]; - auto *Bk = &B[k * strideB]; - auto *Ck = &C[k * M * N]; - this->template GEMM(transA, transB, M, N, K, alpha, Ak, Bk, beta, Ck); - } -#endif -} - -template -template -void Blas::MatMul( - const int M, const int N, const int K, const T *A, const T *B, T *C) const { - this->template GEMM(CblasRowMajor, - CblasNoTrans, - CblasNoTrans, - M, - N, - K, - static_cast(1), - A, - K, - B, - N, - static_cast(0), - C, - N); -} - -template <> -template -void Blas::MatMul( - const int M, const int N, const int K, const T *A, const T *B, T *C) const { -#ifdef PADDLE_WITH_LIBXSMM - // Refer to https://github.com/hfp/libxsmm/blob/master/README.md - // But the threshold is custom constexpr int LIBXSMM_THRESHOLD = 20 * 20 * 20; - - // Since the matrix is very small, - // so the unit of calculation is already very fast, - // and the if( M*N*K < LIBXSMM_THRESHOLD) would be overhead, - // use xsmm directly. - // Note: SMM use ColMajor - const char transa = 'N'; - const char transb = 'N'; - const T alpha = static_cast(1); - const T beta = static_cast(0); - CBlas::SMM_GEMM( - &transa, &transb, &N, &M, &K, &alpha, B, &N, A, &K, &beta, C, &N); - return; -#endif - - CBlas::GEMM(CblasRowMajor, - CblasNoTrans, - CblasNoTrans, - M, - N, - K, - static_cast(1), - A, - K, - B, - N, - static_cast(0), - C, - N); -} - -template -template -void Blas::MatMul(const lite::Tensor &mat_a, - const MatDescriptor &dim_a, - const lite::Tensor &mat_b, - const MatDescriptor &dim_b, - T alpha, - lite::Tensor *mat_out, - T beta) const { - PADDLE_ENFORCE_EQ(dim_a.width_, dim_b.height_); - CBLAS_TRANSPOSE transA = !dim_a.trans_ ? CblasNoTrans : CblasTrans; - CBLAS_TRANSPOSE transB = !dim_b.trans_ ? CblasNoTrans : CblasTrans; - if (dim_a.batch_size_ == 0 && dim_b.batch_size_ == 0) { - this->template GEMM(transA, - transB, - dim_a.height_, - dim_b.width_, - dim_a.width_, - alpha, - mat_a.data(), - mat_b.data(), - beta, - mat_out->mutable_data()); - } else { - PADDLE_ENFORCE(dim_a.batch_size_ == dim_b.batch_size_ || - dim_a.batch_size_ == 0 || dim_b.batch_size_ == 0); - this->template BatchedGEMM( - transA, - transB, - dim_a.height_, - dim_b.width_, - dim_a.width_, - alpha, - mat_a.data(), - mat_b.data(), - beta, - mat_out->mutable_data(), - dim_a.batch_size_ == 0 ? 
dim_b.batch_size_ : dim_a.batch_size_, - dim_a.stride_, - dim_b.stride_); - } -} -template -template -void Blas::VINV(int n, const T *a, T *y) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VINV(n, a, y); -#else - for (int i = 0; i < n; ++i) { - y[i] = 1.0 / a[i]; - } -#endif -} - -template <> -template -void Blas::VMERF(int n, - const T *a, - T *y, - int64_t mode) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VMERF(n, a, y, mode); -#else - for (int i = 0; i < n; ++i) { - y[i] = std::erf(a[i]); - } -#endif -} - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/concat_and_split.cc b/lite/backends/x86/math/concat_and_split.cc deleted file mode 100644 index bec93dde41..0000000000 --- a/lite/backends/x86/math/concat_and_split.cc +++ /dev/null @@ -1,131 +0,0 @@ -/* Copyright (c) 2018 paddlepaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "lite/backends/x86/math/concat_and_split.h" -#include -#include - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -/* - * All tensors' dimension should be the same and the values of - * each dimension must be the same, except the axis dimension. - */ -template -class ConcatFunctor { - public: - void operator()(const lite::X86Context& context, - const std::vector& input, - int axis, - lite::Tensor* output) { - // TODO(zcd): Add input data validity checking - int num = input.size(); - - int rows = 1; - auto dim_0 = input[0].dims(); - for (int i = 0; i < axis; ++i) { - rows *= dim_0[i]; - } - int out_rows = rows, out_cols = 0; - - std::vector input_cols(input.size()); - for (int i = 0; i < num; ++i) { - int t_cols = input[i].numel() / rows; - out_cols += t_cols; - input_cols[i] = t_cols; - } - // auto cpu_place = boost::get(context.GetPlace()); - - // computation - auto output_data = output->mutable_data(); - int col_idx = 0; - for (int j = 0; j < num; ++j) { - int col_len = input_cols[j]; - auto* input_data = input[j].data(); - for (int k = 0; k < out_rows; ++k) { - // memory::Copy(cpu_place, output_data + k * out_cols + col_idx, - // cpu_place, - // input_data + k * col_len, sizeof(T) * col_len); - std::copy_n(input_data + k * col_len, - col_len, - output_data + k * out_cols + col_idx); - } - col_idx += col_len; - } - } -}; - -/* - * All tensors' dimension should be the same and the values of - * each dimension must be the same, except the axis dimension. 
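- *
- * e.g. splitting a [2, 5] input on axis = 1 against reference shapes [2, 3]
- * and [2, 2] copies, for each of the 2 rows, 3 and then 2 contiguous values
- * into the two outputs.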
- */ -template -class SplitFunctor { - public: - void operator()(const lite::X86Context& context, - const lite::Tensor& input, - const std::vector& ref_inputs, - const int axis, - std::vector* outputs) { - // TODO(zcd): Add input data validity checking - size_t num = outputs->size(); - - int input_rows = 1; - auto dim_0 = ref_inputs[0]->dims(); - for (int i = 0; i < axis; ++i) { - input_rows *= dim_0[i]; - } - - int input_cols = 0; - - std::vector output_cols(outputs->size()); - for (size_t i = 0; i < num; ++i) { - int t_cols = ref_inputs[i]->numel() / input_rows; - input_cols += t_cols; - output_cols[i] = t_cols; - } - // auto cpu_place = boost::get(context.GetPlace()); - - // computation - for (int k = 0; k < input_rows; ++k) { - const T* src_ptr = input.data() + k * input_cols; - int col_idx = 0; - for (size_t j = 0; j < num; ++j) { - int col_len = output_cols[j]; - auto* out_tensor = outputs->at(j); - if (out_tensor != nullptr) { - T* dst_ptr = out_tensor->mutable_data() + k * col_len; - std::copy_n(src_ptr + col_idx, col_len, dst_ptr); - // memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx, - // sizeof(T) * col_len); - } - col_idx += col_len; - } - } - } -}; - -#define DEFINE_FUNCTOR(type) \ - template class ConcatFunctor; \ - template class SplitFunctor; - -FOR_ALL_TYPES(DEFINE_FUNCTOR); - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/concat_and_split.h b/lite/backends/x86/math/concat_and_split.h deleted file mode 100644 index 8c996411cd..0000000000 --- a/lite/backends/x86/math/concat_and_split.h +++ /dev/null @@ -1,83 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "lite/core/context.h" -#include "lite/core/tensor.h" -#include "lite/fluid/data_type.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -/* - * \brief Concatenate the input tensors along the dimension axis. - * TODO(zcd): maybe it needs to be more detailed. - * Examples: - * Input[0] = [[1,2],[3,4]] - * Input[1] = [[5,6]] - * axis = 0 - * - * Output = [[1,2], - * [3,4], - * [5,6]] - */ -template -class ConcatFunctor { - public: - void operator()(const lite::Context& context, - const std::vector& input, - int axis, - lite::Tensor* output); -}; - -/* - * \brief Split the input tensors along the dimension axis into outputs. - * TODO(zcd): maybe it needs to be more detailed. 
- * Examples: - * Input = [[1,2], - * [3,4], - * [5,6]] - * axis = 0 - * - * Output[0] = [[1,2],[3,4]] - * Output[1] = [[5,6]] - */ -template -class SplitFunctor { - public: - void operator()(const lite::Context& context, - const lite::Tensor& input, - const std::vector& ref_inputs, - int axis, - std::vector* outputs); -}; - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle - -#define FOR_ALL_TYPES(macro) \ - macro(int); \ - macro(float); \ - macro(double); \ - macro(bool); \ - macro(int64_t); \ - macro(int16_t); \ - macro(uint8_t); \ - macro(int8_t); \ - macro(::paddle::lite::fluid::float16) diff --git a/lite/backends/x86/math/context_project.cc b/lite/backends/x86/math/context_project.cc deleted file mode 100644 index dafced7780..0000000000 --- a/lite/backends/x86/math/context_project.cc +++ /dev/null @@ -1,28 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "lite/backends/x86/math/context_project.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -template class ContextProjectFunctor; -template class ContextProjectFunctor; - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/context_project.h b/lite/backends/x86/math/context_project.h deleted file mode 100644 index 0c56e0d759..0000000000 --- a/lite/backends/x86/math/context_project.h +++ /dev/null @@ -1,361 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include "lite/backends/x86/math/blas.h" -#include "lite/backends/x86/math/im2col.h" -#include "lite/core/context.h" -#include "lite/core/tensor.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -/* - * \brief Context projection concatenates features in adjacent time-steps in - * a sequence. The i-th row of the output is the concatenation of - * context_length rows of the input. The context_length rows are the - * consecutive rows from the i+shift_start row. - * ContextProjectGradFunctor is the inverse process of ContextProjectFunctor. - * - * \param in Input data. - * \param Shape The shape of Input data: - * [mini-batch, input_hidden_size]. - * - * \param padding_data Padding data. - * \param Shape The shape of Padding data: - * [up_pad + down_pad, input_hidden_size]. - * - * \param col Col data. 
- * \param Shape The shape of Col data: - * [mini-batch, context_length * input_hidden_size]. - * - * For a mini-batch of 2 variable lengths sentences, containing 3, and 1 - * time-steps: - * - * Assumed input (X) is a [4, M, N] float LoDTensor, and X->lod()[0] = [0, 3, - * 4]. - * Besides, for the sake of simplicity, we assume M=1 and N=2. - * - * X = [[a1, a2; - * b1, b2; - * c1, c2] - * [d1, d2]] - * - * This is to say that input (X) has 4 words and the dimension of each word - * representation is 2. - * - * - Case1: - * If context_start is -1 and padding_trainable is false, we use zero to pad - * instead of learned weight to pad, - * and the context_length is 3, the output (Out) is: - * - * Out =[[0, 0, a1, a2, b1, b2; - * a1, a2, b1, b2, c1, c2; - * b1, b2, c1, c2, 0, 0 ] - * [0, 0, d1, d2, 0, 0 ]] - * - * - Case2: - * If context_start is -1 and padding_trainable is true, we use learned weight - * to pad, - * and the context_length is 3, the output (Out) is: - * - * Out = [[w1, w2, a1, a2, b1, b2; - * a1, a2, b1, b2, c1, c2; - * b1, b2, c1, c2, w3, w4] - * [w1, w2, d1, d2, w3, w4]] - * - */ - -template -class ContextProjectFunctor { - public: - void operator()(const lite::Context& context, - const lite::Tensor& in, - const lite::Tensor* padding_data, - bool padding_trainable, - const int context_start, - const int context_length, - const int context_stride, - const int up_pad, - const int down_pad, - lite::Tensor* col) { - auto lod_level_0 = in.lod()[0]; - - math::Im2ColFunctor im2col_ocf; - - std::vector dilation({1, 1}); - std::vector padding({up_pad, 0, down_pad, 0}); - std::vector stride({context_stride, 1}); - - int input_row_begin, input_row_end; - int sequence_height, sequence_width; - sequence_width = in.dims()[1]; - - for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { - if (lod_level_0[i] == lod_level_0[i + 1]) continue; - - input_row_begin = (context_start > 0) - ? 
static_cast(lod_level_0[i]) + context_start - : static_cast(lod_level_0[i]); - input_row_end = static_cast(lod_level_0[i + 1]); - - // lite::Tensor out_t = - // col->Slice(static_cast(lod_level_0[i]), - // static_cast(lod_level_0[i + 1])); - lite::Tensor out_t = - col->Slice(static_cast(lod_level_0[i]), - static_cast(lod_level_0[i + 1])); - - sequence_height = static_cast(out_t.dims()[0]); - - if (input_row_begin < input_row_end) { - lite::Tensor in_t = in.Slice(input_row_begin, input_row_end); - - std::vector output_shape( - {sequence_height, - 1, - 1, - context_length, - sequence_width}); // output_height, output_width, - // input_channels, filter_height, filter_width - out_t.Resize(output_shape); - - std::vector input_shape( - {1, - input_row_end - input_row_begin, - sequence_width}); // input_channels, input_height, input_width - in_t.Resize(input_shape); - im2col_ocf(context, in_t, dilation, stride, padding, &out_t); - out_t.Resize({sequence_height, context_length * sequence_width}); - } - } - if (padding_trainable) { - PADDLE_ENFORCE(padding_data != nullptr); - for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { - if (lod_level_0[i] == lod_level_0[i + 1]) continue; - - lite::Tensor out_t = - col->Slice(static_cast(lod_level_0[i]), - static_cast(lod_level_0[i + 1])); - - sequence_height = static_cast(out_t.dims()[0]); - - // add up trainable data - out_t.Resize({static_cast(sequence_height) * context_length, - sequence_width}); - - if (up_pad > 0) { // add up pad - int padding_rows = std::min( - up_pad, static_cast(lod_level_0[i + 1] - lod_level_0[i])); - - for (int k = 0; k < padding_rows; ++k) { - int padding_size = - k + context_length < up_pad ? context_length : up_pad - k; - lite::Tensor out_t_sub = out_t.Slice( - k * context_length, k * context_length + padding_size); - lite::Tensor w_sub = - padding_data->Slice(k, k + padding_size); - - out_t_sub.CopyDataFrom(w_sub); - - // framework::TensorCopy(w_sub, context.GetPlace(), context, - // &out_t_sub); - } - } - if (down_pad > 0) { // add down pad - int down_pad_begin_row = - std::max(0, - (sequence_height - context_start - context_length) + 1) + - 1; - int padding_begin = std::max(0, context_start - sequence_height); - int padding_size = - sequence_height - context_start >= context_length - ? 
1 - : context_length - (sequence_height - context_start); - if (context_start >= sequence_height) padding_size = context_length; - int padding_idx = padding_begin; - for (int t = 0; t + down_pad_begin_row <= sequence_height; - ++t, ++padding_size) { - if (context_start >= sequence_height) padding_size = context_length; - if (padding_size > context_length) { - padding_size = context_length; - padding_idx++; - } - if (padding_begin > 0 || sequence_height == context_start) - padding_idx = padding_begin + t; - - lite::Tensor out_t_sub = out_t.Slice( - (down_pad_begin_row + t) * context_length - padding_size, - (down_pad_begin_row + t) * context_length); - lite::Tensor w_sub = padding_data->Slice( - up_pad + padding_idx, up_pad + padding_idx + padding_size); - out_t_sub.CopyDataFrom(w_sub); - // framework::TensorCopy(w_sub, context.GetPlace(), context, - // &out_t_sub); - } - } - out_t.Resize({sequence_height, - static_cast(context_length) * sequence_width}); - } - } - } -}; - -template -class ContextProjectGradFunctor { - public: - void operator()(const lite::Context& context, - const lite::Tensor& in, - bool padding_trainable, - const int context_start, - const int context_length, - const int context_stride, - const int up_pad, - const int down_pad, - bool pad_grad, - bool input_grad, - lite::Tensor* padding_data, - lite::Tensor* col) { - auto lod_level_0 = in.lod()[0]; - - math::Col2ImFunctor col2im_ocf; - - std::vector dilation({1, 1}); - std::vector padding({up_pad, 0, down_pad, 0}); - std::vector stride({context_stride, 1}); - - int input_row_begin, input_row_end; - int sequence_height, sequence_width; - sequence_width = in.dims()[1]; - auto blas = math::GetBlas, T>(context); - - if (input_grad) { - for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { - if (lod_level_0[i] == lod_level_0[i + 1]) continue; - - input_row_begin = (context_start > 0) - ? static_cast(lod_level_0[i]) + context_start - : static_cast(lod_level_0[i]); - input_row_end = static_cast(lod_level_0[i + 1]); - - lite::Tensor out_t = - col->Slice(static_cast(lod_level_0[i]), - static_cast(lod_level_0[i + 1])); - - sequence_height = static_cast(out_t.dims()[0]); - - if (input_row_begin < input_row_end) { - lite::Tensor in_t = in.Slice(input_row_begin, input_row_end); - - std::vector output_shape( - {sequence_height, - 1, - 1, - context_length, - sequence_width}); // output_height, output_width, - // input_channels, filter_height, filter_width - out_t.Resize(output_shape); - - std::vector input_shape( - {1, - input_row_end - input_row_begin, - sequence_width}); // input_channels, input_height, input_width - in_t.Resize(input_shape); - - col2im_ocf(context, out_t, dilation, stride, padding, &in_t); - out_t.Resize({sequence_height, context_length * sequence_width}); - } - } - } - if (pad_grad) { - if (padding_trainable) { - for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { - if (lod_level_0[i] == lod_level_0[i + 1]) continue; - - lite::Tensor out_t = - col->Slice(static_cast(lod_level_0[i]), - static_cast(lod_level_0[i + 1])); - - sequence_height = static_cast(out_t.dims()[0]); - out_t.Resize({static_cast(sequence_height) * context_length, - sequence_width}); - - if (up_pad > 0) { - int padding_rows = std::min( - up_pad, static_cast(lod_level_0[i + 1] - lod_level_0[i])); - - for (int k = 0; k < padding_rows; ++k) { - int padding_size = - k + context_length < up_pad ? 
context_length : up_pad - k; - lite::Tensor out_t_sub = out_t.Slice( - k * context_length, k * context_length + padding_size); - lite::Tensor w_sub = - padding_data->Slice(k, k + padding_size); - blas.AXPY(w_sub.numel(), - static_cast(1), - out_t_sub.data(), - w_sub.data()); - } - } - if (down_pad > 0) { - int down_pad_begin_row = - std::max( - 0, (sequence_height - context_start - context_length) + 1) + - 1; - int padding_begin = std::max(0, context_start - sequence_height); - int padding_size = - sequence_height - context_start >= context_length - ? 1 - : context_length - (sequence_height - context_start); - if (context_start >= sequence_height) padding_size = context_length; - int padding_idx = padding_begin; - for (int t = 0; t + down_pad_begin_row <= sequence_height; - ++t, ++padding_size) { - if (context_start >= sequence_height) - padding_size = context_length; - if (padding_size > context_length) { - padding_size = context_length; - padding_idx++; - } - if (padding_begin > 0 || sequence_height == context_start) - padding_idx = padding_begin + t; - - lite::Tensor out_t_sub = out_t.Slice( - (down_pad_begin_row + t) * context_length - padding_size, - (down_pad_begin_row + t) * context_length); - lite::Tensor w_sub = padding_data->Slice( - up_pad + padding_idx, up_pad + padding_idx + padding_size); - blas.AXPY(w_sub.numel(), - static_cast(1), - out_t_sub.data(), - w_sub.data()); - } - } - out_t.Resize({sequence_height, - static_cast(context_length) * sequence_width}); - } - } - } - } -}; - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/cos_sim_functor.cc b/lite/backends/x86/math/cos_sim_functor.cc deleted file mode 100644 index 8dffa380f1..0000000000 --- a/lite/backends/x86/math/cos_sim_functor.cc +++ /dev/null @@ -1,57 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/
-
-#include "lite/backends/x86/math/cos_sim_functor.h"
-
-namespace paddle {
-namespace lite {
-namespace x86 {
-namespace math {
-
-template <typename T>
-struct CosSimDyFunctor<lite::TargetType::kX86, T> {
-  void operator()(const lite::X86Context& ctx,
-                  const T* x_norm,
-                  const T* y_norm,
-                  const T* x,
-                  const T* y,
-                  const T* z,
-                  const T* dz,
-                  const size_t rows,
-                  const size_t cols,
-                  T* dy) const {
-    for (size_t row_id = 0; row_id < rows; ++row_id) {
-      auto xy_norm_prod = x_norm[row_id] * y_norm[0];
-      auto dz_data = dz[row_id];
-      auto z_data = z[row_id];
-      auto* x_data = x + cols * row_id;
-      auto reciprocal_xy_norm_prod = 1 / xy_norm_prod;
-
-      auto y_norm_square = y_norm[0] * y_norm[0];
-      auto reciprocal_y_norm_square = 1 / y_norm_square;
-      for (size_t i = 0; i < cols; ++i) {
-        dy[i] += dz_data * (x_data[i] * reciprocal_xy_norm_prod -
-                            z_data * y[i] * reciprocal_y_norm_square);
-      }
-    }
-  }
-};
-
-template struct CosSimDyFunctor<lite::TargetType::kX86, float>;
-template struct CosSimDyFunctor<lite::TargetType::kX86, double>;
-
-}  // namespace math
-}  // namespace x86
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/x86/math/cos_sim_functor.h b/lite/backends/x86/math/cos_sim_functor.h
deleted file mode 100644
index 16470f302a..0000000000
--- a/lite/backends/x86/math/cos_sim_functor.h
+++ /dev/null
@@ -1,187 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <cmath>
-#include <cstddef>
-#include "lite/core/context.h"
-#include "lite/utils/macros.h"
-
-namespace paddle {
-namespace lite {
-namespace x86 {
-namespace math {
-
-template <typename T, bool same_row>
-struct CosSimFunctor {
-  CosSimFunctor(const T* x, const T* y, T* x_norm, T* y_norm, T* z, int cols)
-      : x_norm_(x_norm),
-        y_norm_(y_norm),
-        x_(x),
-        y_(y),
-        z_(z),
-        cols_(static_cast<size_t>(cols)) {}
-
-  inline HOSTDEVICE void operator()(size_t row_id) const {
-    auto* x = x_ + cols_ * row_id;
-    T xx = 0, xy = 0, yy = 0;
-    if (same_row) {
-      auto* y = y_ + cols_ * row_id;
-      T tep_x, tep_y;
-      for (size_t i = 0; i < cols_; ++i) {
-        tep_x = x[i];
-        tep_y = y[i];
-        xx += tep_x * tep_x;
-        yy += tep_y * tep_y;
-        xy += tep_x * tep_y;
-      }
-      xx = sqrt(xx);
-      yy = sqrt(yy);
-      y_norm_[row_id] = yy;
-      x_norm_[row_id] = xx;
-      z_[row_id] = xy / (xx * yy);
-    } else {  // This could be written in a better way.
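-      // In this branch y_ is a single row shared by every row of x_, so
-      // only the row_id == 0 iteration writes y_norm_[0].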
- T tep_x, tep_y; - for (size_t i = 0; i < cols_; ++i) { - tep_x = x[i]; - tep_y = y_[i]; - xx += tep_x * tep_x; - yy += tep_y * tep_y; - xy += tep_x * tep_y; - } - xx = sqrt(xx); - yy = sqrt(yy); - if (row_id == 0) y_norm_[0] = yy; - x_norm_[row_id] = xx; - z_[row_id] = xy / (xx * yy); - } - } - - T* x_norm_; - T* y_norm_; - const T* x_; - const T* y_; - T* z_; - const size_t cols_; -}; - -template -struct CosSimGradFunctor { - CosSimGradFunctor(const T* x_norm, - const T* y_norm, - const T* x, - const T* y, - const T* z, - const T* dz, - T* dx, - int cols) - : x_norm_(x_norm), - y_norm_(y_norm), - x_(x), - y_(y), - z_(z), - dz_(dz), - dx_(dx), - cols_(static_cast(cols)) {} - - inline HOSTDEVICE void operator()(size_t row_id) const { - auto x_norm_square = x_norm_[row_id] * x_norm_[row_id]; - auto xy_norm_prod = x_norm_[row_id] * y_norm_[row_id]; - auto dz = dz_[row_id]; - auto z = z_[row_id]; - - auto* dx = dx_ + cols_ * row_id; - auto* x = x_ + cols_ * row_id; - auto* y = y_ + cols_ * row_id; - - auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; - auto reciprocal_x_norm_square = 1 / x_norm_square; - for (size_t i = 0; i < cols_; ++i) { - dx[i] = dz * (y[i] * reciprocal_xy_norm_prod - - z * x[i] * reciprocal_x_norm_square); - } - } - - const T* x_norm_; - const T* y_norm_; - const T* x_; - const T* y_; - const T* z_; - const T* dz_; - T* dx_; - const size_t cols_; -}; - -template -struct CosSimDxFunctor { - CosSimDxFunctor(const T* x_norm, - const T* y_norm, - const T* x, - const T* y, - const T* z, - const T* dz, - T* dx, - int cols) - : x_norm_(x_norm), - y_norm_(y_norm), - x_(x), - y_(y), - z_(z), - dz_(dz), - dx_(dx), - cols_(static_cast(cols)) {} - - inline HOSTDEVICE void operator()(size_t row_id) const { - auto xy_norm_prod = x_norm_[row_id] * y_norm_[0]; - auto dz = dz_[row_id]; - auto z = z_[row_id]; - auto* x = x_ + cols_ * row_id; - auto reciprocal_xy_norm_prod = 1 / xy_norm_prod; - auto x_norm_square = x_norm_[row_id] * x_norm_[row_id]; - auto* dx = dx_ + cols_ * row_id; - auto reciprocal_x_norm_square = 1 / x_norm_square; - - for (size_t i = 0; i < cols_; ++i) { - dx[i] = dz * (y_[i] * reciprocal_xy_norm_prod - - z * x[i] * reciprocal_x_norm_square); - } - } - const T* x_norm_; - const T* y_norm_; - const T* x_; - const T* y_; - const T* z_; - const T* dz_; - T* dx_; - const size_t cols_; -}; - -template -struct CosSimDyFunctor { - void operator()(const lite::Context& ctx, - const T* x_norm, - const T* y_norm, - const T* x, - const T* y, - const T* z, - const T* dz, - const size_t rows, - const size_t cols, - T* dy) const; -}; - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/cpu_vec.h b/lite/backends/x86/math/cpu_vec.h deleted file mode 100644 index 9ff64d53f0..0000000000 --- a/lite/backends/x86/math/cpu_vec.h +++ /dev/null @@ -1,662 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
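
Note: cpu_vec.h, which begins below, follows one pattern throughout -- each AVX
specialization processes 8 floats per iteration (YMM_FLOAT_BLOCK) and finishes
the remainder with scalar code. A minimal self-contained sketch of that
pattern, assuming an AVX-capable build (-mavx); the helper name is hypothetical:

    #include <immintrin.h>

    // Process n floats in 8-wide AVX blocks, then a scalar tail -- the shape
    // of every vec_* specialization in this header. Unaligned loads/stores
    // are used, so x and y need no particular alignment.
    static void scale_avx(const int n, const float a, const float* x, float* y) {
      const int block = 8;            // YMM_FLOAT_BLOCK
      const int end = n - n % block;  // last index covered by full blocks
      const __m256 va = _mm256_set1_ps(a);
      int i = 0;
      for (; i < end; i += block) {
        _mm256_storeu_ps(y + i, _mm256_mul_ps(_mm256_loadu_ps(x + i), va));
      }
      for (; i < n; ++i) {            // scalar tail for the remainder
        y[i] = a * x[i];
      }
    }
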
*/ - -#pragma once -#include -#include -#include -#include "lite/backends/x86/cpu_info.h" -#include "lite/utils/paddle_enforce.h" - -#ifdef PADDLE_WITH_MKLML -#include "lite/backends/x86/mklml.h" -#endif - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -#define SIGMOID_THRESHOLD_MIN -40.0 -#define SIGMOID_THRESHOLD_MAX 13.0 - -#define YMM_FLOAT_BLOCK 8 -#define AVX_DOUBLE_BLOCK 4 -#define YMM_FLOAT_BLOCK 8 -#define AVX2_DOUBLE_BLOCK 4 -#define ZMM_FLOAT_BLOCK 16 -#define AVX512_DOUBLE_BLOCK 8 - -template -inline void vec_exp(const int n, const T* x, T* y) { - for (int i = 0; i < n; ++i) { - y[i] = std::exp(x[i]); - } -} - -template -inline void vec_scal(const int n, const T a, T* x) { - for (int i = 0; i < n; ++i) { - x[i] = a * x[i]; - } -} - -#ifdef PADDLE_WITH_MKLML -template <> -inline void vec_exp(const int n, const float* x, float* y) { - constexpr int small_enough = 128; - if (n < small_enough) { - for (int i = 0; i < n; ++i) { - y[i] = std::exp(x[i]); - } - } else { - lite::x86::vsExp(n, x, y); - } -} - -template <> -inline void vec_exp(const int n, const double* x, double* y) { - lite::x86::vdExp(n, x, y); -} - -template <> -inline void vec_scal(const int n, const float a, float* x) { - lite::x86::cblas_sscal(n, a, x, 1); -} - -template <> -inline void vec_scal(const int n, const double a, double* x) { - lite::x86::cblas_dscal(n, a, x, 1); -} -#endif - -// MKL scal only support inplace, choose this if src and dst are not equal -template -inline void vec_scal(const int n, const T a, const T* x, T* y) { - for (int i = 0; i < n; ++i) { - y[i] = a * x[i]; - } -} - -template <> -inline void vec_scal(const int n, - const float a, - const float* x, - float* y) { -#ifdef __AVX__ - constexpr int block = YMM_FLOAT_BLOCK; - if (n < block) { - vec_scal(n, a, x, y); - return; - } - const int rest = n % block; - const int end = n - rest; - int i = 0; - __m256 scalar = _mm256_set1_ps(a); - __m256 tmp; -#define MOVE_ONE_STEP \ - tmp = _mm256_loadu_ps(x + i); \ - tmp = _mm256_mul_ps(tmp, scalar); \ - _mm256_storeu_ps(y + i, tmp) - for (i = 0; i < end; i += block) { - MOVE_ONE_STEP; - } -#undef MOVE_ONE_STEP - if (rest == 0) { - return; - } - // can not continue move step if src and dst are inplace - for (i = n - rest; i < n; ++i) { - y[i] = a * x[i]; - } -#else - vec_scal(n, a, x, y); -#endif -} - -template <> -inline void vec_scal(const int n, - const float a, - const float* x, - float* y) { - vec_scal(n, a, x, y); -} - -template <> -inline void vec_scal(const int n, - const float a, - const float* x, - float* y) { - // TODO(TJ): enable me - vec_scal(n, a, x, y); -} - -template -inline void vec_sum(const size_t n, const T* x, T* s) { - s[0] = x[0]; - for (size_t i = 1; i < n; ++i) { - s[0] += x[i]; - } -} - -template <> -inline void vec_sum(const size_t n, - const float* x, - float* s) { -#ifdef __AVX__ - constexpr unsigned int block = YMM_FLOAT_BLOCK; - if (n < block) { - vec_sum(n, x, s); - return; - } - - unsigned int i, end; - i = end = 0; - s[0] = 0.f; - - end = n & ~(block - 1); - __m256 tmp = _mm256_setzero_ps(); - for (i = 0; i < end; i += block) { - tmp = _mm256_add_ps(tmp, _mm256_load_ps(x + i)); - } - - __m256 hsum = _mm256_hadd_ps(tmp, tmp); - hsum = _mm256_add_ps(hsum, _mm256_permute2f128_ps(hsum, hsum, 0x1)); - _mm_store_ss( - s, - _mm_hadd_ps(_mm256_castps256_ps128(hsum), _mm256_castps256_ps128(hsum))); - - for (; i < n; i++) { - s[0] += x[i]; - } -#else - vec_sum(n, x, s); -#endif -} - -template -inline void vec_mul(const size_t n, const T* x, const T* 
y, T* z) { - for (size_t i = 0; i < n; ++i) { - z[i] = x[i] * y[i]; - } -} - -template <> -inline void vec_mul(const size_t n, - const float* x, - const float* y, - float* z) { -#ifdef __AVX__ - constexpr unsigned int block = YMM_FLOAT_BLOCK; - if (n < block) { - vec_mul(n, x, y, z); - return; - } - - unsigned int i = 0, end = 0; - end = n & ~(block - 1); - for (i = 0; i < end; i += block) { - _mm256_storeu_ps( - z + i, _mm256_mul_ps(_mm256_loadu_ps(x + i), _mm256_loadu_ps(y + i))); - } - - for (; i < n; i++) { - z[i] = x[i] * y[i]; - } -#else - vec_mul(n, x, y, z); -#endif -} - -template -inline void vec_mul_reduce(const size_t n, const T* x, const T* y, T* z) { - z[0] = x[0] * y[0]; - for (size_t i = 1; i < n; ++i) { - z[0] += x[i] * y[i]; - } -} - -template <> -inline void vec_mul_reduce(const size_t n, - const float* x, - const float* y, - float* z) { -#ifdef __AVX__ - constexpr unsigned int block = YMM_FLOAT_BLOCK; - if (n < block) { - vec_mul_reduce(n, x, y, z); - return; - } - - unsigned int i = 0, end = 0; - z[0] = 0.f; - - end = n & ~(block - 1); - __m256 tmp = _mm256_setzero_ps(); - for (i = 0; i < end; i += block) { - tmp = _mm256_add_ps( - tmp, _mm256_mul_ps(_mm256_loadu_ps(x + i), _mm256_loadu_ps(y + i))); - } - - __m256 hsum = _mm256_hadd_ps(tmp, tmp); - hsum = _mm256_add_ps(hsum, _mm256_permute2f128_ps(hsum, hsum, 0x1)); - _mm_store_ss( - z, - _mm_hadd_ps(_mm256_castps256_ps128(hsum), _mm256_castps256_ps128(hsum))); - - for (; i < n; i++) { - z[0] += x[i] * y[i]; - } -#else - vec_mul_reduce(n, x, y, z); -#endif -} - -template -inline void vec_bias_sub(const int n, const T a, const T* x, T* y) { - for (int i = 0; i < n; ++i) { - y[i] = a - x[i]; - } -} - -template <> -inline void vec_bias_sub(const int n, - const float a, - const float* x, - float* y) { -#ifdef __AVX__ - constexpr int block = YMM_FLOAT_BLOCK; - if (n < block) { - vec_bias_sub(n, a, x, y); - return; - } - const int rest = n % block; - const int end = n - rest; - int i = 0; - __m256 bias = _mm256_set1_ps(a); - __m256 tmp; -#define MOVE_ONE_STEP \ - tmp = _mm256_loadu_ps(x + i); \ - tmp = _mm256_sub_ps(bias, tmp); \ - _mm256_storeu_ps(y + i, tmp) - for (i = 0; i < end; i += block) { - MOVE_ONE_STEP; - } -#undef MOVE_ONE_STEP - if (rest == 0) { - return; - } - // can not continue move step if src and dst are inplace - for (i = n - rest; i < n; ++i) { - y[i] = a - x[i]; - } -#else - vec_bias_sub(n, a, x, y); -#endif -} - -template <> -inline void vec_bias_sub(const int n, - const float a, - const float* x, - float* y) { - vec_bias_sub(n, a, x, y); -} - -template <> -inline void vec_bias_sub(const int n, - const float a, - const float* x, - float* y) { - // TODO(TJ): enable me - vec_bias_sub(n, a, x, y); -} - -// out = x*y + (1-x)*z -template -inline void vec_cross(const int n, const T* x, const T* y, const T* z, T* out) { - for (int i = 0; i < n; ++i) { - out[i] = x[i] * y[i] + (static_cast(1) - x[i]) * z[i]; - } -} - -template <> -inline void vec_cross( - const int n, const float* x, const float* y, const float* z, float* out) { -#ifdef __AVX__ - constexpr int block = YMM_FLOAT_BLOCK; - if (n < block) { - vec_cross(n, x, y, z, out); - return; - } - const int rest = n % block; - const int end = n - rest; - int i = 0; - __m256 bias = _mm256_set1_ps(1.f); - __m256 tmpx, tmpy, tmpz; - for (i = 0; i < end; i += block) { - tmpx = _mm256_loadu_ps(x + i); - tmpy = _mm256_loadu_ps(y + i); - tmpz = _mm256_loadu_ps(z + i); - tmpy = _mm256_mul_ps(tmpx, tmpy); - tmpx = _mm256_sub_ps(bias, tmpx); - tmpz = _mm256_mul_ps(tmpx, 
tmpz); - tmpz = _mm256_add_ps(tmpy, tmpz); - _mm256_storeu_ps(out + i, tmpz); - } - if (rest == 0) { - return; - } - // can not continue move step if src and dst are inplace - for (i = n - rest; i < n; ++i) { - out[i] = x[i] * y[i] + (1.f - x[i]) * z[i]; - } -#else - vec_cross(n, x, y, z, out); -#endif -} - -template <> -inline void vec_cross( - const int n, const float* x, const float* y, const float* z, float* out) { - vec_cross(n, x, y, z, out); -} - -template <> -inline void vec_cross( - const int n, const float* x, const float* y, const float* z, float* out) { - // TODO(TJ): enable me - vec_cross(n, x, y, z, out); -} - -template -inline void vec_clip(const size_t n, const T a, const T* x, T* y) { - for (size_t i = 0; i < n; ++i) { - y[i] = x[i] < a ? a : x[i]; - } -} - -template <> -inline void vec_clip(const size_t n, - const float a, - const float* x, - float* y) { -#ifdef __AVX__ - constexpr unsigned int block = YMM_FLOAT_BLOCK; - if (n < block) { - vec_clip(n, a, x, y); - return; - } - - unsigned int i = 0, end = 0; - end = n & ~(block - 1); - __m256 threshold = _mm256_set1_ps(a); - - for (i = 0; i < end; i += block) { - _mm256_storeu_ps(y + i, _mm256_max_ps(_mm256_loadu_ps(x + i), threshold)); - } - - for (; i < n; i++) { - y[i] = x[i] < a ? a : x[i]; - } -#else - vec_clip(n, a, x, y); -#endif -} - -template -inline void vec_add_bias(const int n, const T a, const T* x, T* y) { - for (int i = 0; i < n; ++i) { - y[i] = x[i] + a; - } -} - -template <> -inline void vec_add_bias(const int n, - const float a, - const float* x, - float* y) { -#ifdef __AVX__ - constexpr int block = YMM_FLOAT_BLOCK; - if (n < block) { - vec_add_bias(n, a, x, y); - return; - } - const int rest = n % block; - const int end = n - rest; - int i = 0; - __m256 bias = _mm256_set1_ps(a); - __m256 tmp; -#define MOVE_ONE_STEP \ - tmp = _mm256_loadu_ps(x + i); \ - tmp = _mm256_add_ps(tmp, bias); \ - _mm256_storeu_ps(y + i, tmp) - for (i = 0; i < end; i += block) { - MOVE_ONE_STEP; - } -#undef MOVE_ONE_STEP - if (rest == 0) { - return; - } - // can not continue move step if src and dst are inplace - for (i = n - rest; i < n; ++i) { - y[i] = x[i] + a; - } -#else - vec_add_bias(n, a, x, y); -#endif -} - -template <> -inline void vec_add_bias(const int n, - const float a, - const float* x, - float* y) { - vec_add_bias(n, a, x, y); -} - -template <> -inline void vec_add_bias(const int n, - const float a, - const float* x, - float* y) { - // TODO(TJ): enable me - vec_add_bias(n, a, x, y); -} - -template -inline void vec_identity(const int n, const T* x, T* y) { - // do nothing - return; -} - -template -inline void vec_sigmoid(const int n, const T* x, T* y) { - const T min = SIGMOID_THRESHOLD_MIN; - const T max = SIGMOID_THRESHOLD_MAX; - for (int i = 0; i < n; ++i) { - y[i] = (x[i] < min) ? min : ((x[i] > max) ? 
max : x[i]); - y[i] = static_cast(0) - y[i]; - } - vec_exp(n, y, y); - for (int i = 0; i < n; ++i) { - y[i] = static_cast(1) / (static_cast(1) + y[i]); - } -} - -template <> -inline void vec_sigmoid(const int n, - const float* x, - float* y) { -#ifdef __AVX__ - constexpr int block = YMM_FLOAT_BLOCK; - if (n < block) { - vec_sigmoid(n, x, y); - return; - } - const int rest = n % block; - const int end = n - rest; - int i = 0; - __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); - __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); - __m256 zeros = _mm256_setzero_ps(); - __m256 tmp; -#define MOVE_ONE_STEP \ - tmp = _mm256_loadu_ps(x + i); \ - tmp = _mm256_max_ps(tmp, min); \ - tmp = _mm256_min_ps(tmp, max); \ - tmp = _mm256_sub_ps(zeros, tmp); \ - _mm256_storeu_ps(y + i, tmp) - for (i = 0; i < end; i += block) { - MOVE_ONE_STEP; - } -#undef MOVE_ONE_STEP - if (rest != 0) { - // can not continue move step since the src and dst address could be equal - const float xmin = SIGMOID_THRESHOLD_MIN; - const float xmax = SIGMOID_THRESHOLD_MAX; - for (i = n - rest; i < n; ++i) { - y[i] = 0.f - ((x[i] < xmin) ? xmin : ((x[i] > xmax) ? xmax : x[i])); - } - } - - vec_exp(n, y, y); - - __m256 ones = _mm256_set1_ps(1.0f); -#define MOVE_ONE_STEP \ - tmp = _mm256_loadu_ps(y + i); \ - tmp = _mm256_add_ps(ones, tmp); \ - tmp = _mm256_div_ps(ones, tmp); \ - _mm256_storeu_ps(y + i, tmp) - for (i = 0; i < end; i += block) { - MOVE_ONE_STEP; - } -#undef MOVE_ONE_STEP - if (rest == 0) { - return; - } - // can not continue move step - for (i = n - rest; i < n; ++i) { - y[i] = 1.f / (1.f + y[i]); - } -#else - vec_sigmoid(n, x, y); -#endif -} - -template <> -inline void vec_sigmoid(const int n, - const float* x, - float* y) { - vec_sigmoid(n, x, y); -} - -template <> -inline void vec_sigmoid(const int n, - const float* x, - float* y) { - // TODO(TJ): enable me - vec_sigmoid(n, x, y); -} - -template -inline void vec_tanh(const int n, const T* x, T* y) { - vec_scal(n, static_cast(2), x, y); - vec_sigmoid(n, y, y); - vec_scal(n, static_cast(2), y); - vec_add_bias(n, static_cast(-1), y, y); -} - -// TODO(TJ): make relu clip -template -inline void vec_relu(const int n, const T* x, T* y) { - for (int i = 0; i < n; ++i) { - y[i] = x[i] > 0 ? 
x[i] : 0; - } -} - -template <> -inline void vec_relu(const int n, - const float* x, - float* y) { -#ifdef __AVX__ - constexpr int block = YMM_FLOAT_BLOCK; - if (n < block * 4) { - vec_relu(n, x, y); - return; - } - - const int rest = n % block; - const int end = n - rest; - int i = 0; - __m256 zeros = _mm256_setzero_ps(); - __m256 tmp; -#define MOVE_ONE_STEP \ - tmp = _mm256_loadu_ps(x + i); \ - tmp = _mm256_max_ps(tmp, zeros); \ - _mm256_storeu_ps(y + i, tmp) - for (i = 0; i < end; i += block) { - MOVE_ONE_STEP; - } - if (rest == 0) { - return; - } - i = n - block; - MOVE_ONE_STEP; -#undef MOVE_ONE_STEP - -#else - vec_relu(n, x, y); -#endif -} - -template <> -inline void vec_relu(const int n, - const float* x, - float* y) { - vec_relu(n, x, y); -} - -template <> -inline void vec_relu(const int n, - const float* x, - float* y) { - // TODO(TJ): enable me - vec_relu(n, x, y); -} - -// TODO(TJ): optimize double of sigmoid, tanh and relu if necessary - -template -class VecActivations { - public: - std::function operator()( - const std::string& type) { - if (type == "sigmoid") { - return vec_sigmoid; - } else if (type == "relu") { - return vec_relu; - } else if (type == "tanh") { - return vec_tanh; - } else if (type == "identity" || type == "") { - return vec_identity; - } - PADDLE_THROW("Not support type: %s", type); - } -}; - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/cross_entropy.cc b/lite/backends/x86/math/cross_entropy.cc deleted file mode 100644 index 366486924a..0000000000 --- a/lite/backends/x86/math/cross_entropy.cc +++ /dev/null @@ -1,78 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
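
Note, for the cross_entropy.cc that follows: with hard labels the functor
reduces to loss[i] = -log(prob[i][label[i]]), zeroed when the label equals
ignore_index. A scalar sketch under those assumptions (axis_dim ==
num_classes, so num_remain == 1; helper name hypothetical):

    #include <cmath>
    #include <cstdint>
    #include <vector>

    // Hard-label cross entropy: one loss value per batch row; rows whose
    // label equals ignore_index contribute zero, as in the code below.
    static std::vector<float> cross_entropy(
        const std::vector<std::vector<float>>& prob,
        const std::vector<int64_t>& label,
        const int64_t ignore_index) {
      std::vector<float> loss(label.size(), 0.f);
      for (size_t i = 0; i < label.size(); ++i) {
        if (label[i] == ignore_index) continue;  // skipped rows stay at 0
        loss[i] = -std::log(prob[i][static_cast<size_t>(label[i])]);
      }
      return loss;
    }
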
*/ - -#include "lite/backends/x86/math/cross_entropy.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -template -using EigenMatrix = lite::fluid::EigenMatrix; - -template -class CrossEntropyFunctor { - public: - void operator()(const lite::X86Context& ctx, - lite::Tensor* out, - const lite::Tensor* prob, - const lite::Tensor* labels, - const bool softLabel, - const int ignore_index, - const int axis_dim) { - const int batch_size = prob->dims()[0]; - const int num_classes = prob->dims()[1]; - const int num_remain = num_classes / axis_dim; - - Eigen::DSizes batch_axis_remain(batch_size, axis_dim, num_remain); - - if (softLabel) { - auto in = EigenMatrix::From(*prob); - auto lbl = EigenMatrix::From(*labels); - auto loss = EigenMatrix::From(*out); - - loss.device(lite::fluid::EigenDeviceType()) = - -((lbl * in.log().unaryExpr(math::TolerableValue())) - .reshape(batch_axis_remain) - .sum(Eigen::DSizes(1))); - } else { - const T* prob_data = prob->data(); - T* loss_data = out->mutable_data(); - - const int64_t* label_data = labels->data(); - for (int i = 0; i < batch_size; ++i) { - for (int j = 0; j < num_remain; j++) { - int lbl = label_data[i * num_remain + j]; - PADDLE_ENFORCE((lbl >= 0 && lbl < axis_dim) || lbl == ignore_index); - int index = i * num_classes + lbl * num_remain + j; - int loss_idx = i * num_remain + j; - loss_data[loss_idx] = - lbl == ignore_index - ? 0 - : -math::TolerableValue()(std::log(prob_data[index])); - } - } - } - } -}; - -template class CrossEntropyFunctor; -template class CrossEntropyFunctor; -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/cross_entropy.h b/lite/backends/x86/math/cross_entropy.h deleted file mode 100644 index 6b66f0b085..0000000000 --- a/lite/backends/x86/math/cross_entropy.h +++ /dev/null @@ -1,74 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "lite/core/context.h" -#include "lite/core/tensor.h" -#include "lite/fluid/eigen.h" -#include "lite/utils/macros.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -template -struct TolerableValue { - HOSTDEVICE T operator()(const T& x) const { - PADDLE_ENFORCE(static_cast(std::is_floating_point::value)); - const T kApproInf = 1e20; - - if (x == INFINITY) return kApproInf; - if (x == -INFINITY) return -kApproInf; - return x; - } -}; - -// NOTE(dzh): float16 value clip behave different. -// 1. Our ValueClipping has a hardcore threshold 1e20 -// for float number. 1e20 will resulting in overflow in float16. -// 2. float16 should expose the the real number overflow to python. -// because mixed-training depends the inf/nan value to determine -// if the scale value will be adjusted. -// Also. In standard implementation of cross entropy, other -// framework not has the ValueClipping. 
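
Note: concretely, the float path above clips only infinities to +/-1e20, while
the float16 specialization below saturates to the type's limits. Illustrative
values (comments only, not part of the patch):

    // TolerableValue<float>()(std::log(0.f))  -> -1e20 (was -inf)
    // TolerableValue<float>()(1e30f * 1e30f)  -> +1e20 (overflowed to +inf)
    // TolerableValue<float>()(0.5f)           -> 0.5f  (finite, unchanged)
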
-template <> -struct TolerableValue { - HOSTDEVICE lite::fluid::float16 operator()( - const lite::fluid::float16& x) const { - if (lite::fluid::isfinite(x)) - return x; - else if (x > static_cast(0)) - return std::numeric_limits::max(); - else - return std::numeric_limits::min(); - } -}; - -template -class CrossEntropyFunctor { - public: - void operator()(const lite::Context& context, - lite::Tensor* out, - const lite::Tensor* prob, - const lite::Tensor* labels, - const bool softLabel, - const int ignore_index, - const int axis_dim); -}; -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/detail/CMakeLists.txt b/lite/backends/x86/math/detail/CMakeLists.txt deleted file mode 100644 index 0df1c060f9..0000000000 --- a/lite/backends/x86/math/detail/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -cc_library(activation_functions SRCS avx_functions.cc) diff --git a/lite/backends/x86/math/detail/activation_functions.h b/lite/backends/x86/math/detail/activation_functions.h deleted file mode 100644 index cb215df722..0000000000 --- a/lite/backends/x86/math/detail/activation_functions.h +++ /dev/null @@ -1,193 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include "lite/backends/x86/cpu_info.h" -#include "lite/utils/paddle_enforce.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { -namespace detail { - -#define SIGMOID_THRESHOLD_MIN -40.0 -#define SIGMOID_THRESHOLD_MAX 13.0 -#define EXP_MAX_INPUT 40.0 - -enum ActivationType { - kSigmoid, - kReLU, - kTanh, - kIdentity, -}; - -inline ActivationType GetActivationType(const std::string &type) { - if (type == "sigmoid") { - return ActivationType::kSigmoid; - } else if (type == "relu") { - return ActivationType::kReLU; - } else if (type == "tanh") { - return ActivationType::kTanh; - } else if (type == "identity" || type == "") { - return ActivationType::kIdentity; - } - PADDLE_ENFORCE(false, "Not support type %s", type); - // PADDLE_THROW("Not support type %s.", type); -} - -namespace forward { - -template -T Identity(const T a) { - return a; -} - -template -T Relu(const T a) { - return a > static_cast(0.0) ? a : static_cast(0.0); -} - -template -T Sigmoid(const T a) { - const T min = SIGMOID_THRESHOLD_MIN; - const T max = SIGMOID_THRESHOLD_MAX; - T tmp = (a < min) ? min : ((a > max) ? max : a); - return static_cast(1.0) / (static_cast(1.0) + exp(-tmp)); -} - -template -T Tanh(const T a) { - T tmp = -2.0 * a; - tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp; - return (2.0 / (1.0 + exp(tmp))) - 1.0; -} - -} // namespace forward - -namespace backward { - -template -T Identity(const T a, const T b) { - return a; -} - -template -T Relu(const T a, const T b) { - return a * (b > 0.0 ? 
1.0 : 0.0); -} - -template -T Sigmoid(const T a, const T b) { - return a * b * (1.0 - b); -} - -template -T Tanh(const T a, const T b) { - return a * (1.0 - b * b); -} - -} // namespace backward - -template -struct Active { - typedef T (*Act)(T); - typedef T (*ActGrad)(T, T); -}; - -static Active::Act kActFloat[] = {&forward::Sigmoid, - &forward::Relu, - &forward::Tanh, - &forward::Identity}; - -static Active::ActGrad kActGradFloat[] = {&backward::Sigmoid, - &backward::Relu, - &backward::Tanh, - &backward::Identity}; - -static Active::Act kActDouble[] = {&forward::Sigmoid, - &forward::Relu, - &forward::Tanh, - &forward::Identity}; - -static Active::ActGrad kActGradDouble[] = {&backward::Sigmoid, - &backward::Relu, - &backward::Tanh, - &backward::Identity}; - -namespace forward { -inline float activation(float a, int index) { return kActFloat[index](a); } - -inline double activation(double a, int index) { return kActDouble[index](a); } - -} // namespace forward - -namespace backward { -inline float activation(float a, float b, int index) { - return kActGradFloat[index](a, b); -} - -inline double activation(double a, double b, int index) { - return kActGradDouble[index](a, b); -} -} // namespace backward - -#ifdef __AVX__ -namespace forward { -namespace avx { -__m256 Relu(const __m256 a); -__m256 Sigmoid(const __m256 a); -__m256 Tanh(const __m256 a); -__m256 Identity(const __m256 a); -} // namespace avx -} // namespace forward - -namespace backward { -namespace avx { -__m256 Relu(const __m256 a, const __m256 b); -__m256 Sigmoid(const __m256 a, const __m256 b); -__m256 Tanh(const __m256 a, const __m256 b); -__m256 Identity(const __m256 a, const __m256 b); -} // namespace avx -} // namespace backward - -static Active<__m256>::Act kActAvx[] = {&forward::avx::Sigmoid, - &forward::avx::Relu, - &forward::avx::Tanh, - &forward::avx::Identity}; - -static Active<__m256>::ActGrad kActGradAvx[] = {&backward::avx::Sigmoid, - &backward::avx::Relu, - &backward::avx::Tanh, - &backward::avx::Identity}; - -namespace forward { -inline __m256 activation(__m256 a, int index) { return kActAvx[index](a); } -} // namespace forward - -namespace backward { -inline __m256 activation(__m256 a, __m256 b, int index) { - return kActGradAvx[index](a, b); -} -} // namespace backward - -#endif - -} // namespace detail -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/detail/avx_functions.cc b/lite/backends/x86/math/detail/avx_functions.cc deleted file mode 100644 index 0b0c5b977b..0000000000 --- a/lite/backends/x86/math/detail/avx_functions.cc +++ /dev/null @@ -1,91 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef __AVX__ - -#include "lite/backends/x86/math/detail/activation_functions.h" -#include "lite/backends/x86/math/detail/avx_mathfun.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { -namespace detail { - -__m256 Exp(__m256 a) { return exp256_ps(a); } - -namespace forward { -namespace avx { -__m256 Relu(const __m256 a) { - __m256 tmp = _mm256_set1_ps(0.0f); - return _mm256_max_ps(a, tmp); -} - -__m256 Sigmoid(const __m256 a) { - __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); - __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); - __m256 tmp = _mm256_max_ps(a, min); - tmp = _mm256_min_ps(tmp, max); - tmp = _mm256_sub_ps(_mm256_set1_ps(0.0f), tmp); - tmp = Exp(tmp); - tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp); - tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp); - return tmp; -} - -__m256 Tanh(const __m256 a) { - __m256 max = _mm256_set1_ps(EXP_MAX_INPUT); - __m256 tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), a); - tmp = _mm256_min_ps(tmp, max); - tmp = Exp(tmp); - return _mm256_sub_ps(_mm256_div_ps(_mm256_set1_ps(2.0f), - _mm256_add_ps(_mm256_set1_ps(1.0f), tmp)), - _mm256_set1_ps(1.0f)); -} - -__m256 Identity(const __m256 a) { return a; } - -} // namespace avx -} // namespace forward - -namespace backward { -namespace avx { -__m256 Relu(const __m256 a, const __m256 b) { - return _mm256_mul_ps( - a, - _mm256_and_ps(_mm256_cmp_ps(b, _mm256_set1_ps(0.0f), _CMP_GT_OS), - _mm256_set1_ps(1.0f))); -} - -__m256 Sigmoid(const __m256 a, const __m256 b) { - return _mm256_mul_ps(_mm256_mul_ps(a, b), - _mm256_sub_ps(_mm256_set1_ps(1.0f), b)); -} - -__m256 Tanh(const __m256 a, const __m256 b) { - return _mm256_mul_ps( - a, _mm256_sub_ps(_mm256_set1_ps(1.0f), _mm256_mul_ps(b, b))); -} - -__m256 Identity(const __m256 a, const __m256 b) { return a; } -} // namespace avx -} // namespace backward - -} // namespace detail -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle - -#endif diff --git a/lite/backends/x86/math/detail/avx_mathfun.h b/lite/backends/x86/math/detail/avx_mathfun.h deleted file mode 100644 index c95c881512..0000000000 --- a/lite/backends/x86/math/detail/avx_mathfun.h +++ /dev/null @@ -1,731 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -/* - AVX implementation of sin, cos, sincos, exp and log - - Based on "sse_mathfun.h", by Julien Pommier - http://gruntthepeon.free.fr/ssemath/ - - Copyright (C) 2012 Giovanni Garberoglio - Interdisciplinary Laboratory for Computational Science (LISC) - Fondazione Bruno Kessler and University of Trento - via Sommarive, 18 - I-38123 Trento (Italy) - - This software is provided 'as-is', without any express or implied - warranty. In no event will the authors be held liable for any damages - arising from the use of this software. 
- - Permission is granted to anyone to use this software for any purpose, - including commercial applications, and to alter it and redistribute it - freely, subject to the following restrictions: - - 1. The origin of this software must not be misrepresented; you must not - claim that you wrote the original software. If you use this software - in a product, an acknowledgment in the product documentation would be - appreciated but is not required. - 2. Altered source versions must be plainly marked as such, and must not be - misrepresented as being the original software. - 3. This notice may not be removed or altered from any source distribution. - - (this is the zlib license) -*/ - -#include "lite/backends/x86/cpu_info.h" - -/* __m128 is ugly to write */ -typedef __m256 v8sf; // vector of 8 float (avx) -typedef __m256i v8si; // vector of 8 int (avx) -typedef __m128i v4si; // vector of 8 int (avx) - -#define _PI32AVX_CONST(Name, Val) \ - static const ALIGN32_BEG int _pi32avx_##Name[4] ALIGN32_END = { \ - Val, Val, Val, Val} - -_PI32AVX_CONST(1, 1); -_PI32AVX_CONST(inv1, ~1); -_PI32AVX_CONST(2, 2); -_PI32AVX_CONST(4, 4); - -/* declare some AVX constants -- why can't I figure a better way to do that? */ -#define _PS256_CONST(Name, Val) \ - static const ALIGN32_BEG float _ps256_##Name[8] ALIGN32_END = { \ - Val, Val, Val, Val, Val, Val, Val, Val} -#define _PI32_CONST256(Name, Val) \ - static const ALIGN32_BEG int _pi32_256_##Name[8] ALIGN32_END = { \ - Val, Val, Val, Val, Val, Val, Val, Val} -#define _PS256_CONST_TYPE(Name, Type, Val) \ - static const ALIGN32_BEG Type _ps256_##Name[8] ALIGN32_END = { \ - Val, Val, Val, Val, Val, Val, Val, Val} - -_PS256_CONST(1, 1.0f); -_PS256_CONST(0p5, 0.5f); -/* the smallest non denormalized float number */ -_PS256_CONST_TYPE(min_norm_pos, int, 0x00800000); -_PS256_CONST_TYPE(mant_mask, int, 0x7f800000); -_PS256_CONST_TYPE(inv_mant_mask, int, ~0x7f800000); - -_PS256_CONST_TYPE(sign_mask, int, (int)0x80000000); -_PS256_CONST_TYPE(inv_sign_mask, int, ~0x80000000); - -_PI32_CONST256(0, 0); -_PI32_CONST256(1, 1); -_PI32_CONST256(inv1, ~1); -_PI32_CONST256(2, 2); -_PI32_CONST256(4, 4); -_PI32_CONST256(0x7f, 0x7f); - -_PS256_CONST(cephes_SQRTHF, 0.707106781186547524); -_PS256_CONST(cephes_log_p0, 7.0376836292E-2); -_PS256_CONST(cephes_log_p1, -1.1514610310E-1); -_PS256_CONST(cephes_log_p2, 1.1676998740E-1); -_PS256_CONST(cephes_log_p3, -1.2420140846E-1); -_PS256_CONST(cephes_log_p4, +1.4249322787E-1); -_PS256_CONST(cephes_log_p5, -1.6668057665E-1); -_PS256_CONST(cephes_log_p6, +2.0000714765E-1); -_PS256_CONST(cephes_log_p7, -2.4999993993E-1); -_PS256_CONST(cephes_log_p8, +3.3333331174E-1); -_PS256_CONST(cephes_log_q1, -2.12194440e-4); -_PS256_CONST(cephes_log_q2, 0.693359375); - -#ifndef __AVX2__ - -typedef union imm_xmm_union { - v8si imm; - v4si xmm[2]; -} imm_xmm_union; - -#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_) \ - { \ - imm_xmm_union ALIGN32_BEG u ALIGN32_END; \ - u.imm = imm_; \ - xmm0_ = u.xmm[0]; \ - xmm1_ = u.xmm[1]; \ - } - -#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_) \ - { \ - imm_xmm_union ALIGN32_BEG u ALIGN32_END; \ - u.xmm[0] = xmm0_; \ - u.xmm[1] = xmm1_; \ - imm_ = u.imm; \ - } - -#define AVX2_BITOP_USING_SSE2(fn) \ - static inline v8si avx2_mm256_##fn(v8si x, int a) { \ - /* use SSE2 instruction to perform the bitop AVX2 */ \ - v4si x1, x2; \ - v8si ret; \ - COPY_IMM_TO_XMM(x, x1, x2); \ - x1 = _mm_##fn(x1, a); \ - x2 = _mm_##fn(x2, a); \ - COPY_XMM_TO_IMM(x1, x2, ret); \ - return (ret); \ - } - -//#warning "Using SSE2 to perform AVX2 
bitshift ops" -AVX2_BITOP_USING_SSE2(slli_epi32) -AVX2_BITOP_USING_SSE2(srli_epi32) - -#define AVX2_INTOP_USING_SSE2(fn) \ - static inline v8si avx2_mm256_##fn(v8si x, v8si y) { \ - /* use SSE2 instructions to perform the AVX2 integer operation */ \ - v4si x1, x2; \ - v4si y1, y2; \ - v8si ret; \ - COPY_IMM_TO_XMM(x, x1, x2); \ - COPY_IMM_TO_XMM(y, y1, y2); \ - x1 = _mm_##fn(x1, y1); \ - x2 = _mm_##fn(x2, y2); \ - COPY_XMM_TO_IMM(x1, x2, ret); \ - return (ret); \ - } - -//#warning "Using SSE2 to perform AVX2 integer ops" -AVX2_INTOP_USING_SSE2(and_si128) -AVX2_INTOP_USING_SSE2(andnot_si128) -AVX2_INTOP_USING_SSE2(cmpeq_epi32) -AVX2_INTOP_USING_SSE2(sub_epi32) -AVX2_INTOP_USING_SSE2(add_epi32) -#define avx2_mm256_and_si256 avx2_mm256_and_si128 -#define avx2_mm256_andnot_si256 avx2_mm256_andnot_si128 -#else -#define avx2_mm256_slli_epi32 _mm256_slli_epi32 -#define avx2_mm256_srli_epi32 _mm256_srli_epi32 -#define avx2_mm256_and_si256 _mm256_and_si256 -#define avx2_mm256_andnot_si256 _mm256_andnot_si256 -#define avx2_mm256_cmpeq_epi32 _mm256_cmpeq_epi32 -#define avx2_mm256_sub_epi32 _mm256_sub_epi32 -#define avx2_mm256_add_epi32 _mm256_add_epi32 -#endif /* __AVX2__ */ - -/* natural logarithm computed for 8 simultaneous float - return NaN for x <= 0 -*/ -v8sf log256_ps(v8sf x) { - v8si imm0; - v8sf one = *(v8sf *)_ps256_1; - - // v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps()); - v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS); - - x = _mm256_max_ps( - x, *(v8sf *)_ps256_min_norm_pos); /* cut off denormalized stuff */ - - // can be done with AVX2 - imm0 = avx2_mm256_srli_epi32(_mm256_castps_si256(x), 23); - - /* keep only the fractional part */ - x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_mant_mask); - x = _mm256_or_ps(x, *(v8sf *)_ps256_0p5); - - // this is again another AVX2 instruction - imm0 = avx2_mm256_sub_epi32(imm0, *(v8si *)_pi32_256_0x7f); - v8sf e = _mm256_cvtepi32_ps(imm0); - - e = _mm256_add_ps(e, one); - - /* part2: - if( x < SQRTHF ) { - e -= 1; - x = x + x - 1.0; - } else { x = x - 1.0; } - */ - // v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF); - v8sf mask = _mm256_cmp_ps(x, *(v8sf *)_ps256_cephes_SQRTHF, _CMP_LT_OS); - v8sf tmp = _mm256_and_ps(x, mask); - x = _mm256_sub_ps(x, one); - e = _mm256_sub_ps(e, _mm256_and_ps(one, mask)); - x = _mm256_add_ps(x, tmp); - - v8sf z = _mm256_mul_ps(x, x); - - v8sf y = *(v8sf *)_ps256_cephes_log_p0; - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p1); - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p2); - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p3); - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p4); - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p5); - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p6); - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p7); - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p8); - y = _mm256_mul_ps(y, x); - - y = _mm256_mul_ps(y, z); - - tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q1); - y = _mm256_add_ps(y, tmp); - - tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5); - y = _mm256_sub_ps(y, tmp); - - tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q2); - x = _mm256_add_ps(x, y); - x = _mm256_add_ps(x, tmp); - x = _mm256_or_ps(x, invalid_mask); // negative arg will be NAN - return x; -} - -_PS256_CONST(exp_hi, 88.3762626647949f); 
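
Note: exp256_ps below uses the classic Cephes range reduction: write
x = n*ln(2) + g with n rounded to the nearest integer, evaluate a degree-5
polynomial for exp(g), and reassemble via 2^n. A scalar sketch of the same
reduction, with std::exp standing in for the polynomial (helper name made up):

    #include <cmath>

    // exp(x) = 2^n * exp(g), n = round(x / ln 2), g = x - n * ln 2.
    // ln 2 is subtracted in two parts to keep g accurate, exactly what the
    // cephes_exp_C1 / cephes_exp_C2 constants below are for (C2 < 0).
    static float exp_reduced(float x) {
      const float log2e = 1.44269504088896341f;  // 1 / ln 2
      const float n = std::floor(x * log2e + 0.5f);
      const float g = (x - n * 0.693359375f) + n * 2.12194440e-4f;
      return std::ldexp(std::exp(g), static_cast<int>(n));  // exp(g) * 2^n
    }
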
-_PS256_CONST(exp_lo, -88.3762626647949f); - -_PS256_CONST(cephes_LOG2EF, 1.44269504088896341); -_PS256_CONST(cephes_exp_C1, 0.693359375); -_PS256_CONST(cephes_exp_C2, -2.12194440e-4); - -_PS256_CONST(cephes_exp_p0, 1.9875691500E-4); -_PS256_CONST(cephes_exp_p1, 1.3981999507E-3); -_PS256_CONST(cephes_exp_p2, 8.3334519073E-3); -_PS256_CONST(cephes_exp_p3, 4.1665795894E-2); -_PS256_CONST(cephes_exp_p4, 1.6666665459E-1); -_PS256_CONST(cephes_exp_p5, 5.0000001201E-1); - -v8sf exp256_ps(v8sf x) { - v8sf tmp = _mm256_setzero_ps(), fx; - v8si imm0; - v8sf one = *(v8sf *)_ps256_1; - - x = _mm256_min_ps(x, *(v8sf *)_ps256_exp_hi); - x = _mm256_max_ps(x, *(v8sf *)_ps256_exp_lo); - - /* express exp(x) as exp(g + n*log(2)) */ - fx = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_LOG2EF); - fx = _mm256_add_ps(fx, *(v8sf *)_ps256_0p5); - - /* how to perform a floorf with SSE: just below */ - // imm0 = _mm256_cvttps_epi32(fx); - // tmp = _mm256_cvtepi32_ps(imm0); - - tmp = _mm256_floor_ps(fx); - - /* if greater, substract 1 */ - // v8sf mask = _mm256_cmpgt_ps(tmp, fx); - v8sf mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS); - mask = _mm256_and_ps(mask, one); - fx = _mm256_sub_ps(tmp, mask); - - tmp = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C1); - v8sf z = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C2); - x = _mm256_sub_ps(x, tmp); - x = _mm256_sub_ps(x, z); - - z = _mm256_mul_ps(x, x); - - v8sf y = *(v8sf *)_ps256_cephes_exp_p0; - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p1); - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p2); - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p3); - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p4); - y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p5); - y = _mm256_mul_ps(y, z); - y = _mm256_add_ps(y, x); - y = _mm256_add_ps(y, one); - - /* build 2^n */ - imm0 = _mm256_cvttps_epi32(fx); - // another two AVX2 instructions - imm0 = avx2_mm256_add_epi32(imm0, *(v8si *)_pi32_256_0x7f); - imm0 = avx2_mm256_slli_epi32(imm0, 23); - v8sf pow2n = _mm256_castsi256_ps(imm0); - y = _mm256_mul_ps(y, pow2n); - return y; -} - -_PS256_CONST(minus_cephes_DP1, -0.78515625); -_PS256_CONST(minus_cephes_DP2, -2.4187564849853515625e-4); -_PS256_CONST(minus_cephes_DP3, -3.77489497744594108e-8); -_PS256_CONST(sincof_p0, -1.9515295891E-4); -_PS256_CONST(sincof_p1, 8.3321608736E-3); -_PS256_CONST(sincof_p2, -1.6666654611E-1); -_PS256_CONST(coscof_p0, 2.443315711809948E-005); -_PS256_CONST(coscof_p1, -1.388731625493765E-003); -_PS256_CONST(coscof_p2, 4.166664568298827E-002); -_PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI - -/* evaluation of 8 sines at onces using AVX intrisics - - The code is the exact rewriting of the cephes sinf function. - Precision is excellent as long as x < 8192 (I did not bother to - take into account the special handling they have for greater values - -- it does not return garbage for arguments over 8192, though, but - the extra precision is missing). - - Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the - surprising but correct result. 
- -*/ -v8sf sin256_ps(v8sf x) { // any x - v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, sign_bit, y; - v8si imm0, imm2; - -#ifndef __AVX2__ - v4si imm0_1, imm0_2; - v4si imm2_1, imm2_2; -#endif - - sign_bit = x; - /* take the absolute value */ - x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask); - /* extract the sign bit (upper one) */ - sign_bit = _mm256_and_ps(sign_bit, *(v8sf *)_ps256_sign_mask); - - /* scale by 4/Pi */ - y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI); - -/* - Here we start a series of integer operations, which are in the - realm of AVX2. - If we don't have AVX, let's perform them using SSE2 directives -*/ - -#ifdef __AVX2__ - /* store the integer part of y in mm0 */ - imm2 = _mm256_cvttps_epi32(y); - /* j=(j+1) & (~1) (see the cephes sources) */ - // another two AVX2 instruction - imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1); - imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1); - y = _mm256_cvtepi32_ps(imm2); - - /* get the swap sign flag */ - imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4); - imm0 = avx2_mm256_slli_epi32(imm0, 29); - /* get the polynom selection mask - there is one polynom for 0 <= x <= Pi/4 - and another one for Pi/4 -#include "lite/backends/x86/math/detail/activation_functions.h" -#include "lite/backends/x86/math/gru_compute.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { -namespace detail { - -#ifndef __NVCC__ - -template -void hl_naive_gru_forward_reset_output(OpResetOutput op_reset_output, - T *gate_value, - T *reset_output_value, - T *prev_output_value, - int frame_size, - ActivationType active_gate) { - T r_value_update_gate; - T r_value_reset_gate; - T r_value_reset_output; - T r_prev_out = 0; - T *update_gate = gate_value; - T *reset_gate = gate_value + frame_size; - - for (int i = 0; i < frame_size; i++) { - r_value_update_gate = update_gate[i]; - r_value_reset_gate = reset_gate[i]; - if (prev_output_value) { - r_prev_out = prev_output_value[i]; - } - - op_reset_output(&r_value_update_gate, - &r_value_reset_gate, - &r_prev_out, - &r_value_reset_output, - active_gate); - - update_gate[i] = r_value_update_gate; - reset_gate[i] = r_value_reset_gate; - reset_output_value[i] = r_value_reset_output; - } -} - -template -void hl_naive_gru_forward_final_output(OpFinalOutput op_final_output, - T *gate_value, - T *prev_output_value, - T *output_value, - int frame_size, - ActivationType active_node, - bool origin_mode) { - T r_value_update_gate; - T r_value_frame_state; - T r_prev_out = 0; - T r_output; - T *update_gate = gate_value; - T *frame_state = gate_value + frame_size * 2; - - for (int i = 0; i < frame_size; i++) { - r_value_update_gate = update_gate[i]; - r_value_frame_state = frame_state[i]; - if (prev_output_value) { - r_prev_out = prev_output_value[i]; - } - - op_final_output(&r_value_update_gate, - &r_value_frame_state, - &r_prev_out, - &r_output, - active_node, - origin_mode); - - frame_state[i] = r_value_frame_state; - output_value[i] = r_output; - } -} - -template -void hl_avx_gru_forward_reset_output(OpResetOutput op_reset_output, - T *gate_value, - T *reset_output_value, - T *prev_output_value, - int frame_size, - ActivationType active_gate) { -#ifdef __AVX__ - __m256 r_value_update_gate, r_value_update_gate_last = _mm256_set1_ps(0.0f); - __m256 r_value_reset_gate, r_value_reset_gate_last = _mm256_set1_ps(0.0f); - __m256 r_value_reset_output; - __m256 r_prev_out = _mm256_set1_ps(0.0f), - r_prev_out_last = _mm256_set1_ps(0.0f); - T *update_gate = gate_value; - T *reset_gate 
= gate_value + frame_size; - int block = 8; - const int n = frame_size; - const int rest = n % block; - const int end = n - rest; - int i = 0; - - if (rest > 0) { - i = n - block; - r_value_update_gate_last = - _mm256_loadu_ps((const float *)(update_gate + i)); - r_value_reset_gate_last = _mm256_loadu_ps((const float *)(reset_gate + i)); - if (prev_output_value) { - r_prev_out_last = _mm256_loadu_ps((const float *)(prev_output_value + i)); - } - } - - for (i = 0; i < end; i += block) { - r_value_update_gate = _mm256_loadu_ps((const float *)(update_gate + i)); - r_value_reset_gate = _mm256_loadu_ps((const float *)(reset_gate + i)); - if (prev_output_value) { - r_prev_out = _mm256_loadu_ps((const float *)(prev_output_value + i)); - } - - op_reset_output(&r_value_update_gate, - &r_value_reset_gate, - &r_prev_out, - &r_value_reset_output, - active_gate); - - _mm256_storeu_ps(reinterpret_cast(update_gate + i), - r_value_update_gate); - _mm256_storeu_ps(reinterpret_cast(reset_gate + i), - r_value_reset_gate); - _mm256_storeu_ps(reinterpret_cast(reset_output_value + i), - r_value_reset_output); - } - - if (rest > 0) { - i = n - block; - - op_reset_output(&r_value_update_gate_last, - &r_value_reset_gate_last, - &r_prev_out_last, - &r_value_reset_output, - active_gate); - - _mm256_storeu_ps(reinterpret_cast(update_gate + i), - r_value_update_gate_last); - _mm256_storeu_ps(reinterpret_cast(reset_gate + i), - r_value_reset_gate_last); - _mm256_storeu_ps(reinterpret_cast(reset_output_value + i), - r_value_reset_output); - } -#endif -} - -template -void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output, - T *gate_value, - T *prev_output_value, - T *output_value, - int frame_size, - ActivationType active_node, - bool origin_mode) { -#ifdef __AVX__ - __m256 r_value_update_gate, r_value_update_gate_last = _mm256_set1_ps(0.0f); - __m256 r_value_frame_state, r_value_frame_state_last = _mm256_set1_ps(0.0f); - __m256 r_prev_out = _mm256_set1_ps(0.0f), - r_prev_out_last = _mm256_set1_ps(0.0f); - __m256 r_output; - T *update_gate = gate_value; - T *frame_state = gate_value + frame_size * 2; - int block = 8; - const int n = frame_size; - const int rest = n % block; - const int end = n - rest; - int i = 0; - - if (rest > 0) { - i = n - block; - r_value_update_gate_last = - _mm256_loadu_ps((const float *)(update_gate + i)); - r_value_frame_state_last = - _mm256_loadu_ps((const float *)(frame_state + i)); - if (prev_output_value) { - r_prev_out_last = _mm256_loadu_ps((const float *)(prev_output_value + i)); - } - } - - for (i = 0; i < end; i += block) { - r_value_update_gate = _mm256_loadu_ps((const float *)(update_gate + i)); - r_value_frame_state = _mm256_loadu_ps((const float *)(frame_state + i)); - if (prev_output_value) { - r_prev_out = _mm256_loadu_ps((const float *)(prev_output_value + i)); - } - - op_final_output(&r_value_update_gate, - &r_value_frame_state, - &r_prev_out, - &r_output, - active_node, - origin_mode); - - _mm256_storeu_ps(reinterpret_cast(frame_state + i), - r_value_frame_state); - _mm256_storeu_ps(reinterpret_cast(output_value + i), r_output); - } - - if (rest > 0) { - i = n - block; - op_final_output(&r_value_update_gate_last, - &r_value_frame_state_last, - &r_prev_out_last, - &r_output, - active_node, - origin_mode); - - _mm256_storeu_ps(reinterpret_cast(frame_state + i), - r_value_frame_state_last); - _mm256_storeu_ps(reinterpret_cast(output_value + i), r_output); - } - -#endif -} - -template -inline void forward_reset_output(OpResetOutput op_reset_output, - GRUMetaValue 
value, - int frame_size, - int batch_size, - ActivationType active_gate) { - for (int b = 0; b < batch_size; b++) { - if (OpResetOutput::avx && (frame_size > static_cast(8 - 1)) && - (sizeof(T) == 4)) { - hl_avx_gru_forward_reset_output(op_reset_output, - value.gate_value, - value.reset_output_value, - value.prev_out_value, - frame_size, - active_gate); - } else { - hl_naive_gru_forward_reset_output(op_reset_output, - value.gate_value, - value.reset_output_value, - value.prev_out_value, - frame_size, - active_gate); - } - - value.gate_value += frame_size * 3; - value.reset_output_value += frame_size; - if (value.prev_out_value) { - value.prev_out_value += frame_size; - } - } -} - -template -inline void forward_final_output(OpFinalOutput op_final_output, - GRUMetaValue value, - int frame_size, - int batch_size, - ActivationType active_node, - bool origin_mode) { - for (int b = 0; b < batch_size; b++) { - if (OpFinalOutput::avx && (frame_size > static_cast(8 - 1)) && - (sizeof(T) == 4)) { - hl_avx_gru_forward_final_output(op_final_output, - value.gate_value, - value.prev_out_value, - value.output_value, - frame_size, - active_node, - origin_mode); - } else { - hl_naive_gru_forward_final_output(op_final_output, - value.gate_value, - value.prev_out_value, - value.output_value, - frame_size, - active_node, - origin_mode); - } - - value.gate_value += frame_size * 3; - value.output_value += frame_size; - if (value.prev_out_value) { - value.prev_out_value += frame_size; - } - } -} - -template -void hl_naive_gru_backward_state_grad(OpStateGrad op_state_grad, - T *gate_value, - T *gate_grad, - T *prev_out_value, - T *prev_out_grad, - T *output_grad, - int frame_size, - ActivationType active_node, - bool origin_mode) { - T r_update_gate_value; - T r_update_gate_grad; - T r_frame_state_value; - T r_frame_state_grad; - T r_out_grad; - T r_prev_out_value = 0; - T r_prev_out_grad = 0; - T *update_gate_value = gate_value; - T *update_gate_grad = gate_grad; - T *frame_state_value = gate_value + frame_size * 2; - T *frame_state_grad = gate_grad + frame_size * 2; - - for (int i = 0; i < frame_size; i++) { - r_update_gate_value = update_gate_value[i]; - r_frame_state_value = frame_state_value[i]; - r_out_grad = output_grad[i]; - if (prev_out_value) { - r_prev_out_value = prev_out_value[i]; - } - if (prev_out_grad) { - r_prev_out_grad = prev_out_grad[i]; - } - - op_state_grad(&r_update_gate_value, - &r_update_gate_grad, - &r_frame_state_value, - &r_frame_state_grad, - &r_prev_out_value, - &r_prev_out_grad, - &r_out_grad, - active_node, - origin_mode); - - update_gate_grad[i] = r_update_gate_grad; - frame_state_grad[i] = r_frame_state_grad; - if (prev_out_grad) { - prev_out_grad[i] = r_prev_out_grad; - } - } -} - -template -void hl_naive_gru_backward_reset_grad(OpResetGrad op_reset_grad, - T *gate_value, - T *gate_grad, - T *prev_out_value, - T *prev_out_grad, - T *reset_output_grad, - int frame_size, - ActivationType active_gate) { - T r_update_gate_value; - T r_update_gate_grad; - T r_reset_gate_value; - T r_reset_gate_grad; - T r_reset_output_grad = 0; - T r_prev_out_value = 0; - T r_prev_out_grad = 0; - T *update_gate_value = gate_value; - T *update_gate_grad = gate_grad; - T *reset_gate_value = gate_value + frame_size; - T *reset_gate_grad = gate_grad + frame_size; - - for (int i = 0; i < frame_size; i++) { - r_update_gate_value = update_gate_value[i]; - r_update_gate_grad = update_gate_grad[i]; - r_reset_gate_value = reset_gate_value[i]; - - if (prev_out_value && prev_out_grad) { - r_reset_output_grad = 
reset_output_grad[i]; - } - if (prev_out_value) { - r_prev_out_value = prev_out_value[i]; - } - if (prev_out_grad) { - r_prev_out_grad = prev_out_grad[i]; - } - - op_reset_grad(&r_update_gate_value, - &r_update_gate_grad, - &r_reset_gate_value, - &r_reset_gate_grad, - &r_prev_out_value, - &r_prev_out_grad, - &r_reset_output_grad, - active_gate); - - update_gate_grad[i] = r_update_gate_grad; - reset_gate_grad[i] = r_reset_gate_grad; - if (prev_out_grad) { - prev_out_grad[i] = r_prev_out_grad; - } - } -} - -template -void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, - T *gate_value, - T *gate_grad, - T *prev_out_value, - T *prev_out_grad, - T *output_grad, - int frame_size, - ActivationType active_node, - bool origin_mode) { -#ifdef __AVX__ - __m256 r_update_gate_value; - __m256 r_update_gate_grad; - __m256 r_frame_state_value; - __m256 r_frame_state_grad; - __m256 r_out_grad; - __m256 r_prev_out_value = _mm256_set1_ps(0.0f); - __m256 r_prev_out_grad = _mm256_set1_ps(0.0f); - __m256 *update_gate_value = reinterpret_cast<__m256 *>(gate_value); - __m256 *update_gate_grad = reinterpret_cast<__m256 *>(gate_grad); - __m256 *frame_state_value = - reinterpret_cast<__m256 *>(gate_value + frame_size * 2); - __m256 *frame_state_grad = - reinterpret_cast<__m256 *>(gate_grad + frame_size * 2); - - for (int i = 0; i < frame_size / 8; i++) { - r_update_gate_value = update_gate_value[i]; - r_frame_state_value = frame_state_value[i]; - r_out_grad = (reinterpret_cast<__m256 *>(output_grad))[i]; - if (prev_out_value) { - r_prev_out_value = (reinterpret_cast<__m256 *>(prev_out_value))[i]; - } - if (prev_out_grad) { - r_prev_out_grad = (reinterpret_cast<__m256 *>(prev_out_grad))[i]; - } - - op_state_grad(&r_update_gate_value, - &r_update_gate_grad, - &r_frame_state_value, - &r_frame_state_grad, - &r_prev_out_value, - &r_prev_out_grad, - &r_out_grad, - active_node, - origin_mode); - - update_gate_grad[i] = r_update_gate_grad; - frame_state_grad[i] = r_frame_state_grad; - if (prev_out_grad) { - (reinterpret_cast<__m256 *>(prev_out_grad))[i] = r_prev_out_grad; - } - } -#endif -} - -template -void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, - T *gate_value, - T *gate_grad, - T *prev_out_value, - T *prev_out_grad, - T *reset_output_grad, - int frame_size, - ActivationType active_gate) { -#ifdef __AVX__ - __m256 r_update_gate_value; - __m256 r_update_gate_grad; - __m256 r_reset_gate_value; - __m256 r_reset_gate_grad; - __m256 r_reset_output_grad = _mm256_set1_ps(0.0f); - __m256 r_prev_out_value = _mm256_set1_ps(0.0f); - __m256 r_prev_out_grad = _mm256_set1_ps(0.0f); - __m256 *update_gate_value = reinterpret_cast<__m256 *>(gate_value); - __m256 *update_gate_grad = reinterpret_cast<__m256 *>(gate_grad); - __m256 *reset_gate_value = - reinterpret_cast<__m256 *>(gate_value + frame_size); - __m256 *reset_gate_grad = reinterpret_cast<__m256 *>(gate_grad + frame_size); - - for (int i = 0; i < frame_size / 8; i++) { - r_update_gate_value = update_gate_value[i]; - r_update_gate_grad = update_gate_grad[i]; - r_reset_gate_value = reset_gate_value[i]; - - if (prev_out_value && prev_out_grad) { - r_reset_output_grad = (reinterpret_cast<__m256 *>(reset_output_grad))[i]; - } - if (prev_out_value) { - r_prev_out_value = (reinterpret_cast<__m256 *>(prev_out_value))[i]; - } - if (prev_out_grad) { - r_prev_out_grad = (reinterpret_cast<__m256 *>(prev_out_grad))[i]; - } - - op_reset_grad(&r_update_gate_value, - &r_update_gate_grad, - &r_reset_gate_value, - &r_reset_gate_grad, - &r_prev_out_value, - 
&r_prev_out_grad, - &r_reset_output_grad, - active_gate); - - update_gate_grad[i] = r_update_gate_grad; - reset_gate_grad[i] = r_reset_gate_grad; - if (prev_out_grad) { - (reinterpret_cast<__m256 *>(prev_out_grad))[i] = r_prev_out_grad; - } - } -#endif -} - -template -inline void backward_state_grad(OpStateGrad op_state_grad, - GRUMetaValue value, - GRUMetaGrad grad, - int frame_size, - int batch_size, - ActivationType active_node, - bool origin_mode) { - for (int b = 0; b < batch_size; b++) { - if (OpStateGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) { - hl_avx_gru_backward_state_grad(op_state_grad, - value.gate_value, - grad.gate_grad, - value.prev_out_value, - grad.prev_out_grad, - grad.output_grad, - frame_size, - active_node, - origin_mode); - } else { - hl_naive_gru_backward_state_grad(op_state_grad, - value.gate_value, - grad.gate_grad, - value.prev_out_value, - grad.prev_out_grad, - grad.output_grad, - frame_size, - active_node, - origin_mode); - } - - value.gate_value += frame_size * 3; - if (value.prev_out_value) { - value.prev_out_value += frame_size; - } - - grad.gate_grad += frame_size * 3; - grad.output_grad += frame_size; - if (grad.prev_out_grad) { - grad.prev_out_grad += frame_size; - } - } -} - -template -inline void backward_reset_grad(OpResetGrad op_reset_grad, - GRUMetaValue value, - GRUMetaGrad grad, - int frame_size, - int batch_size, - ActivationType active_gate) { - for (int b = 0; b < batch_size; b++) { - if (OpResetGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) { - hl_avx_gru_backward_reset_grad(op_reset_grad, - value.gate_value, - grad.gate_grad, - value.prev_out_value, - grad.prev_out_grad, - grad.reset_output_grad, - frame_size, - active_gate); - } else { - hl_naive_gru_backward_reset_grad(op_reset_grad, - value.gate_value, - grad.gate_grad, - value.prev_out_value, - grad.prev_out_grad, - grad.reset_output_grad, - frame_size, - active_gate); - } - - value.gate_value += frame_size * 3; - if (value.prev_out_value) { - value.prev_out_value += frame_size; - } - - grad.gate_grad += frame_size * 3; - grad.reset_output_grad += frame_size; - if (grad.prev_out_grad) { - grad.prev_out_grad += frame_size; - } - } -} - -#endif - -} // namespace detail -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/detail/gru_kernel.h b/lite/backends/x86/math/detail/gru_kernel.h deleted file mode 100644 index 91c753c685..0000000000 --- a/lite/backends/x86/math/detail/gru_kernel.h +++ /dev/null @@ -1,222 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include "lite/backends/x86/math/detail/activation_functions.h" -#include "lite/utils/macros.h" - -// TODO(guosheng): refine code style in gru_kernel -namespace paddle { -namespace lite { -namespace x86 { -namespace math { -namespace detail { - -namespace forward { - -template -class gru_resetOutput { - public: - HOSTDEVICE void operator()(T *value_update_gate, - T *value_reset_gate, - T *prev_out, - T *value_reset_output, - ActivationType act_gate) { - *value_update_gate = activation(*value_update_gate, act_gate); - *value_reset_gate = activation(*value_reset_gate, act_gate); - *value_reset_output = (*prev_out) * (*value_reset_gate); - } -#ifndef __AVX__ - static const bool avx = false; -#else - static const bool avx = true; - HOSTDEVICE void operator()(__m256 *value_update_gate, - __m256 *value_reset_gate, - __m256 *prev_out, - __m256 *value_reset_output, - ActivationType act_gate) { - *value_update_gate = activation(*value_update_gate, act_gate); - *value_reset_gate = activation(*value_reset_gate, act_gate); - *value_reset_output = _mm256_mul_ps(*prev_out, *value_reset_gate); - } -#endif -}; - -template -class gru_finalOutput { - public: - HOSTDEVICE void operator()(T *value_update_gate, - T *value_frame_state, - T *prev_out, - T *value_output, - ActivationType act_input, - bool origin_mode) { - *value_frame_state = activation(*value_frame_state, act_input); - if (origin_mode) { - *value_output = ((*value_update_gate) * (*prev_out)) + - *value_frame_state - - ((*value_update_gate) * (*value_frame_state)); - } else { - *value_output = *prev_out - ((*value_update_gate) * (*prev_out)) + - ((*value_update_gate) * (*value_frame_state)); - } - } -#ifndef __AVX__ - static const bool avx = false; -#else - static const bool avx = true; - HOSTDEVICE void operator()(__m256 *value_update_gate, - __m256 *value_frame_state, - __m256 *prev_out, - __m256 *value_output, - ActivationType act_input, - bool origin_mode) { - *value_frame_state = activation(*value_frame_state, act_input); - if (origin_mode) { - *value_output = _mm256_sub_ps( - _mm256_add_ps(_mm256_mul_ps(*value_update_gate, *prev_out), - *value_frame_state), - _mm256_mul_ps(*value_update_gate, *value_frame_state)); - } else { - *value_output = _mm256_add_ps( - _mm256_sub_ps(*prev_out, - _mm256_mul_ps(*value_update_gate, *prev_out)), - _mm256_mul_ps(*value_update_gate, *value_frame_state)); - } - } -#endif -}; -} // namespace forward - -namespace backward { - -template -class gru_stateGrad { - public: - HOSTDEVICE void operator()(T *value_update_gate, - T *grad_update_gate, - T *value_frame_state, - T *grad_frame_state, - T *value_prev_out, - T *grad_prev_out, - T *grad_output, - ActivationType act_input, - bool origin_mode) { - if (origin_mode) { - *grad_update_gate = - (*grad_output) * ((*value_prev_out) - (*value_frame_state)); - *grad_prev_out += (*grad_output * (*value_update_gate)); - *grad_frame_state = activation( - *grad_output * (static_cast(1.0) - (*value_update_gate)), - *value_frame_state, - act_input); - } else { - *grad_update_gate = - (*grad_output) * ((*value_frame_state) - (*value_prev_out)); - *grad_prev_out += - (*grad_output * (static_cast(1.0) - *value_update_gate)); - *grad_frame_state = activation( - *grad_output * (*value_update_gate), *value_frame_state, act_input); - } - } -#ifndef __AVX__ - static const bool avx = false; -#else - static const bool avx = true; - HOSTDEVICE void operator()(__m256 *value_update_gate, - __m256 *grad_update_gate, - __m256 *value_frame_state, - __m256 
*grad_frame_state, - __m256 *value_prev_out, - __m256 *grad_prev_out, - __m256 *grad_output, - ActivationType act_input, - bool origin_mode) { - if (origin_mode) { - *grad_update_gate = _mm256_mul_ps( - *grad_output, _mm256_sub_ps(*value_prev_out, *value_frame_state)); - *grad_prev_out = _mm256_add_ps( - *grad_prev_out, _mm256_mul_ps(*grad_output, *value_update_gate)); - *grad_frame_state = activation( - _mm256_mul_ps( - *grad_output, - _mm256_sub_ps(_mm256_set1_ps(1.0f), *value_update_gate)), - *value_frame_state, - act_input); - } else { - *grad_update_gate = _mm256_mul_ps( - *grad_output, _mm256_sub_ps(*value_frame_state, *value_prev_out)); - *grad_prev_out = _mm256_add_ps( - *grad_prev_out, - _mm256_mul_ps( - *grad_output, - _mm256_sub_ps(_mm256_set1_ps(1.0f), *value_update_gate))); - *grad_frame_state = - activation(_mm256_mul_ps(*grad_output, *value_update_gate), - *value_frame_state, - act_input); - } - } -#endif -}; - -template -class gru_resetGrad { - public: - HOSTDEVICE void operator()(T *value_update_gate, - T *grad_update_gate, - T *value_reset_gate, - T *grad_reset_gate, - T *value_prev_out, - T *grad_prev_out, - T *grad_reset_output, - ActivationType act_gate) { - *grad_reset_gate = (*grad_reset_output * (*value_prev_out)); - *grad_prev_out += (*grad_reset_output * (*value_reset_gate)); - *grad_update_gate = - activation(*grad_update_gate, *value_update_gate, act_gate); - *grad_reset_gate = - activation(*grad_reset_gate, *value_reset_gate, act_gate); - } -#ifndef __AVX__ - static const bool avx = false; -#else - static const bool avx = true; - HOSTDEVICE void operator()(__m256 *value_update_gate, - __m256 *grad_update_gate, - __m256 *value_reset_gate, - __m256 *grad_reset_gate, - __m256 *value_prev_out, - __m256 *grad_prev_out, - __m256 *grad_reset_output, - ActivationType act_gate) { - *grad_reset_gate = _mm256_mul_ps(*grad_reset_output, *value_prev_out); - *grad_prev_out = _mm256_add_ps( - *grad_prev_out, _mm256_mul_ps(*grad_reset_output, *value_reset_gate)); - *grad_update_gate = - activation(*grad_update_gate, *value_update_gate, act_gate); - *grad_reset_gate = - activation(*grad_reset_gate, *value_reset_gate, act_gate); - } -#endif -}; - -} // namespace backward - -} // namespace detail -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/gru_compute.cc b/lite/backends/x86/math/gru_compute.cc deleted file mode 100644 index b1fdfe18a5..0000000000 --- a/lite/backends/x86/math/gru_compute.cc +++ /dev/null @@ -1,181 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "lite/backends/x86/math/gru_compute.h" -#include "lite/backends/x86/math/blas.h" -#include "lite/backends/x86/math/detail/gru_cpu_kernel.h" -#include "lite/backends/x86/math/detail/gru_kernel.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -template -struct GRUUnitFunctor { - static void compute(const lite::X86Context &context, - GRUMetaValue value, - int frame_size, - int batch_size, - const detail::ActivationType active_node, - const detail::ActivationType active_gate, - bool origin_mode) { -#ifndef __NVCC__ - auto blas = math::GetBlas(context); - if (value.prev_out_value) { - blas.GEMM(false, - false, - batch_size, - frame_size * 2, - frame_size, - 1, - value.prev_out_value, - frame_size, - value.gate_weight, - frame_size * 2, - 1, - value.gate_value, - frame_size * 3); - } - - detail::forward_reset_output(detail::forward::gru_resetOutput(), - value, - frame_size, - batch_size, - active_gate); - - if (value.prev_out_value) { - blas.GEMM(false, - false, - batch_size, - frame_size, - frame_size, - 1, - value.reset_output_value, - frame_size, - value.state_weight, - frame_size, - 1, - value.gate_value + frame_size * 2, - frame_size * 3); - } - - detail::forward_final_output(detail::forward::gru_finalOutput(), - value, - frame_size, - batch_size, - active_node, - origin_mode); -#endif - } -}; - -template -struct GRUUnitGradFunctor { - static void compute(const lite::X86Context &context, - GRUMetaValue value, - GRUMetaGrad grad, - int frame_size, - int batch_size, - const detail::ActivationType active_node, - const detail::ActivationType active_gate, - bool origin_mode) { -#ifndef __NVCC__ - detail::backward_state_grad(detail::backward::gru_stateGrad(), - value, - grad, - frame_size, - batch_size, - active_node, - origin_mode); - auto blas = math::GetBlas(context); - if (value.prev_out_value && grad.prev_out_grad) { - blas.GEMM(false, - true, - batch_size, - frame_size, - frame_size, - 1, - grad.gate_grad + frame_size * 2, - frame_size * 3, - value.state_weight, - frame_size, - 0, - grad.reset_output_grad, - frame_size); - - if (grad.state_weight_grad) { - blas.GEMM(true, - false, - frame_size, - frame_size, - batch_size, - 1, - value.reset_output_value, - frame_size, - grad.gate_grad + frame_size * 2, - frame_size * 3, - 1, - grad.state_weight_grad, - frame_size); - } - } - - detail::backward_reset_grad(detail::backward::gru_resetGrad(), - value, - grad, - frame_size, - batch_size, - active_gate); - if (grad.prev_out_grad && value.prev_out_value) { - blas.GEMM(false, - true, - batch_size, - frame_size, - frame_size * 2, - 1, - grad.gate_grad, - frame_size * 3, - value.gate_weight, - frame_size * 2, - 1, - grad.prev_out_grad, - frame_size); - - if (grad.gate_weight_grad) { - blas.GEMM(true, - false, - frame_size, - frame_size * 2, - batch_size, - 1, - value.prev_out_value, - frame_size, - grad.gate_grad, - frame_size * 3, - 1, - grad.gate_weight_grad, - frame_size * 2); - } - } -#endif - } -}; - -template struct GRUUnitFunctor; -template struct GRUUnitFunctor; -template struct GRUUnitGradFunctor; -template struct GRUUnitGradFunctor; - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/gru_compute.h b/lite/backends/x86/math/gru_compute.h deleted file mode 100644 index 86b7a91f41..0000000000 --- a/lite/backends/x86/math/gru_compute.h +++ /dev/null @@ -1,69 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
-Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "lite/backends/x86/math/detail/activation_functions.h" -#include "lite/core/context.h" -#include "lite/utils/paddle_enforce.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -template -struct GRUMetaValue { - T *gate_weight; - T *state_weight; - T *gate_value; - T *reset_output_value; - T *output_value; - T *prev_out_value; -}; - -template -struct GRUMetaGrad { - T *gate_weight_grad; - T *state_weight_grad; - T *gate_grad; - T *reset_output_grad; - T *output_grad; - T *prev_out_grad; -}; - -template -struct GRUUnitFunctor { - static void compute(const lite::Context &context, - GRUMetaValue value, - int frame_size, - int batch_size, - const detail::ActivationType active_node, - const detail::ActivationType active_gate, - bool origin_mode); -}; - -template -struct GRUUnitGradFunctor { - static void compute(const lite::Context &context, - GRUMetaValue value, - GRUMetaGrad grad, - int frame_size, - int batch_size, - const detail::ActivationType active_node, - const detail::ActivationType active_gate, - bool origin_mode); -}; - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/im2col.cc b/lite/backends/x86/math/im2col.cc deleted file mode 100644 index 1c4c6a49f5..0000000000 --- a/lite/backends/x86/math/im2col.cc +++ /dev/null @@ -1,292 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "lite/backends/x86/math/im2col.h" -#include -#include "lite/backends/x86/math/im2col_cfo_cpu.h" -#include "lite/utils/paddle_enforce.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -/* - * im = [input_channels, input_height, input_width] - * col = - * [input_channels, filter_height, filter_width, output_height, output_width] - */ -template -class Im2ColFunctor { - public: - void operator()(const lite::X86Context& context, - const lite::Tensor& im, - const std::vector& dilation, - const std::vector& stride, - const std::vector& padding, - lite::Tensor* col) { - PADDLE_ENFORCE(im.dims().size() == 3); - PADDLE_ENFORCE(col->dims().size() == 5); - - if (stride[0] == 1 && stride[1] == 1 && dilation[0] == 1 && - dilation[1] == 1) { - if (padding[0] == 0 && padding[1] == 0) { - im2col_sh1sw1dh1dw1ph0pw0(im, col); - return; - } else if (padding[0] == 1 && padding[1] == 1) { - im2col_sh1sw1dh1dw1ph1pw1(im, col); - return; - } - // TODO(TJ): complete padding >=2 - } - im2col_common(im, dilation, stride, padding, col); - } -}; - -/* - * im = [input_channels, input_height, input_width] - * col = - * [input_channels, filter_height, filter_width, output_height, output_width] - */ -template -class Col2ImFunctor { - public: - void operator()(const lite::X86Context& context, - const lite::Tensor& col, - const std::vector& dilation, - const std::vector& stride, - const std::vector& padding, - lite::Tensor* im) { - PADDLE_ENFORCE(im->dims().size() == 3); - PADDLE_ENFORCE(col.dims().size() == 5); - int im_channels = im->dims()[0]; - int im_height = im->dims()[1]; - int im_width = im->dims()[2]; - int filter_height = col.dims()[1]; - int filter_width = col.dims()[2]; - int col_height = col.dims()[3]; - int col_width = col.dims()[4]; - - PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] - - ((dilation[0] * (filter_height - 1) + 1))) / - stride[0] + - 1, - col_height, - "Output_height and padding(padding_up, padding_down) are " - "inconsistent."); - PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] - - ((dilation[1] * (filter_width - 1) + 1))) / - stride[1] + - 1, - col_width, - "Output_height and padding(padding_up, padding_down) are " - "inconsistent."); - - int channels_col = im_channels * filter_height * filter_width; - - T* im_data = im->mutable_data(); - const T* col_data = col.data(); - - for (int c = 0; c < channels_col; ++c) { - int w_offset = c % filter_width; - int h_offset = (c / filter_width) % filter_height; - int c_im = c / (filter_width * filter_height); - for (int h = 0; h < col_height; ++h) { - int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; - for (int w = 0; w < col_width; ++w) { - int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1]; - if ((im_row_idx) >= 0 && (im_row_idx) < im_height && - (im_col_idx) >= 0 && (im_col_idx) < im_width) { - im_data[(im_row_idx + c_im * im_height) * im_width + im_col_idx] += - col_data[(c * col_height + h) * col_width + w]; - } - } - } - } - } -}; - -template class Im2ColFunctor; -template class Im2ColFunctor; -template class Col2ImFunctor; -template class Col2ImFunctor; - -/* - * im = [input_channels, input_height, input_width] - * col = - * [output_height, output_width, input_channels, filter_height, filter_width] - */ -template -class Im2ColFunctor { - public: - void operator()(const lite::X86Context& context, - const lite::Tensor& im, - const std::vector& dilation, - const std::vector& stride, - const std::vector& padding, - lite::Tensor* col) { - 
PADDLE_ENFORCE(im.dims().size() == 3); - PADDLE_ENFORCE(col->dims().size() == 5); - int im_channels = im.dims()[0]; - int im_height = im.dims()[1]; - int im_width = im.dims()[2]; - int filter_height = col->dims()[3]; - int filter_width = col->dims()[4]; - int col_height = col->dims()[0]; - int col_width = col->dims()[1]; - - const T* im_data = im.data(); - T* col_data = col->mutable_data(); - - for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) { - for (int col_col_idx = 0; col_col_idx < col_width; ++col_col_idx) { - for (int channel = 0; channel < im_channels; ++channel) { - for (int filter_row_idx = 0; filter_row_idx < filter_height; - ++filter_row_idx) { - int im_row_offset = - col_row_idx * stride[0] + filter_row_idx - padding[0]; - for (int filter_col_idx = 0; filter_col_idx < filter_width; - ++filter_col_idx) { - int im_col_offset = - col_col_idx * stride[1] + filter_col_idx - padding[1]; - - int col_offset = - ((((col_row_idx)*col_width + col_col_idx) * im_channels + - channel) * - filter_height + - filter_row_idx) * - filter_width + - filter_col_idx; - - int im_offset = (channel * im_height + im_row_offset) * im_width + - im_col_offset; - col_data[col_offset] = - (im_row_offset < 0 || im_row_offset >= im_height || - im_col_offset < 0 || im_col_offset >= im_width) - ? static_cast(0) - : im_data[im_offset]; - } - } - } - } - } - } -}; - -/* - * im = [input_channels, input_height, input_width] - * col = - * [output_height, output_width, input_channels, filter_height, filter_width] - */ -template -class Col2ImFunctor { - public: - void operator()(const lite::X86Context& context, - const lite::Tensor& col, - const std::vector& dilation, - const std::vector& stride, - const std::vector& padding, - lite::Tensor* im) { - PADDLE_ENFORCE(im->dims().size() == 3); - PADDLE_ENFORCE(col.dims().size() == 5); - int im_channels = im->dims()[0]; - int im_height = im->dims()[1]; - int im_width = im->dims()[2]; - int filter_height = col.dims()[3]; - int filter_width = col.dims()[4]; - int col_height = col.dims()[0]; - int col_width = col.dims()[1]; - - PADDLE_ENFORCE_EQ( - (im_height + padding[0] + padding[2] - filter_height) / stride[0] + 1, - col_height, - "Output_height and padding(padding_up, padding_down) are " - "inconsistent."); - PADDLE_ENFORCE_EQ( - (im_width + padding[1] + padding[3] - filter_width) / stride[1] + 1, - col_width, - "col_width and padding(padding_left, padding_right) are " - "inconsistent."); - - T* im_data = im->mutable_data(); - const T* col_data = col.data(); - - for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) { - for (int col_col_idx = 0; col_col_idx < col_width; ++col_col_idx) { - for (int channel = 0; channel < im_channels; ++channel) { - for (int filter_row_idx = 0; filter_row_idx < filter_height; - ++filter_row_idx) { - int im_row_offset = - col_row_idx * stride[0] + filter_row_idx - padding[0]; - for (int filter_col_idx = 0; filter_col_idx < filter_width; - ++filter_col_idx) { - int im_col_offset = - col_col_idx * stride[1] + filter_col_idx - padding[1]; - - int col_offset = - (((col_row_idx * col_width + col_col_idx) * im_channels + - channel) * - filter_height + - filter_row_idx) * - filter_width + - filter_col_idx; - - if (im_row_offset >= 0 && im_row_offset < im_height && - im_col_offset >= 0 && im_col_offset < im_width) { - int im_offset = - (channel * im_height + im_row_offset) * im_width + - im_col_offset; - im_data[im_offset] += col_data[col_offset]; - } - } - } - } - } - } - } -}; - -template class Im2ColFunctor; -template 
class Im2ColFunctor<ColFormat::kOCF, lite::TargetType::kX86, double>;
-template class Col2ImFunctor<ColFormat::kOCF, lite::TargetType::kX86, float>;
-template class Col2ImFunctor<ColFormat::kOCF, lite::TargetType::kX86, double>;
-
-}  // namespace math
-}  // namespace x86
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/x86/math/im2col.h b/lite/backends/x86/math/im2col.h
deleted file mode 100644
index 8fb89ccb5f..0000000000
--- a/lite/backends/x86/math/im2col.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "lite/core/context.h"
-#include "lite/core/tensor.h"
-
-namespace paddle {
-namespace lite {
-namespace x86 {
-namespace math {
-
-/* The storage format of the coldata in the Im2ColFunctor and Col2ImFunctor. */
-enum class ColFormat { kCFO = 0, kOCF = 1 };
-
-/*
- * \brief Converts the image data of three dimensions (CHW) into a colData of
- *        five dimensions in the Im2ColFunctor calculation, and in the
- *        Col2ImFunctor calculation it is reversed.
- *
- * \param imData    Image data.
- * \param imShape   The shape of imData,
- *                  [input_channels, input_height, input_width].
- * \param colData   Column data.
- * \param colShape  The shape of colData.
- *
- * \param dilations dilation data,
- *                  2-dimension [dilation_height, dilation_width].
- *
- * \param strides   stride data,
- *                  2-dimension [stride_height, stride_width].
- *
- * \param paddings  padding data,
- *                  4-dimension [up_pad, left_pad, down_pad, right_pad].
- *
- * If the template argument Format is kCFO, the shape of colData is:
- * [input_channels, filter_height, filter_width, output_height, output_width],
- * so it is easy to reshape into a convolution matrix for convolution
- * calculation based on matrix multiplication.
- * The shape of the convolution matrix is [height, width], where the height
- * equals input_channels * filter_height * filter_width and the width equals
- * output_height * output_width.
- *
- * Reshape:
- *     shape of colData           shape of convolution matrix
- *     [input_channels,
- *      filter_height,
- *      filter_width,      ======>      [height, width]
- *      output_height,
- *      output_width]
- *
- * If the template argument Format is kOCF, the shape of colData is:
- * [output_height, output_width, input_channels, filter_height, filter_width],
- * so it is easy to reshape into a sequence matrix for RNN calculation.
- * The shape of the sequence matrix is [seq_length, step_size], where the
- * seq_length equals output_height * output_width and the step_size equals
- * input_channels * filter_height * filter_width.
- *
- * Reshape:
- *     shape of colData             shape of sequence matrix
- *     [output_height,
- *      output_width,
- *      input_channels,    ======>      [seqLength, stepSize]
- *      filter_height,
- *      filter_width]
- *
- * \note The caller needs to ensure that imShape.inputChannels is equal to
- *       colShape.inputChannels.
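- *
- * A worked example (sizes chosen here only for illustration): with
- * imShape = [2, 4, 4], a 3 x 3 filter, stride 1 and zero padding, the output
- * is 2 x 2, so the kCFO colShape is [2, 3, 3, 2, 2] and the convolution
- * matrix is 18 x 4 (height = 2 * 3 * 3 = 18, width = 2 * 2 = 4); under kOCF
- * the same data reshapes into a 4 x 18 sequence matrix instead.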
- */ -template -class Im2ColFunctor { - public: - void operator()(const lite::Context& context, - const lite::Tensor& im, - const std::vector& dilation, - const std::vector& stride, - const std::vector& padding, - lite::Tensor* col); -}; - -template -class Col2ImFunctor { - public: - void operator()(const lite::Context& context, - const lite::Tensor& col, - const std::vector& dilation, - const std::vector& stride, - const std::vector& padding, - lite::Tensor* im); -}; - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/im2col_cfo_cpu.h b/lite/backends/x86/math/im2col_cfo_cpu.h deleted file mode 100644 index 4623f045bb..0000000000 --- a/lite/backends/x86/math/im2col_cfo_cpu.h +++ /dev/null @@ -1,256 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "lite/core/tensor.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -/** - * The most common im2col algorithm. - * Support dilation, stride and padding. - */ -template -inline void im2col_common(const lite::Tensor& im, - const std::vector& dilation, - const std::vector& stride, - const std::vector& padding, - lite::Tensor* col) { - int im_channels = im.dims()[0]; - int im_height = im.dims()[1]; - int im_width = im.dims()[2]; - int filter_height = col->dims()[1]; - int filter_width = col->dims()[2]; - int output_height = col->dims()[3]; - int output_width = col->dims()[4]; - int channels_col = im_channels * filter_height * filter_width; - - const T* im_data = im.data(); - T* col_data = col->mutable_data(); - for (int c = 0; c < channels_col; ++c) { - int w_offset = c % filter_width; - int h_offset = (c / filter_width) % filter_height; - int c_im = c / (filter_width * filter_height); - for (int h = 0; h < output_height; ++h) { - int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; - for (int w = 0; w < output_width; ++w) { - int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1]; - int col_idx = (c * output_height + h) * output_width + w; - int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx; - col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height || - im_col_idx < 0 || im_col_idx >= im_width) - ? 
static_cast(0) - : im_data[im_idx]; - } - } - } -} - -/** - * im2col algorithm with strides == 1, dilations == 1, paddings == 0 - */ -template -inline void im2col_sh1sw1dh1dw1ph0pw0(const lite::Tensor& im, - lite::Tensor* col) { - int im_channels = im.dims()[0]; - int im_height = im.dims()[1]; - int im_width = im.dims()[2]; - int filter_height = col->dims()[1]; - int filter_width = col->dims()[2]; - int output_height = col->dims()[3]; - int output_width = col->dims()[4]; - - const T* im_data = im.data(); - T* col_data = col->mutable_data(); - int col_matrix_width = output_width * output_height; - int im_size = im_height * im_width; - size_t copy_size = sizeof(T) * output_width; - const T* im_data_oh = im_data; - T* dst_data_oh = col_data; - for (int oh = 0; oh < output_height; ++oh) { - const T* src_data_ic = im_data_oh; - T* dst_data = dst_data_oh; - for (int ic = 0; ic < im_channels; ++ic) { - const T* src_data = src_data_ic; - for (int kh = 0; kh < filter_height; ++kh) { - for (int kw = 0; kw < filter_width; ++kw) { - std::memcpy(dst_data, src_data + kw, copy_size); - dst_data = dst_data + col_matrix_width; - } - src_data = src_data + im_width; - } - src_data_ic = src_data_ic + im_size; - } - im_data_oh = im_data_oh + im_width; - dst_data_oh = dst_data_oh + output_width; - } -} - -/** - * im2col algorithm with strides == 1, dilations == 1, paddings == 1 - * and filter_width == 1 have a special implementation - */ -template -inline void im2col_sh1sw1dh1dw1ph1pw1(const lite::Tensor& im, - lite::Tensor* col) { - int im_channels = im.dims()[0]; - int im_height = im.dims()[1]; - int im_width = im.dims()[2]; - int filter_height = col->dims()[1]; - int filter_width = col->dims()[2]; - int output_height = col->dims()[3]; - int output_width = col->dims()[4]; - - constexpr int plh = 1; - constexpr int prh = 1; - constexpr int plw = 1; - constexpr int prw = 1; - - const T* im_data = im.data(); - T* col_data = col->mutable_data(); - int im_size = im_height * im_width; - int col_matrix_width = output_width * output_height; - int col_block_fh = filter_width * col_matrix_width; // fw*oh*ow - int col_block_ic = filter_height * col_block_fh; // fh*fw*oh*ow - - // fill height padding - { - size_t copy_size = sizeof(T) * output_width; - T* col_start_l = col_data; - T* col_start_r = col_data + (filter_height - 1) * col_block_fh + - col_matrix_width - output_width; - for (int ic = 0; ic < im_channels; ++ic) { - T* dst_data_l = col_start_l; - T* dst_data_r = col_start_r; - for (int kw = 0; kw < filter_width; ++kw) { - std::memset(dst_data_l, 0, copy_size); - std::memset(dst_data_r, 0, copy_size); - dst_data_l = dst_data_l + col_matrix_width; - dst_data_r = dst_data_r + col_matrix_width; - } - col_start_l = col_start_l + col_block_ic; - col_start_r = col_start_r + col_block_ic; - } - } - - auto pad = static_cast(0); - if (filter_width == 1) { - // fill width padding - T* dst_data_ic = col_data; - for (int ic = 0; ic < im_channels; ++ic) { - T* dst_data_kh = dst_data_ic; - for (int kh = 0; kh < filter_height; ++kh) { - T* dst_data = dst_data_kh; - for (int oh = 0; oh < output_height; ++oh) { - *dst_data = pad; - dst_data = dst_data + output_width - 1; - *dst_data = pad; - ++dst_data; - } - dst_data_kh = dst_data_kh + col_block_fh; - } - dst_data_ic = dst_data_ic + col_block_ic; - } - // fill core - size_t copy_size = sizeof(T) * (output_width - plw - prw); - for (int oh = 0; oh < output_height; ++oh) { - const T* im_data_start = - im_data + (oh - plh > 0 ? 
oh - plh : 0) * im_width; - T* dst_data = col_data + oh * output_width; - for (int ic = 0; ic < im_channels; ++ic) { - const T* src_data = im_data_start + ic * im_size; - for (int kh = 0; kh < filter_height; ++kh) { - if ((oh < plh && kh < plh) || (oh > (output_height - prh - 1) && - kh > (filter_height - prh - 1))) { - dst_data = dst_data + col_matrix_width; - continue; - } - std::memcpy(dst_data + plw, src_data, copy_size); - dst_data = dst_data + col_matrix_width; - src_data = src_data + im_width; - } - } - } - return; - } - - // filter_width != 1 - // fill width padding - T* dst_data_ic = col_data; - for (int ic = 0; ic < im_channels; ++ic) { - T* dst_data_kh = dst_data_ic; - for (int kh = 0; kh < filter_height; ++kh) { - for (T* dst_data : - {dst_data_kh, - dst_data_kh + (filter_width - prw) * col_matrix_width + - output_width - 1}) { - // TODO(TJ): from plh, saving repeated assignment - for (int oh = 0; oh < output_height; ++oh) { - *dst_data = pad; - dst_data = dst_data + output_width; - } - } - dst_data_kh = dst_data_kh + col_block_fh; - } - dst_data_ic = dst_data_ic + col_block_ic; - } - - // TODO(TJ): use array like: size_t copy_size[kw]={sizeof(T) * - // (output_width-1)} - // length of copy_size is equal kw. - for (int oh = 0; oh < output_height; ++oh) { - const T* im_data_start = im_data + (oh - plh > 0 ? oh - plh : 0) * im_width; - T* dst_data = col_data + oh * output_width; - for (int ic = 0; ic < im_channels; ++ic) { - const T* src_data = im_data_start + ic * im_size; - for (int kh = 0; kh < filter_height; ++kh) { - if ((oh < plh && kh < plh) || (oh > (output_height - prh - 1) && - kh > (filter_height - prh - 1))) { - dst_data = dst_data + filter_width * col_matrix_width; - continue; - } - // TODO(TJ): reuse plw-kw outside this for - // try to unify - for (int kw = 0; kw < plw; ++kw) { - std::memcpy(dst_data + (plw - kw), - src_data, - sizeof(T) * (output_width - (plw - kw))); - dst_data = dst_data + col_matrix_width; - } - for (int kw = plw; kw < filter_width - prw; ++kw) { - std::memcpy( - dst_data, src_data + (kw - plw), sizeof(T) * output_width); - dst_data = dst_data + col_matrix_width; - } - int i = 1; - for (int kw = filter_width - prw; kw < filter_width; ++kw, ++i) { - std::memcpy( - dst_data, src_data + (kw - plw), sizeof(T) * (output_width - i)); - dst_data = dst_data + col_matrix_width; - } - src_data = src_data + im_width; - } - } - } -} - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/im2col_test.cc b/lite/backends/x86/math/im2col_test.cc deleted file mode 100644 index 3881d5ff33..0000000000 --- a/lite/backends/x86/math/im2col_test.cc +++ /dev/null @@ -1,331 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/math/im2col.h" -#include -#include -#include "paddle/fluid/operators/math/im2col_cfo_cpu.h" -#include "paddle/fluid/platform/port.h" - -template -void testIm2col() { - paddle::framework::Tensor input_tmp; - paddle::framework::Tensor input; - paddle::framework::Tensor output_cfo; - paddle::framework::Tensor output_ocf; - paddle::framework::Tensor output_tmp; - - /** - * input = [0, 1, 2, - * 3, 4, 5] - * - * output_cfo = [0, 1 - * 1, 2 - * 3, 4 - * 4, 5] - * - * output_ocf = [0, 1, 3, 4 - * 1, 2, 4, 5] - * - * col2im_cfo = [0, 2, 2 - * 3, 4, 5] - * - * col2im_ocf = [0, 2, 2 - * 3, 4, 5] - */ - int input_height = 2; - int input_width = 3; - int filter_size = 2; - std::vector stride({1, 1}); // stride_y, stride_x - std::vector padding( - {0, 0, 0, 0}); // up_pad, left_pad, down_pad, right_pad - std::vector dilation({1, 1}); // dilation_y, dilation_x - int output_height = - (input_height - filter_size + padding[0] + padding[1]) / stride[0] + 1; - int output_width = - (input_width - filter_size + padding[2] + padding[3]) / stride[1] + 1; - float* input_ptr = input_tmp.mutable_data( - {1, input_height, input_width}, paddle::platform::CPUPlace()); - float arr[6] = {0, 1, 2, 3, 4, 5}; - memcpy(input_ptr, arr, 6 * sizeof(float)); - - auto* place = new Place(); - DeviceContext* context = new DeviceContext(*place); - if (paddle::platform::is_cpu_place(*place)) { - input = input_tmp; - } else { - TensorCopySync(input_tmp, *place, &input); - } - output_cfo.mutable_data( - {1, filter_size, filter_size, output_height, output_width}, *place); - output_ocf.mutable_data( - {output_height, output_width, 1, filter_size, filter_size}, *place); - - // Im2Col - paddle::operators::math::Im2ColFunctor< - paddle::operators::math::ColFormat::kCFO, - DeviceContext, - float> - im2col; - paddle::operators::math::Im2ColFunctor< - paddle::operators::math::ColFormat::kOCF, - DeviceContext, - float> - im2col_ocf; - - im2col(*context, input, dilation, stride, padding, &output_cfo); - im2col_ocf(*context, input, dilation, stride, padding, &output_ocf); - - float out_cfo_data[] = {0, 1, 1, 2, 3, 4, 4, 5}; - float out_ocf_data[] = {0, 1, 3, 4, 1, 2, 4, 5}; - - float* out_cfo_ptr; - if (paddle::platform::is_cpu_place(*place)) { - out_cfo_ptr = output_cfo.data(); - } else { - TensorCopySync(output_cfo, paddle::platform::CPUPlace(), &output_tmp); - out_cfo_ptr = output_tmp.data(); - } - for (int i = 0; i < 6; ++i) { - EXPECT_EQ(out_cfo_ptr[i], out_cfo_data[i]); - } - - float* out_ocf_ptr; - if (paddle::platform::is_cpu_place(*place)) { - out_ocf_ptr = output_ocf.data(); - } else { - TensorCopySync(output_ocf, paddle::platform::CPUPlace(), &output_tmp); - out_ocf_ptr = output_tmp.data(); - } - - for (int i = 0; i < 6; ++i) { - EXPECT_EQ(out_ocf_ptr[i], out_ocf_data[i]); - } - - // Col2Im: kCFO - paddle::operators::math::Col2ImFunctor< - paddle::operators::math::ColFormat::kCFO, - DeviceContext, - float> - col2im; - paddle::operators::math::Col2ImFunctor< - paddle::operators::math::ColFormat::kOCF, - DeviceContext, - float> - col2im_ocf; - float col2im_data[] = {0, 2, 2, 3, 8, 5}; - - memset(input_ptr, 0, 6 * sizeof(float)); - if (paddle::platform::is_cpu_place(*place)) { - input = input_tmp; - } else { - TensorCopySync(input_tmp, *place, &input); - } - - col2im(*context, output_cfo, dilation, stride, padding, &input); - - float* in_ptr; - if (paddle::platform::is_cpu_place(*place)) { - in_ptr = input.data(); - } else { - TensorCopySync(input, paddle::platform::CPUPlace(), &input_tmp); - 
in_ptr = input_tmp.data(); - } - for (int i = 0; i < 6; ++i) { - EXPECT_EQ(in_ptr[i], col2im_data[i]); - } - - // Col2Im: kOCF - memset(input_ptr, 0, 6 * sizeof(float)); - if (paddle::platform::is_cpu_place(*place)) { - input = input_tmp; - } else { - TensorCopySync(input_tmp, *place, &input); - } - - col2im_ocf(*context, output_ocf, dilation, stride, padding, &input); - - if (paddle::platform::is_cpu_place(*place)) { - in_ptr = input.data(); - } else { - TensorCopySync(input, paddle::platform::CPUPlace(), &input_tmp); - in_ptr = input_tmp.data(); - } - for (int i = 0; i < 6; ++i) { - EXPECT_EQ(in_ptr[i], col2im_data[i]); - } - - delete place; - delete context; -} - -TEST(math, im2col) { - testIm2col(); -#ifdef PADDLE_WITH_CUDA - testIm2col(); -#endif -} - -#define PREPARE_IM2COL_CPU \ - paddle::platform::CPUPlace place; \ - paddle::platform::CPUDeviceContext context(place); \ - paddle::framework::Tensor input; \ - paddle::framework::Tensor out; \ - paddle::framework::Tensor ref; \ - std::vector padding({ph, pw}); \ - std::vector stride({1, 1}); \ - std::vector dilation({1, 1}); \ - float* input_ptr = input.mutable_data({ic, ih, iw}, place); \ - for (int i = 0; i < input.numel(); ++i) { \ - input_ptr[i] = static_cast(i + 1); \ - } \ - int output_height = (ih - fh + padding[0] * 2) / stride[0] + 1; \ - int output_width = (iw - fw + padding[1] * 2) / stride[1] + 1; \ - out.mutable_data({ic, fh, fw, output_height, output_width}, place); \ - ref.mutable_data({ic, fh, fw, output_height, output_width}, place); \ - paddle::operators::math::Im2ColFunctor< \ - paddle::operators::math::ColFormat::kCFO, \ - paddle::platform::CPUDeviceContext, \ - float> \ - im2col - -void testIm2colCPU(int ic, int ih, int iw, int fh, int fw, int ph, int pw) { - PREPARE_IM2COL_CPU; - - im2col(context, input, dilation, stride, padding, &out); - paddle::operators::math::im2col_common( - input, dilation, stride, padding, &ref); - - float* ref_data = ref.data(); - float* out_data = out.data(); - for (int i = 0; i < out.numel(); ++i) { - EXPECT_EQ(out_data[i], ref_data[i]); - } -} - -void benchIm2col(int ic, int ih, int iw, int fh, int fw, int ph, int pw) { - PREPARE_IM2COL_CPU; - constexpr int repeat = 100; - auto GetCurrentMs = []() -> double { - struct timeval time; - gettimeofday(&time, NULL); - return 1e+3 * time.tv_sec + 1e-3 * time.tv_usec; - }; - auto t1 = GetCurrentMs(); - for (int i = 0; i < repeat; ++i) { - im2col(context, input, dilation, stride, padding, &out); - } - auto t2 = GetCurrentMs(); - - for (int i = 0; i < repeat; ++i) { - paddle::operators::math::im2col_common( - input, dilation, stride, padding, &ref); - } - auto t3 = GetCurrentMs(); - - LOG(INFO) << "before: " << (t3 - t2) / repeat - << ",after: " << (t2 - t1) / repeat - << ",boost: " << ((t3 - t2) / (t2 - t1) - 1) * 100 << "%"; -} - -TEST(math, im2col_cputest) { - // padding_h == padding_w - for (int p = 0; p < 4; ++p) { - // width == height - testIm2colCPU(/*ic*/ 2, - /*ih*/ 5, - /*iw*/ 5, - /*fh*/ 4, - /*fw*/ 4, - /*ph*/ p, - /*pw*/ p); - testIm2colCPU(/*ic*/ 2, - /*ih*/ 4, - /*iw*/ 4, - /*fh*/ 3, - /*fw*/ 3, - /*ph*/ p, - /*pw*/ p); - testIm2colCPU(/*ic*/ 2, - /*ih*/ 4, - /*iw*/ 4, - /*fh*/ 2, - /*fw*/ 2, - /*ph*/ p, - /*pw*/ p); - - // height != width - testIm2colCPU(/*ic*/ 2, - /*ih*/ 5, - /*iw*/ 4, - /*fh*/ 2, - /*fw*/ 3, - /*ph*/ p, - /*pw*/ p); - testIm2colCPU(/*ic*/ 2, - /*ih*/ 5, - /*iw*/ 4, - /*fh*/ 1, - /*fw*/ 3, - /*ph*/ p, - /*pw*/ p); - testIm2colCPU(/*ic*/ 2, - /*ih*/ 4, - /*iw*/ 5, - /*fh*/ 3, - /*fw*/ 1, - /*ph*/ p, - /*pw*/ p); 
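-    // Each case above checks the specialized kernels against im2col_common;
-    // output sizes follow (ih - fh + 2 * ph) / stride + 1 per dimension: the
-    // ih=4, iw=5, fh=3, fw=1 case with p=0, for example, yields a 2 x 5 grid.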
- - // filter == 1 - testIm2colCPU(/*ic*/ 3, - /*ih*/ 4, - /*iw*/ 4, - /*fh*/ 1, - /*fw*/ 1, - /*ph*/ p, - /*pw*/ p); - testIm2colCPU(/*ic*/ 3, - /*ih*/ 3, - /*iw*/ 4, - /*fh*/ 1, - /*fw*/ 1, - /*ph*/ p, - /*pw*/ p); - } - - // padding_h != padding_w - testIm2colCPU(/*ic*/ 2, - /*ih*/ 4, - /*iw*/ 4, - /*fh*/ 2, - /*fw*/ 3, - /*ph*/ 1, - /*pw*/ 2); - - // benchmark - for (int p : {0, 1}) { - for (int k : {1, 3, 5}) { - LOG(INFO) << "padding == " << p << ", filter == " << k; - benchIm2col(/*ic*/ 3, - /*ih*/ 224, - /*iw*/ 224, - /*fh*/ k, - /*fw*/ k, - /*ph*/ p, - /*pw*/ p); - } - } -} diff --git a/lite/backends/x86/math/math_function.cc b/lite/backends/x86/math/math_function.cc deleted file mode 100644 index 822b7df936..0000000000 --- a/lite/backends/x86/math/math_function.cc +++ /dev/null @@ -1,158 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "lite/backends/x86/math/math_function.h" - -#ifdef PADDLE_WITH_MKLML -#include "lite/backends/x86/mklml.h" -#endif - -#ifdef PADDLE_USE_OPENBLAS -#include -#endif - -#include -#include "lite/backends/x86/math/math_function_impl.h" -#include "lite/fluid/data_type.h" -#include "lite/fluid/float16.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; - -#define DEFINE_CPU_TRANS(RANK) \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; - -DEFINE_CPU_TRANS(1); -DEFINE_CPU_TRANS(2); -DEFINE_CPU_TRANS(3); -DEFINE_CPU_TRANS(4); -DEFINE_CPU_TRANS(5); -DEFINE_CPU_TRANS(6); - -struct TensorSetConstantCPU { - TensorSetConstantCPU(lite::Tensor* tensor, float value) - : tensor_(tensor), value_(value) {} - template - void apply() const { - auto* begin = tensor_->mutable_data(lite::TargetType::kX86); - std::fill(begin, begin + tensor_->numel(), static_cast(value_)); - } - lite::Tensor* tensor_; - float value_; -}; - -template <> -void set_constant_with_place( - const lite::Context& context, - lite::Tensor* tensor, - float value) { - // lite::VisitDataType(tensor->type(), TensorSetConstantCPU(tensor, value)); - TensorSetConstantCPU(tensor, value).apply(); -} - -// template <> -// void set_constant_with_place( -// const platform::DeviceContext& context, framework::Tensor* tensor, -// float value) { -// framework::VisitDataType(tensor->type(), TensorSetConstantCPU(tensor, -// value)); -//} - -template -struct TensorSetConstantWithTarget /*: public boost::static_visitor*/ { - TensorSetConstantWithTarget(const lite::Context& context, - lite::Tensor* tensor, - float value) - : context_(context), tensor_(tensor), value_(value) {} - - void operator()() 
const { - set_constant_with_place(context_, tensor_, value_); - } - - const lite::Context& context_; - lite::Tensor* tensor_; - float value_; -}; - -template -void set_constant(const lite::Context& context, - lite::Tensor* tensor, - float value) { - TensorSetConstantWithTarget func(context, tensor, value); - //#ifdef PADDLE_WITH_CUDA - // tensor->target().apply_visitor(func); - //#else - func(); - //#endif -} - -template -struct RowwiseAdd { - void operator()(const lite::Context& context, - const lite::Tensor& input, - const lite::Tensor& vector, - lite::Tensor* output) { - auto in_dims = input.dims(); - auto size = input.numel() / in_dims[0]; - PADDLE_ENFORCE_EQ(vector.numel(), size); - PADDLE_ENFORCE_EQ(output->dims(), in_dims); - - auto in = lite::fluid::EigenMatrix::From(input); - auto vec = lite::fluid::EigenVector::Flatten(vector); - auto out = lite::fluid::EigenMatrix::From(*output); - - for (int64_t i = 0; i < in_dims[0]; ++i) { - out.chip(i, 0) = in.chip(i, 0) + vec; - } - } -}; - -template struct RowwiseAdd; -template struct RowwiseAdd; - -template struct ColwiseSum; -template struct ColwiseSum; -template struct ColwiseSum; -template struct ColwiseSum; - -template struct RowwiseSum; -template struct RowwiseSum; - -template struct RowwiseMean; -template struct RowwiseMean; - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/math_function.h b/lite/backends/x86/math/math_function.h deleted file mode 100644 index 8f629b5f17..0000000000 --- a/lite/backends/x86/math/math_function.h +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include - -#include "lite/core/context.h" -#include "lite/core/op_lite.h" -#include "lite/core/tensor.h" -#include "lite/fluid/float16.h" -#include "lite/utils/paddle_enforce.h" -//#include "lite/tensor_util.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -// template -// struct Transpose { -// void operator()(const lite::Context &context) -// }; - -template -struct Transpose { - void operator()(const lite::Context& context, - const lite::Tensor& in, - lite::Tensor* out, - const std::vector& axis); -}; - -template -struct SetConstant { - void operator()(const lite::Context& context, - lite::Tensor* tensor, - T num); -}; - -template -void set_constant_with_place(const lite::Context& context, - lite::Tensor* tensor, - float value); - -template -void set_constant(const lite::Context& context, - lite::Tensor* tensor, - float value); - -template -struct RowwiseAdd { - void operator()(const lite::Context& context, - const lite::Tensor& input, - const lite::Tensor& vec, - lite::Tensor* output); -}; - -template -struct ColwiseSum { - void operator()(const lite::Context& context, - const lite::Tensor& input, - lite::Tensor* vec); -}; - -template -struct RowwiseSum { - void operator()(const lite::Context& context, - const lite::Tensor& input, - lite::Tensor* vec); -}; - -template -struct RowwiseMean { - void operator()(const lite::Context& context, - const lite::Tensor& input, - lite::Tensor* vec); -}; - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/math_function_impl.h b/lite/backends/x86/math/math_function_impl.h deleted file mode 100644 index 3aaca2e593..0000000000 --- a/lite/backends/x86/math/math_function_impl.h +++ /dev/null @@ -1,192 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include "lite/backends/x86/math/math_function.h" -#include "lite/fluid/data_type.h" -#include "lite/fluid/eigen.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -template -void SetConstant::operator()(const lite::Context& context, - lite::Tensor* tensor, - T num) { - auto t = lite::fluid::EigenVector::Flatten(*tensor); - - // t.device(*Eigen::DefaultDevice()) = t.constant(static_cast(num)); - // t.device(*context.eigen_device()) = t.constant(static_cast(num)); - t.device(typename lite::fluid::EigenDevice::Type()) = - t.constant(static_cast(num)); -} - -template -void Transpose::operator()( - const lite::Context& context, - const lite::TensorLite& in, - lite::TensorLite* out, - const std::vector& axis) { - Eigen::array permute; - for (int i = 0; i < Rank; i++) { - permute[i] = axis[i]; - } - auto eigen_in = lite::fluid::EigenTensor::From(in); - auto eigen_out = lite::fluid::EigenTensor::From(*out); - // auto* dev = context.eigen_device(); - // eigen_out.device(*dev) = eigen_in.shuffle(permute); - eigen_out.device(typename lite::fluid::EigenDevice::Type()) = - eigen_in.shuffle(permute); -} - -template -void ColwiseSum::operator()(const lite::Context& context, - const lite::TensorLite& input, - lite::TensorLite* out) { - auto in_dims = input.dims(); - auto size = input.numel() / in_dims[0]; - PADDLE_ENFORCE_EQ(out->numel(), size); - - auto in = lite::fluid::EigenMatrix::From(input); - auto vec = lite::fluid::EigenVector::Flatten(*out); - - // vec.device(*context.eigen_device()) = in.sum(Eigen::array({{0}})); - vec.device(typename lite::fluid::EigenDevice::Type()) = - in.sum(Eigen::array({{0}})); -} - -// Specialize for CPU, since Eigen implement a general reduce. However, -// colwise-sum can be easily implemented. General reduce has a huge overhead in -// CPU -template -class ColwiseSum { - public: - void operator()(const lite::X86Context& context, - const lite::TensorLite& input, - lite::TensorLite* out) { - auto& in_dims = input.dims(); - auto height = in_dims[0]; - auto size = in_dims[1]; - PADDLE_ENFORCE_EQ(out->numel(), size); - - T* out_buf = out->mutable_data(out->target()); - const T* in_buf = input.data(); - - for (size_t i = 0; i < static_cast(height); ++i) { - for (size_t j = 0; j < static_cast(size); ++j) { - if (i == 0) { - out_buf[j] = in_buf[i * size + j]; - } else { - out_buf[j] += in_buf[i * size + j]; - } - } - } - } -}; - -template -void RowwiseMean::operator()(const lite::Context& context, - const lite::TensorLite& input, - lite::TensorLite* out) { - auto in_dims = input.dims(); - PADDLE_ENFORCE_EQ(in_dims.size(), 2U); - PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]); - - auto in = lite::fluid::EigenMatrix::From(input); - auto vec = lite::fluid::EigenVector::Flatten(*out); - - // vec.device(*context.eigen_device()) = in.mean(Eigen::array({{1}})); - vec.device(typename lite::fluid::EigenDevice::Type()) = - in.mean(Eigen::array({{1}})); -} -// TODO(zcd): Following ColwiseSum format, need to confirm. -// Specialize for CPU, since Eigen implement a general reduce. However, -// rowwise-sum can be easily implemented. 
General reduce has a huge overhead in -// CPU -template -class RowwiseMean { - public: - void operator()(const lite::X86Context& context, - const lite::TensorLite& input, - lite::TensorLite* out) { - auto& in_dims = input.dims(); - PADDLE_ENFORCE_EQ(in_dims.size(), 2U); - auto height = in_dims[0]; - auto size = in_dims[1]; - PADDLE_ENFORCE_EQ(out->numel(), height); - auto inv_size = 1.0 / size; - T* out_buf = out->mutable_data(out->target()); - const T* in_buf = input.data(); - - for (size_t i = 0; i < static_cast(height); ++i) { - T sum = 0; - for (size_t j = 0; j < static_cast(size); ++j) { - sum += in_buf[i * size + j]; - } - out_buf[i] = sum * inv_size; - } - } -}; - -template -void RowwiseSum::operator()(const lite::Context& context, - const lite::TensorLite& input, - lite::TensorLite* out) { - auto in_dims = input.dims(); - PADDLE_ENFORCE_EQ(in_dims.size(), 2U); - PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]); - - auto in = lite::fluid::EigenMatrix::From(input); - auto vec = lite::fluid::EigenVector::Flatten(*out); - - // vec.device(*context.eigen_device()) = in.sum(Eigen::array({{1}})); - vec.device(typename lite::fluid::EigenDevice::Type()) = - in.sum(Eigen::array({{1}})); -} -// TODO(zcd): Following ColwiseSum format, need to confirm. -// Specialize for CPU, since Eigen implement a general reduce. However, -// rowwise-sum can be easily implemented. General reduce has a huge overhead in -// CPU -template -class RowwiseSum { - public: - void operator()(const lite::X86Context& context, - const lite::TensorLite& input, - lite::TensorLite* out) { - auto& in_dims = input.dims(); - PADDLE_ENFORCE_EQ(in_dims.size(), 2U); - auto height = in_dims[0]; - auto size = in_dims[1]; - PADDLE_ENFORCE_EQ(out->numel(), height); - - T* out_buf = out->mutable_data(out->target()); - const T* in_buf = input.data(); - - for (size_t i = 0; i < static_cast(height); ++i) { - T sum = 0; - for (size_t j = 0; j < static_cast(size); ++j) { - sum += in_buf[i * size + j]; - } - out_buf[i] = sum; - } - } -}; - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/math_function_test.cc b/lite/backends/x86/math/math_function_test.cc deleted file mode 100644 index 19122a6169..0000000000 --- a/lite/backends/x86/math/math_function_test.cc +++ /dev/null @@ -1,344 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
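-//
-// Note on the GEMM checks below: the expected values are hand-computable.
-// With A = [[0, 1, 2], [3, 4, 5]], the 3x3 window of B starting at
-// input2_ptr + 1 (ldb = 4), and beta = 1, row 0 of C becomes
-// [23, 26, 29] + [1, 2, 3] = [24, 28, 32] and row 1 becomes
-// [68, 80, 92] + [5, 6, 7] = [73, 86, 99], matching the EXPECT_EQ values.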
-#include "paddle/fluid/operators/math/math_function.h" -#include "gtest/gtest.h" -#include "paddle/fluid/operators/math/blas.h" - -template -inline paddle::operators::math::BlasT -GetBlas(const paddle::platform::CPUDeviceContext& context) { - return paddle::operators::math::GetBlas(context); -} - -TEST(math_function, gemm_notrans_cblas) { - paddle::framework::Tensor input1; - paddle::framework::Tensor input2; - paddle::framework::Tensor input3; - - int m = 2; - int n = 3; - int k = 3; - auto* cpu_place = new paddle::platform::CPUPlace(); - float* input1_ptr = input1.mutable_data({2, 3}, *cpu_place); - float arr1[6] = {0, 1, 2, 3, 4, 5}; - memcpy(input1_ptr, arr1, 6 * sizeof(float)); - float* input2_ptr = input2.mutable_data({3, 4}, *cpu_place); - float arr2[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; - memcpy(input2_ptr, arr2, 12 * sizeof(float)); - float* input3_ptr = input3.mutable_data({2, 4}, *cpu_place); - float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7}; - memcpy(input3_ptr, arr3, 8 * sizeof(float)); - - paddle::platform::CPUDeviceContext context(*cpu_place); - GetBlas(context).GEMM(false, - false, - m, - n, - k, - 1, - input1_ptr, - 3, - input2_ptr + 1, - 4, - 1, - input3_ptr + 1, - 4); - - EXPECT_EQ(input3_ptr[0], 0); - EXPECT_EQ(input3_ptr[1], 24); - EXPECT_EQ(input3_ptr[2], 28); - EXPECT_EQ(input3_ptr[3], 32); - EXPECT_EQ(input3_ptr[4], 4); - EXPECT_EQ(input3_ptr[5], 73); - EXPECT_EQ(input3_ptr[6], 86); - EXPECT_EQ(input3_ptr[7], 99); -} -#ifdef PADDLE_WITH_LIBXSMM -template -void MklSmmCompare(int m, int n, int k) { - paddle::framework::Tensor mat_a; - paddle::framework::Tensor mat_b; - paddle::framework::Tensor mat_c_smm; - paddle::framework::Tensor mat_c_mkl; - auto* cpu_place = new paddle::platform::CPUPlace(); - - T* A = mat_a.mutable_data({m, k}, *cpu_place); - T* B = mat_b.mutable_data({k, n}, *cpu_place); - T* CSMM = mat_c_smm.mutable_data({m, n}, *cpu_place); - T* CMKL = mat_c_mkl.mutable_data({m, n}, *cpu_place); - T alpha = static_cast(1); - T beta = static_cast(0); - for (int i = 0; i < mat_a.numel(); ++i) { - A[i] = static_cast(i); - } - for (int i = 0; i < mat_b.numel(); ++i) { - B[i] = static_cast(i); - } - // lda,ldb,ldc follow RowMajor - int lda = k; - int ldb = n; - int ldc = n; - - auto smm = [&, m, n, k, lda, ldb, ldc, alpha, beta]() { - const char transa = 'N'; - const char transb = 'N'; - paddle::operators::math::CBlas::SMM_GEMM(&transa, - &transb, - &n, - &m, - &k, - &alpha, - B, - &ldb, - A, - &lda, - &beta, - CSMM, - &ldc); - }; - - auto mkl = [&, m, n, k, lda, ldb, ldc, alpha, beta]() { - paddle::operators::math::CBlas::GEMM(CblasRowMajor, - CblasNoTrans, - CblasNoTrans, - m, - n, - k, - alpha, - A, - lda, - B, - ldb, - beta, - CMKL, - ldc); - }; - - smm(); - mkl(); - ASSERT_EQ(mat_c_mkl.numel(), mat_c_smm.numel()); - for (int i = 0; i < mat_c_mkl.numel(); ++i) { - EXPECT_FLOAT_EQ(CSMM[i], CMKL[i]); - } -} -TEST(math_function, gemm_mkl_vs_smm) { - MklSmmCompare(1, 2, 3); - MklSmmCompare(1, 2, 3); - MklSmmCompare(3, 2, 1); - MklSmmCompare(3, 2, 1); - MklSmmCompare(3, 8, 5); - MklSmmCompare(3, 8, 5); -} -#endif - -TEST(math_function, gemm_trans_cblas) { - paddle::framework::Tensor input1; - paddle::framework::Tensor input2; - paddle::framework::Tensor input3; - - int m = 2; - int n = 3; - int k = 3; - auto* cpu_place = new paddle::platform::CPUPlace(); - float* input1_ptr = input1.mutable_data({2, 3}, *cpu_place); - float arr1[6] = {0, 1, 2, 3, 4, 5}; - memcpy(input1_ptr, arr1, 6 * sizeof(float)); - float* input2_ptr = input2.mutable_data({4, 3}, *cpu_place); - 
-
-TEST(math_function, gemm_trans_cblas) {
-  paddle::framework::Tensor input1;
-  paddle::framework::Tensor input2;
-  paddle::framework::Tensor input3;
-
-  int m = 2;
-  int n = 3;
-  int k = 3;
-  auto* cpu_place = new paddle::platform::CPUPlace();
-  float* input1_ptr = input1.mutable_data<float>({2, 3}, *cpu_place);
-  float arr1[6] = {0, 1, 2, 3, 4, 5};
-  memcpy(input1_ptr, arr1, 6 * sizeof(float));
-  float* input2_ptr = input2.mutable_data<float>({4, 3}, *cpu_place);
-  float arr2[12] = {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11};
-  memcpy(input2_ptr, arr2, 12 * sizeof(float));
-  float* input3_ptr = input3.mutable_data<float>({2, 4}, *cpu_place);
-  float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
-  memcpy(input3_ptr, arr3, 8 * sizeof(float));
-
-  paddle::platform::CPUDeviceContext context(*cpu_place);
-  GetBlas<float>(context).GEMM(
-      false, true, m, n, k, 1, input1_ptr, 3, input2_ptr + 3, 3, 1,
-      input3_ptr + 1, 4);
-  delete cpu_place;
-  cpu_place = NULL;
-
-  EXPECT_EQ(input3_ptr[0], 0);
-  EXPECT_EQ(input3_ptr[1], 24);
-  EXPECT_EQ(input3_ptr[2], 28);
-  EXPECT_EQ(input3_ptr[3], 32);
-  EXPECT_EQ(input3_ptr[4], 4);
-  EXPECT_EQ(input3_ptr[5], 73);
-  EXPECT_EQ(input3_ptr[6], 86);
-  EXPECT_EQ(input3_ptr[7], 99);
-}
-
-TEST(math_function, zero) {
-  paddle::framework::Tensor tensor;
-  auto* cpu_place = new paddle::platform::CPUPlace();
-  float* t = tensor.mutable_data<float>({2, 2}, *cpu_place);
-  paddle::platform::CPUDeviceContext context(*cpu_place);
-  paddle::operators::math::SetConstant<paddle::platform::CPUDeviceContext,
-                                       float>
-      functor;
-  functor(context, &tensor, 0);
-  EXPECT_EQ(t[0], 0);
-  EXPECT_EQ(t[1], 0);
-  EXPECT_EQ(t[2], 0);
-  EXPECT_EQ(t[3], 0);
-
-  functor(context, &tensor, 1);
-
-  EXPECT_EQ(t[0], 1);
-  EXPECT_EQ(t[1], 1);
-  EXPECT_EQ(t[2], 1);
-  EXPECT_EQ(t[3], 1);
-}
-
-template <typename T>
-void GemvTest(int m, int n, bool trans) {
-  paddle::framework::Tensor mat_a;
-  paddle::framework::Tensor vec_b;
-  paddle::framework::Tensor vec_c;
-  auto* cpu_place = new paddle::platform::CPUPlace();
-  int b_num = trans ? m : n;
-  int c_num = trans ? n : m;
-
-  T* data_a = mat_a.mutable_data<T>({m, n}, *cpu_place);
-  T* data_b = vec_b.mutable_data<T>({b_num}, *cpu_place);
-  T* data_c = vec_c.mutable_data<T>({c_num}, *cpu_place);
-  for (int i = 0; i < mat_a.numel(); ++i) {
-    data_a[i] = static_cast<T>(i);
-  }
-  for (int i = 0; i < vec_b.numel(); ++i) {
-    data_b[i] = static_cast<T>(i);
-  }
-
-  paddle::platform::CPUDeviceContext context(*cpu_place);
-  GetBlas<T>(context).GEMV(trans, static_cast<int>(m), static_cast<int>(n),
-                           1., data_a, data_b, 0., data_c);
-
-  if (!trans) {
-    for (int i = 0; i < m; ++i) {
-      T sum = 0.0;
-      for (int j = 0; j < n; ++j) {
-        sum += data_a[i * n + j] * data_b[j];
-      }
-      ASSERT_FLOAT_EQ(data_c[i], sum);
-    }
-  } else {
-    for (int i = 0; i < n; ++i) {
-      T sum = 0.0;
-      for (int j = 0; j < m; ++j) {
-        sum += data_a[j * n + i] * data_b[j];
-      }
-      ASSERT_FLOAT_EQ(data_c[i], sum);
-    }
-  }
-}
-
-TEST(math_function, gemv) {
-  GemvTest<float>(3, 13, false);
-  GemvTest<double>(4, 5, false);
-  GemvTest<float>(12, 7, true);
-  GemvTest<double>(7, 9, true);
-}
-
-TEST(math_function, set_constant) {
-  paddle::framework::Tensor t;
-  t.Resize({10, 10});
-  t.mutable_data<float>(paddle::platform::CPUPlace());
-  auto* ctx = new paddle::platform::CPUDeviceContext();
-  paddle::operators::math::set_constant(*ctx, &t, 10);
-  for (int64_t i = 0; i < t.numel(); ++i) {
-    PADDLE_ENFORCE_EQ(10, t.data<float>()[i]);
-  }
-  delete ctx;
-}
-
-template <typename T>
-void GemmWarpTest(int m, int n, int k, T alpha, T beta) {
-  paddle::framework::Tensor mat_a;
-  paddle::framework::Tensor mat_b;
-  paddle::framework::Tensor mat_c_ref;
-  paddle::framework::Tensor mat_c_mkl;
-  auto* cpu_place = new paddle::platform::CPUPlace();
-
-  T* A = mat_a.mutable_data<T>({m, k}, *cpu_place);
-  T* B = mat_b.mutable_data<T>({k, n}, *cpu_place);
-  T* CREF = mat_c_ref.mutable_data<T>({m, n}, *cpu_place);
-  T* CMKL = mat_c_mkl.mutable_data<T>({m, n}, *cpu_place);
-
-  ASSERT_EQ(mat_c_mkl.numel(), mat_c_ref.numel());
-  for (int i = 0; i < mat_a.numel(); ++i) {
-    A[i] = static_cast<T>(i);
-  }
-  for (int i = 0; i < mat_b.numel(); ++i) {
-    B[i] = static_cast<T>(i + 1);
-  }
-  for (int i = 0; i < mat_c_ref.numel(); ++i) {
-    CREF[i] = static_cast<T>(i + 2);
-    CMKL[i] = CREF[i];
-  }
-
-  // this would call gemm_warp
-  paddle::platform::CPUDeviceContext context(*cpu_place);
-  GetBlas<T>(context).GEMM(
-      CblasNoTrans, CblasNoTrans, m, n, k, alpha, A, B, beta, CREF);
-
-  // lda,ldb,ldc follow RowMajor
-  int lda = k;
-  int ldb = n;
-  int ldc = n;
-  paddle::operators::math::CBlas<T>::GEMM(
-      CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, alpha, A, lda, B,
-      ldb, beta, CMKL, ldc);
-
-  for (int i = 0; i < mat_c_mkl.numel(); ++i) {
-    EXPECT_FLOAT_EQ(CREF[i], CMKL[i]);
-  }
-}
-
-TEST(math_function, gemm_warp) {
-  GemmWarpTest<float>(3, 2, 5, 1.f, 0.f);
-  GemmWarpTest<float>(3, 2, 5, 2.f, 1.f);
-  GemmWarpTest<float>(8, 5, 6, 1.f, 0.f);
-  GemmWarpTest<float>(8, 5, 6, 2.f, 1.f);
-  GemmWarpTest<double>(3, 2, 5, 1.0, 0.0);
-  GemmWarpTest<double>(3, 2, 5, 2.0, 1.0);
-  GemmWarpTest<double>(8, 5, 6, 1.0, 0.0);
-  GemmWarpTest<double>(8, 5, 6, 2.0, 1.0);
-}
diff --git a/lite/backends/x86/math/maxouting.cc b/lite/backends/x86/math/maxouting.cc
deleted file mode 100644
index 20b40fe7c5..0000000000
--- a/lite/backends/x86/math/maxouting.cc
+++ /dev/null
@@ -1,106 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "lite/backends/x86/math/maxouting.h"
-
-namespace paddle {
-namespace lite {
-namespace x86 {
-namespace math {
-
-// All tensors are in NCHW format, and the groups must be greater than 1
-template <typename T>
-class MaxOutFunctor<lite::TargetType::kX86, T> {
- public:
-  void operator()(const lite::X86Context& context,
-                  const lite::Tensor& input,
-                  lite::Tensor* output,
-                  int groups) {
-    const int batch_size = input.dims()[0];
-    const int input_height = input.dims()[2];
-    const int input_width = input.dims()[3];
-    const int output_channels = output->dims()[1];
-    int fea_size = input_height * input_width;
-    // c_size means the output size of each sample
-    int c_size = fea_size * output_channels;
-    const T* input_data = input.data<T>();
-    T* output_data = output->mutable_data<T>(lite::TargetType::kX86);
-
-    for (int i = 0; i < batch_size; ++i) {
-      int new_bindex = c_size * i;
-      for (int c = 0; c < output_channels; ++c) {
-        int new_cindex = fea_size * c;
-        for (int f = 0; f < fea_size; ++f) {
-          T ele = static_cast<T>(-FLT_MAX);
-          for (int ph = 0; ph < groups; ++ph) {
-            T x = input_data[(new_bindex + new_cindex) * groups +
-                             ph * fea_size + f];
-            ele = ele > x ?
ele : x; - } - output_data[(new_bindex + new_cindex + f)] = ele; - } - } - } - } -}; - -template -class MaxOutGradFunctor { - public: - void operator()(const lite::X86Context& context, - const lite::Tensor& input, - lite::Tensor* input_grad, - const lite::Tensor& output, - const lite::Tensor& output_grad, - int groups) { - const int batch_size = input.dims()[0]; - const int input_height = input.dims()[2]; - const int input_width = input.dims()[3]; - const int output_channels = output.dims()[1]; - int fea_size = input_height * input_width; - const T* input_data = input.data(); - const T* output_data = output.data(); - const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(lite::TargetType::kX86); - - for (int i = 0; i < batch_size; ++i) { - int blen = fea_size * output_channels * i; - for (int c = 0; c < output_channels; ++c) { - int clen = fea_size * c; - for (int f = 0; f < fea_size; ++f) { - int input_idx0 = (blen + clen) * groups + f; - bool continue_match = true; - int output_idx = blen + clen + f; - for (int g = 0; g < groups && continue_match; ++g) { - int input_idx = input_idx0 + fea_size * g; - if (input_data[input_idx] == output_data[output_idx]) { - input_grad_data[input_idx] += output_grad_data[output_idx]; - continue_match = false; - } - } - } - } - } - } -}; - -template class MaxOutGradFunctor; -template class MaxOutGradFunctor; -template class MaxOutFunctor; -template class MaxOutFunctor; - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/maxouting.h b/lite/backends/x86/math/maxouting.h deleted file mode 100644 index f84d2f6c9d..0000000000 --- a/lite/backends/x86/math/maxouting.h +++ /dev/null @@ -1,47 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "lite/core/context.h" -#include "lite/core/tensor.h" -#include "lite/utils/macros.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -template -class MaxOutFunctor { - public: - void operator()(const lite::Context& context, - const lite::Tensor& input, - lite::Tensor* output, - int groups); -}; - -template -class MaxOutGradFunctor { - public: - void operator()(const lite::Context& context, - const lite::Tensor& input, - lite::Tensor* input_grad, - const lite::Tensor& output, - const lite::Tensor& output_grad, - int groups); -}; -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/pooling.cc b/lite/backends/x86/math/pooling.cc deleted file mode 100644 index e700c5f7c7..0000000000 --- a/lite/backends/x86/math/pooling.cc +++ /dev/null @@ -1,906 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
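The maxout kernel above collapses every `groups` consecutive input channels into one output channel by an element-wise max, so an input with output_channels * groups channels yields output_channels channels of the same spatial size. A self-contained sketch of the same NCHW indexing (maxout_ref is a hypothetical name):

#include <cfloat>

// Output channel c takes the max over input channels
// c * groups .. c * groups + groups - 1 at each spatial position.
void maxout_ref(const float* in, float* out, int batch, int out_channels,
                int groups, int height, int width) {
  const int fea = height * width;
  for (int n = 0; n < batch; ++n) {
    for (int c = 0; c < out_channels; ++c) {
      for (int f = 0; f < fea; ++f) {
        float best = -FLT_MAX;
        for (int g = 0; g < groups; ++g) {
          float x = in[((n * out_channels + c) * groups + g) * fea + f];
          best = best > x ? best : x;
        }
        out[(n * out_channels + c) * fea + f] = best;
      }
    }
  }
}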
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "lite/backends/x86/math/pooling.h" -#include -#include - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -/* - * All tensors are in NCHW format. - * Ksize, strides, paddings are two elements. These two elements represent - * height and width, respectively. - */ -template -class Pool2dFunctor { - public: - void operator()(const lite::X86Context& context, - const lite::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - PoolProcess pool_process, - bool exclusive, - bool adaptive, - lite::Tensor* output) { - const int batch_size = input.dims()[0]; - const int input_height = input.dims()[2]; - const int input_width = input.dims()[3]; - const int output_channels = output->dims()[1]; - const int output_height = output->dims()[2]; - const int output_width = output->dims()[3]; - const int ksize_height = ksize[0]; - const int ksize_width = ksize[1]; - const int stride_height = strides[0]; - const int stride_width = strides[1]; - const int padding_height = paddings[0]; - const int padding_width = paddings[1]; - - const int input_stride = input_height * input_width; - const int output_stride = output_height * output_width; - - const T* input_data = input.data(); - T* output_data = output->mutable_data(lite::TargetType::kX86); - - int hstart, hend; - int wstart, wend; - for (int i = 0; i < batch_size; i++) { - for (int c = 0; c < output_channels; ++c) { - for (int ph = 0; ph < output_height; ++ph) { - if (adaptive) { - hstart = AdaptStartIndex(ph, input_height, output_height); - hend = AdaptEndIndex(ph, input_height, output_height); - } else { - hstart = ph * stride_height - padding_height; - hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); - } - for (int pw = 0; pw < output_width; ++pw) { - if (adaptive) { - wstart = AdaptStartIndex(pw, input_width, output_width); - wend = AdaptEndIndex(pw, input_width, output_width); - } else { - wstart = pw * stride_width - padding_width; - wend = std::min(wstart + ksize_width, input_width); - wstart = std::max(wstart, 0); - } - - T ele = pool_process.initial(); - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - pool_process.compute(input_data[h * input_width + w], &ele); - } - } - int pool_size = (exclusive || adaptive) - ? (hend - hstart) * (wend - wstart) - : ksize_height * ksize_width; - pool_process.finalize(static_cast(pool_size), &ele); - output_data[ph * output_width + pw] = ele; - } - } - input_data += input_stride; - output_data += output_stride; - } - } - } -}; - -/* -* All tensors are in NCHW format. -* Ksize, strides, paddings are two elements. These two elements represent height -* and width, respectively. 
-*/ -template -class Pool2dGradFunctor { - public: - void operator()(const lite::X86Context& context, - const lite::Tensor& input, - const lite::Tensor& output, - const lite::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - PoolProcess pool_grad_process, - bool exclusive, - bool adaptive, - lite::Tensor* input_grad) { - const int batch_size = input.dims()[0]; - const int input_height = input.dims()[2]; - const int input_width = input.dims()[3]; - const int output_channels = output.dims()[1]; - const int output_height = output.dims()[2]; - const int output_width = output.dims()[3]; - const int ksize_height = ksize[0]; - const int ksize_width = ksize[1]; - const int stride_height = strides[0]; - const int stride_width = strides[1]; - const int padding_height = paddings[0]; - const int padding_width = paddings[1]; - const int input_stride = input_height * input_width; - const int output_stride = output_height * output_width; - - const T* input_data = input.data(); - const T* output_data = output.data(); - const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(lite::TargetType::kX86); - - int hstart, hend; - int wstart, wend; - for (int i = 0; i < batch_size; i++) { - for (int c = 0; c < output_channels; ++c) { - for (int ph = 0; ph < output_height; ++ph) { - if (adaptive) { - hstart = AdaptStartIndex(ph, input_height, output_height); - hend = AdaptEndIndex(ph, input_height, output_height); - } else { - hstart = ph * stride_height - padding_height; - hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); - } - for (int pw = 0; pw < output_width; ++pw) { - if (adaptive) { - wstart = AdaptStartIndex(pw, input_width, output_width); - wend = AdaptEndIndex(pw, input_width, output_width); - } else { - wstart = pw * stride_width - padding_width; - wend = std::min(wstart + ksize_width, input_width); - wstart = std::max(wstart, 0); - } - int pool_size = (exclusive || adaptive) - ? (hend - hstart) * (wend - wstart) - : ksize_height * ksize_width; - float scale = 1.0 / pool_size; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - pool_grad_process.compute( - input_data[h * input_width + w], - output_data[ph * output_width + pw], - output_grad_data[ph * output_width + pw], - static_cast(scale), - input_grad_data + h * input_width + w); - } - } - } - } - input_data += input_stride; - output_data += output_stride; - input_grad_data += input_stride; - output_grad_data += output_stride; - } - } - } -}; - -/* - * All tensors are in NCHW format. - * Ksize, strides, paddings are two elements. These two elements represent - * height and width, respectively. 
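The pool_size computed above is the whole difference between exclusive and inclusive average pooling: exclusive divides by the number of in-bounds elements in the (possibly clipped) window, while inclusive always divides by ksize_height * ksize_width, so zero padding drags outputs near the border toward zero. One window in isolation (avg_at is an illustrative name):

#include <algorithm>

// Average over the kh x kw window of output cell (ph, pw) on an h x w map
// with stride s and padding p, clipped to the input bounds.
float avg_at(const float* in, int h, int w, int ph, int pw, int kh, int kw,
             int s, int p, bool exclusive) {
  int hstart = std::max(ph * s - p, 0);
  int hend = std::min(ph * s - p + kh, h);
  int wstart = std::max(pw * s - p, 0);
  int wend = std::min(pw * s - p + kw, w);
  float sum = 0.f;
  for (int i = hstart; i < hend; ++i)
    for (int j = wstart; j < wend; ++j) sum += in[i * w + j];
  int pool_size = exclusive ? (hend - hstart) * (wend - wstart) : kh * kw;
  return sum / pool_size;
}

For a 3x3 window at the top-left corner with padding 1, only four elements are in bounds: exclusive divides their sum by 4, inclusive by 9.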
- */ -template -class MaxPool2dGradFunctor { - public: - void operator()(const lite::X86Context& context, - const lite::Tensor& input, - const lite::Tensor& output, - const lite::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - lite::Tensor* input_grad) { - const int batch_size = input.dims()[0]; - const int input_height = input.dims()[2]; - const int input_width = input.dims()[3]; - const int output_channels = output.dims()[1]; - const int output_height = output.dims()[2]; - const int output_width = output.dims()[3]; - const int ksize_height = ksize[0]; - const int ksize_width = ksize[1]; - const int stride_height = strides[0]; - const int stride_width = strides[1]; - const int padding_height = paddings[0]; - const int padding_width = paddings[1]; - const int input_stride = input_height * input_width; - const int output_stride = output_height * output_width; - - const T* input_data = input.data(); - const T* output_data = output.data(); - const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(lite::TargetType::kX86); - - for (int i = 0; i < batch_size; i++) { - for (int c = 0; c < output_channels; ++c) { - for (int ph = 0; ph < output_height; ++ph) { - int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); - for (int pw = 0; pw < output_width; ++pw) { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); - wstart = std::max(wstart, 0); - - bool stop = false; - for (int h = hstart; h < hend && !stop; ++h) { - for (int w = wstart; w < wend && !stop; ++w) { - int input_idx = h * input_width + w; - int output_idx = ph * output_width + pw; - if (input_data[input_idx] == output_data[output_idx]) { - input_grad_data[input_idx] += output_grad_data[output_idx]; - stop = true; - } - } - } - } - } - input_data += input_stride; - output_data += output_stride; - input_grad_data += input_stride; - output_grad_data += output_stride; - } - } - } -}; - -template class MaxPool2dGradFunctor; -template class MaxPool2dGradFunctor; - -template class Pool2dFunctor, - float>; -template class Pool2dFunctor, - float>; -template class Pool2dGradFunctor, - float>; -template class Pool2dGradFunctor, - float>; -template class Pool2dFunctor, - double>; -template class Pool2dFunctor, - double>; -template class Pool2dGradFunctor, - double>; -template class Pool2dGradFunctor, - double>; - -/* - * All tensors are in NCDHW format. - * Ksize, strides, paddings are three elements. These three elements represent - * depth, height and width, respectively. 
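The stop flag in MaxPool2dGradFunctor above implements the first-maximum rule: when several window elements tie with the pooled output, only the first in scan order receives gradient, keeping the backward pass consistent with a single forward argmax. The routing for one window, as an isolated sketch:

// Adds dy to the first element of the window that equals the pooled value y,
// then stops, so duplicated maxima are credited only once.
void maxpool_window_grad(const float* x, float y, float dy, float* dx,
                         int hstart, int hend, int wstart, int wend, int w) {
  for (int i = hstart; i < hend; ++i) {
    for (int j = wstart; j < wend; ++j) {
      if (x[i * w + j] == y) {
        dx[i * w + j] += dy;
        return;
      }
    }
  }
}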
- */ -template -class Pool3dFunctor { - public: - void operator()(const lite::X86Context& context, - const lite::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - PoolProcess pool_process, - bool exclusive, - bool adaptive, - lite::Tensor* output) { - const int batch_size = input.dims()[0]; - const int input_depth = input.dims()[2]; - const int input_height = input.dims()[3]; - const int input_width = input.dims()[4]; - const int output_channels = output->dims()[1]; - const int output_depth = output->dims()[2]; - const int output_height = output->dims()[3]; - const int output_width = output->dims()[4]; - const int ksize_depth = ksize[0]; - const int ksize_height = ksize[1]; - const int ksize_width = ksize[2]; - const int stride_depth = strides[0]; - const int stride_height = strides[1]; - const int stride_width = strides[2]; - const int padding_depth = paddings[0]; - const int padding_height = paddings[1]; - const int padding_width = paddings[2]; - - const int input_stride = input_depth * input_height * input_width; - const int output_stride = output_depth * output_height * output_width; - - const T* input_data = input.data(); - T* output_data = output->mutable_data(lite::TargetType::kX86); - - int dstart, dend; - int hstart, hend; - int wstart, wend; - for (int i = 0; i < batch_size; i++) { - for (int c = 0; c < output_channels; ++c) { - for (int pd = 0; pd < output_depth; ++pd) { - if (adaptive) { - dstart = AdaptStartIndex(pd, input_depth, output_depth); - dend = AdaptEndIndex(pd, input_depth, output_depth); - } else { - dstart = pd * stride_depth - padding_depth; - dend = std::min(dstart + ksize_depth, input_depth); - dstart = std::max(dstart, 0); - } - for (int ph = 0; ph < output_height; ++ph) { - if (adaptive) { - hstart = AdaptStartIndex(ph, input_height, output_height); - hend = AdaptEndIndex(ph, input_height, output_height); - } else { - hstart = ph * stride_height - padding_height; - hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); - } - for (int pw = 0; pw < output_width; ++pw) { - if (adaptive) { - wstart = AdaptStartIndex(pw, input_width, output_width); - wend = AdaptEndIndex(pw, input_width, output_width); - } else { - wstart = pw * stride_width - padding_width; - wend = std::min(wstart + ksize_width, input_width); - wstart = std::max(wstart, 0); - } - int output_idx = (pd * output_height + ph) * output_width + pw; - T ele = pool_process.initial(); - for (int d = dstart; d < dend; ++d) { - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - pool_process.compute( - input_data[(d * input_height + h) * input_width + w], - &ele); - } - } - } - int pool_size = - (exclusive || adaptive) - ? (dend - dstart) * (hend - hstart) * (wend - wstart) - : ksize_depth * ksize_height * ksize_width; - pool_process.finalize(static_cast(pool_size), &ele); - output_data[output_idx] = ele; - } - } - } - input_data += input_stride; - output_data += output_stride; - } - } - } -}; - -/* - * All tensors are in NCDHW format. - * Ksize, strides, paddings are three elements. These three elements represent - * depth, height and width, respectively. 
- */ -template -class Pool3dGradFunctor { - public: - void operator()(const lite::X86Context& context, - const lite::Tensor& input, - const lite::Tensor& output, - const lite::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - PoolProcess pool_grad_process, - bool exclusive, - bool adaptive, - lite::Tensor* input_grad) { - const int batch_size = input.dims()[0]; - const int input_depth = input.dims()[2]; - const int input_height = input.dims()[3]; - const int input_width = input.dims()[4]; - const int output_channels = output.dims()[1]; - const int output_depth = output.dims()[2]; - const int output_height = output.dims()[3]; - const int output_width = output.dims()[4]; - const int ksize_depth = ksize[0]; - const int ksize_height = ksize[1]; - const int ksize_width = ksize[2]; - const int stride_depth = strides[0]; - const int stride_height = strides[1]; - const int stride_width = strides[2]; - const int padding_depth = paddings[0]; - const int padding_height = paddings[1]; - const int padding_width = paddings[2]; - const int input_stride = input_depth * input_height * input_width; - const int output_stride = output_depth * output_height * output_width; - - const T* input_data = input.data(); - const T* output_data = output.data(); - const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(lite::TargetType::kX86); - - int dstart, dend; - int hstart, hend; - int wstart, wend; - for (int i = 0; i < batch_size; i++) { - for (int c = 0; c < output_channels; ++c) { - for (int pd = 0; pd < output_depth; ++pd) { - if (adaptive) { - dstart = AdaptStartIndex(pd, input_depth, output_depth); - dend = AdaptEndIndex(pd, input_depth, output_depth); - } else { - dstart = pd * stride_depth - padding_depth; - dend = std::min(dstart + ksize_depth, input_depth); - dstart = std::max(dstart, 0); - } - for (int ph = 0; ph < output_height; ++ph) { - if (adaptive) { - hstart = AdaptStartIndex(ph, input_height, output_height); - hend = AdaptEndIndex(ph, input_height, output_height); - } else { - hstart = ph * stride_height - padding_height; - hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); - } - for (int pw = 0; pw < output_width; ++pw) { - if (adaptive) { - wstart = AdaptStartIndex(pw, input_width, output_width); - wend = AdaptEndIndex(pw, input_width, output_width); - } else { - wstart = pw * stride_width - padding_width; - wend = std::min(wstart + ksize_width, input_width); - wstart = std::max(wstart, 0); - } - - int pool_size = - (exclusive || adaptive) - ? (dend - dstart) * (hend - hstart) * (wend - wstart) - : ksize_depth * ksize_height * ksize_width; - float scale = 1.0 / pool_size; - for (int d = dstart; d < dend; ++d) { - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - int input_idx = (d * input_height + h) * input_width + w; - int output_idx = - (pd * output_height + ph) * output_width + pw; - pool_grad_process.compute(input_data[input_idx], - output_data[output_idx], - output_grad_data[output_idx], - static_cast(scale), - input_grad_data + input_idx); - } - } - } - } - } - } - input_data += input_stride; - output_data += output_stride; - input_grad_data += input_stride; - output_grad_data += output_stride; - } - } - } -}; - -/* - * All tensors are in NCDHW format. - * Ksize, strides, paddings are three elements. These three elements represent - * depth, height and width, respectively. 
- */ -template -class MaxPool3dGradFunctor { - public: - void operator()(const lite::X86Context& context, - const lite::Tensor& input, - const lite::Tensor& output, - const lite::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - lite::Tensor* input_grad) { - const int batch_size = input.dims()[0]; - const int input_depth = input.dims()[2]; - const int input_height = input.dims()[3]; - const int input_width = input.dims()[4]; - const int output_channels = output.dims()[1]; - const int output_depth = output.dims()[2]; - const int output_height = output.dims()[3]; - const int output_width = output.dims()[4]; - const int ksize_depth = ksize[0]; - const int ksize_height = ksize[1]; - const int ksize_width = ksize[2]; - const int stride_depth = strides[0]; - const int stride_height = strides[1]; - const int stride_width = strides[2]; - const int padding_depth = paddings[0]; - const int padding_height = paddings[1]; - const int padding_width = paddings[2]; - const int input_stride = input_depth * input_height * input_width; - const int output_stride = output_depth * output_height * output_width; - - const T* input_data = input.data(); - const T* output_data = output.data(); - const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(lite::TargetType::kX86); - - for (int i = 0; i < batch_size; i++) { - for (int c = 0; c < output_channels; ++c) { - for (int pd = 0; pd < output_depth; ++pd) { - int dstart = pd * stride_depth - padding_depth; - int dend = std::min(dstart + ksize_depth, input_depth); - dstart = std::max(dstart, 0); - for (int ph = 0; ph < output_height; ++ph) { - int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); - for (int pw = 0; pw < output_width; ++pw) { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); - wstart = std::max(wstart, 0); - bool stop = false; - for (int d = dstart; d < dend && !stop; ++d) { - for (int h = hstart; h < hend && !stop; ++h) { - for (int w = wstart; w < wend && !stop; ++w) { - int input_idx = (d * input_height + h) * input_width + w; - int output_idx = - (pd * output_height + ph) * output_width + pw; - - if (input_data[input_idx] == output_data[output_idx]) { - input_grad_data[input_idx] += - output_grad_data[output_idx]; - stop = true; - } - } - } - } - } - } - } - input_data += input_stride; - output_data += output_stride; - input_grad_data += input_stride; - output_grad_data += output_stride; - } - } - } -}; - -template class MaxPool3dGradFunctor; -template class MaxPool3dGradFunctor; - -template class Pool3dFunctor, - float>; -template class Pool3dFunctor, - float>; -template class Pool3dGradFunctor, - float>; -template class Pool3dGradFunctor, - float>; -template class Pool3dFunctor, - double>; -template class Pool3dFunctor, - double>; -template class Pool3dGradFunctor, - double>; -template class Pool3dGradFunctor, - double>; - -/* - * All tensors are in NCHW format. - * Ksize, strides, paddings are two elements. These two elements represent - * height and width, respectively. 
- */ -template -class MaxPool2dWithIndexFunctor { - public: - void operator()(const lite::X86Context& context, - const lite::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - bool adaptive, - lite::Tensor* output, - lite::Tensor* mask) { - const int batch_size = input.dims()[0]; - const int input_height = input.dims()[2]; - const int input_width = input.dims()[3]; - const int output_channels = output->dims()[1]; - const int output_height = output->dims()[2]; - const int output_width = output->dims()[3]; - const int ksize_height = ksize[0]; - const int ksize_width = ksize[1]; - const int stride_height = strides[0]; - const int stride_width = strides[1]; - const int padding_height = paddings[0]; - const int padding_width = paddings[1]; - const int input_stride = input_height * input_width; - const int output_stride = output_height * output_width; - - const T1* input_data = input.data(); - T1* output_data = output->mutable_data(lite::TargetType::kX86); - T2* mask_data = mask->mutable_data(lite::TargetType::kX86); - - int hstart, hend; - int wstart, wend; - for (int i = 0; i < batch_size; i++) { - for (int c = 0; c < output_channels; ++c) { - for (int ph = 0; ph < output_height; ++ph) { - if (adaptive) { - hstart = AdaptStartIndex(ph, input_height, output_height); - hend = AdaptEndIndex(ph, input_height, output_height); - } else { - hstart = ph * stride_height - padding_height; - hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); - } - for (int pw = 0; pw < output_width; ++pw) { - if (adaptive) { - wstart = AdaptStartIndex(pw, input_width, output_width); - wend = AdaptEndIndex(pw, input_width, output_width); - } else { - wstart = pw * stride_width - padding_width; - wend = std::min(wstart + ksize_width, input_width); - wstart = std::max(wstart, 0); - } - - T1 ele = static_cast(-FLT_MAX); - int index = -1; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - if (ele < input_data[h * input_width + w]) { - ele = input_data[h * input_width + w]; - index = h * input_width + w; - } - } - } - output_data[ph * output_width + pw] = ele; - mask_data[ph * output_width + pw] = index; - } - } - // offset - input_data += input_stride; - output_data += output_stride; - mask_data += output_stride; - } - } - } -}; - -/* - * All tensors are in NCHW format. - * Ksize, strides, paddings are two elements. These two elements represent - * height and width, respectively. 
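Because the forward functor above stores in mask the flat argmax index of every output cell within its (n, c) plane, the backward pass needs no window search at all; the gradient functor defined next is, per plane, just this scatter-add (standalone sketch):

// dY flows straight to the recorded argmax positions: dX[mask[o]] += dY[o].
void scatter_grad(const float* dy, const int* mask, float* dx, int out_elems) {
  for (int o = 0; o < out_elems; ++o) {
    dx[mask[o]] += dy[o];
  }
}

The same mask is what makes max-unpooling possible later, since the argmax positions would otherwise be lost after pooling.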
- */ -template -class MaxPool2dWithIndexGradFunctor { - public: - void operator()(const lite::X86Context& context, - const lite::Tensor& output_grad, - const lite::Tensor& mask, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - bool adaptive, - lite::Tensor* input_grad) { - const int batch_size = input_grad->dims()[0]; - const int input_height = input_grad->dims()[2]; - const int input_width = input_grad->dims()[3]; - const int output_channels = output_grad.dims()[1]; - const int output_height = output_grad.dims()[2]; - const int output_width = output_grad.dims()[3]; - const int input_stride = input_height * input_width; - const int output_stride = output_height * output_width; - - const T2* mask_data = mask.data(); - const T1* output_grad_data = output_grad.data(); - T1* input_grad_data = input_grad->mutable_data(lite::TargetType::kX86); - - for (int n = 0; n < batch_size; ++n) { - for (int c = 0; c < output_channels; ++c) { - for (int ph = 0; ph < output_height; ++ph) { - for (int pw = 0; pw < output_width; ++pw) { - const int output_idx = ph * output_width + pw; - const int input_idx = static_cast(mask_data[output_idx]); - input_grad_data[input_idx] += output_grad_data[output_idx]; - } - } - // offset - input_grad_data += input_stride; - output_grad_data += output_stride; - mask_data += output_stride; - } - } - } -}; - -template class MaxPool2dWithIndexFunctor; -template class MaxPool2dWithIndexGradFunctor; -template class MaxPool2dWithIndexFunctor; -template class MaxPool2dWithIndexGradFunctor; - -/* - * All tensors are in NCDHW format. - * Ksize, strides, paddings are three elements. These three elements represent - * depth, height and width, respectively. - */ -template -class MaxPool3dWithIndexFunctor { - public: - void operator()(const lite::X86Context& context, - const lite::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - bool adaptive, - lite::Tensor* output, - lite::Tensor* mask) { - const int batch_size = input.dims()[0]; - const int input_depth = input.dims()[2]; - const int input_height = input.dims()[3]; - const int input_width = input.dims()[4]; - const int output_channels = output->dims()[1]; - const int output_depth = output->dims()[2]; - const int output_height = output->dims()[3]; - const int output_width = output->dims()[4]; - const int ksize_depth = ksize[0]; - const int ksize_height = ksize[1]; - const int ksize_width = ksize[2]; - const int stride_depth = strides[0]; - const int stride_height = strides[1]; - const int stride_width = strides[2]; - const int padding_depth = paddings[0]; - const int padding_height = paddings[1]; - const int padding_width = paddings[2]; - const int input_stride = input_depth * input_height * input_width; - const int output_stride = output_depth * output_height * output_width; - - const T1* input_data = input.data(); - T1* output_data = output->mutable_data(lite::TargetType::kX86); - T2* mask_data = mask->mutable_data(lite::TargetType::kX86); - - int dstart, dend; - int hstart, hend; - int wstart, wend; - for (int i = 0; i < batch_size; i++) { - for (int c = 0; c < output_channels; ++c) { - for (int pd = 0; pd < output_depth; ++pd) { - if (adaptive) { - dstart = AdaptStartIndex(pd, input_depth, output_depth); - dend = AdaptEndIndex(pd, input_depth, output_depth); - } else { - dstart = pd * stride_depth - padding_depth; - dend = std::min(dstart + ksize_depth, input_depth); - dstart = std::max(dstart, 0); - } - for (int ph = 0; ph < 
output_height; ++ph) { - if (adaptive) { - hstart = AdaptStartIndex(ph, input_height, output_height); - hend = AdaptEndIndex(ph, input_height, output_height); - } else { - hstart = ph * stride_height - padding_height; - hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); - } - for (int pw = 0; pw < output_width; ++pw) { - if (adaptive) { - wstart = AdaptStartIndex(pw, input_width, output_width); - wend = AdaptEndIndex(pw, input_width, output_width); - } else { - wstart = pw * stride_width - padding_width; - wend = std::min(wstart + ksize_width, input_width); - wstart = std::max(wstart, 0); - } - - int output_idx = (pd * output_height + ph) * output_width + pw; - T1 ele = static_cast(-FLT_MAX); - int index = -1; - for (int d = dstart; d < dend; ++d) { - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - int input_idx = (d * input_height + h) * input_width + w; - if (ele < input_data[input_idx]) { - index = input_idx; - ele = input_data[input_idx]; - } - } - } - } - output_data[output_idx] = ele; - mask_data[output_idx] = index; - } - } - } - // offset - input_data += input_stride; - output_data += output_stride; - mask_data += output_stride; - } - } - } -}; - -/* - * All tensors are in NCDHW format. - * Ksize, strides, paddings are three elements. These three elements represent - * depth, height and width, respectively. - */ -template -class MaxPool3dWithIndexGradFunctor { - public: - void operator()(const lite::X86Context& context, - const lite::Tensor& output_grad, - const lite::Tensor& mask, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - bool adaptive, - lite::Tensor* input_grad) { - const int batch_size = input_grad->dims()[0]; - const int input_depth = input_grad->dims()[2]; - const int input_height = input_grad->dims()[3]; - const int input_width = input_grad->dims()[4]; - const int output_channels = output_grad.dims()[1]; - const int output_depth = output_grad.dims()[2]; - const int output_height = output_grad.dims()[3]; - const int output_width = output_grad.dims()[4]; - const int input_stride = input_depth * input_height * input_width; - const int output_stride = output_depth * output_height * output_width; - - const T2* mask_data = mask.data(); - const T1* output_grad_data = output_grad.data(); - T1* input_grad_data = input_grad->mutable_data(lite::TargetType::kX86); - - for (int n = 0; n < batch_size; ++n) { - for (int c = 0; c < output_channels; ++c) { - for (int pd = 0; pd < output_depth; ++pd) { - for (int ph = 0; ph < output_height; ++ph) { - for (int pw = 0; pw < output_width; ++pw) { - const int output_idx = - (pd * output_height + ph) * output_width + pw; - const int input_idx = static_cast(mask_data[output_idx]); - input_grad_data[input_idx] += output_grad_data[output_idx]; - } - } - } - // offset - input_grad_data += input_stride; - output_grad_data += output_stride; - mask_data += output_stride; - } - } - } -}; - -template class MaxPool3dWithIndexFunctor; -template class MaxPool3dWithIndexGradFunctor; -template class MaxPool3dWithIndexFunctor; -template class MaxPool3dWithIndexGradFunctor; -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/pooling.h b/lite/backends/x86/math/pooling.h deleted file mode 100644 index 64015e32c8..0000000000 --- a/lite/backends/x86/math/pooling.h +++ /dev/null @@ -1,258 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <vector>
-#include "lite/core/context.h"
-#include "lite/core/tensor.h"
-#include "lite/fluid/eigen.h"
-#include "lite/utils/macros.h"
-
-namespace paddle {
-namespace lite {
-namespace x86 {
-namespace math {
-
-/*
- * \brief Extracting simple operations from pooling.
- * Both MaxPool and AvgPool need "initial", "compute" and "finalize"
- * operations.
- * MaxPool initializes the temp variable to the negative maximum to find the
- * maximum value in the pooling field.
- * AvgPool initializes the temp variable to zero to accumulate all values
- * in the pooling window, and finally takes the average.
- * MaxPoolGrad and AvgPoolGrad are the corresponding gradient operations.
- */
-template <class T>
-class MaxPool {
- public:
-  DEVICE inline T initial() { return static_cast<T>(-FLT_MAX); }
-  DEVICE inline void compute(const T& x, T* y) { *y = *y > x ? *y : x; }
-  DEVICE inline void finalize(const T& pool_field, T* y) {}
-};
-
-template <class T>
-class AvgPool {
- public:
-  DEVICE inline T initial() { return static_cast<T>(0); }
-  DEVICE inline void compute(const T& x, T* y) { *y += x; }
-  DEVICE inline void finalize(const T& pool_field, T* y) { *y /= pool_field; }
-};
-
-template <class T>
-class MaxPoolGrad {
- public:
-  DEVICE inline void compute(
-      const T& x, const T& y, const T& dy, T scale, T* dx) {
-    *dx += dy * (x == y);
-  }
-};
-
-template <class T>
-class AvgPoolGrad {
- public:
-  DEVICE inline void compute(
-      const T& x, const T& y, const T& dy, T scale, T* dx) {
-    *dx += (scale * dy);
-  }
-};
-
-/* used for adaptive pool to calculate start and end index of each divided grid
- */
-HOSTDEVICE inline int AdaptStartIndex(int ph, int input_size, int output_size) {
-  return static_cast<int>(
-      floor(static_cast<double>(ph * input_size) / output_size));
-}
-
-HOSTDEVICE inline int AdaptEndIndex(int ph, int input_size, int output_size) {
-  return static_cast<int>(
-      ceil(static_cast<double>((ph + 1) * input_size) / output_size));
-}
-
-/*
- * \brief Getting pooling results, and calculating gradient.
- *
- * In pool2d, all tensors are in NCHW format. Where N is batch size, C is the
- * number of channels, H and W is the height and width of feature.
- * In pool3d, all tensors are in NCDHW format. Where N is batch size, C is the
- * number of channels, D, H and W is the depth, height and width of feature.
- *
- * In max pooling, it is possible that the pooling region has multiple maximum
- * elements. In this case, we should compute the gradient of the first maximum
- * element.
- * This is different from average pooling. So we rewrite the max_pool_grad:
- * MaxPool2dGradFunctor, MaxPool3dGradFunctor.
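AdaptStartIndex and AdaptEndIndex divide an input extent into output_size nearly equal spans, start = floor(i * in / out) and end = ceil((i + 1) * in / out); together the spans cover every input element and may overlap by one at the seams. A quick standalone check:

#include <cmath>
#include <cstdio>

int main() {
  int in = 10, out = 3;
  for (int i = 0; i < out; ++i) {
    int s = static_cast<int>(std::floor(static_cast<double>(i) * in / out));
    int e = static_cast<int>(std::ceil(static_cast<double>(i + 1) * in / out));
    std::printf("grid %d -> [%d, %d)\n", i, s, e);  // [0,4) [3,7) [6,10)
  }
}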
- */ -//#ifdef PADDLE_WITH_CUDA -// template -// class Pool2dDirectCUDAFunctor { -// public: -// void operator()(const T* input, const std::vector& input_shape, -// const std::vector& output_shape, -// const std::vector& ksize, -// const std::vector& strides, -// const std::vector& paddings, PoolProcess pool_compute, -// bool exclusive, T* output, cudaStream_t stream); -//}; -//#endif - -template -class Pool2dFunctor { - public: - void operator()(const lite::Context& context, - const lite::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - PoolProcess pool_compute, - bool exclusive, - bool adaptive, - lite::Tensor* output); -}; - -template -class Pool2dGradFunctor { - public: - void operator()(const lite::Context& context, - const lite::Tensor& input, - const lite::Tensor& output, - const lite::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - PoolProcess pool_compute, - bool exclusive, - bool adaptive, - lite::Tensor* input_grad); -}; - -template -class MaxPool2dGradFunctor { - public: - void operator()(const lite::Context& context, - const lite::Tensor& input, - const lite::Tensor& output, - const lite::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - lite::Tensor* input_grad); -}; - -template -class Pool3dFunctor { - public: - void operator()(const lite::Context& context, - const lite::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - PoolProcess pool_compute, - bool exclusive, - bool adaptive, - lite::Tensor* output); -}; - -template -class Pool3dGradFunctor { - public: - void operator()(const lite::Context& context, - const lite::Tensor& input, - const lite::Tensor& output, - const lite::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - PoolProcess pool_compute, - bool exclusive, - bool adaptive, - lite::Tensor* input_grad); -}; - -template -class MaxPool3dGradFunctor { - public: - void operator()(const lite::Context& context, - const lite::Tensor& input, - const lite::Tensor& output, - const lite::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - lite::Tensor* input_grad); -}; - -/* - * \brief Getting max pooling results and corresponding max index, and - * calculating gradient. - * In up-sampling-pooling, it is necessary to know max element index. - * In pool2d, all tensors are in NCHW format. In pool3d, all tensors are in - * NCDHW format. 
- */ -template -class MaxPool2dWithIndexFunctor { - public: - void operator()(const lite::Context& context, - const lite::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - bool adaptive, - lite::Tensor* output, - lite::Tensor* mask); -}; - -template -class MaxPool2dWithIndexGradFunctor { - public: - void operator()(const lite::Context& context, - const lite::Tensor& output_grad, - const lite::Tensor& mask, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - bool adaptive, - lite::Tensor* input_grad); -}; - -template -class MaxPool3dWithIndexFunctor { - public: - void operator()(const lite::Context& context, - const lite::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - bool adaptive, - lite::Tensor* output, - lite::Tensor* mask); -}; - -template -class MaxPool3dWithIndexGradFunctor { - public: - void operator()(const lite::Context& context, - const lite::Tensor& output_grad, - const lite::Tensor& mask, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - bool adaptive, - lite::Tensor* input_grad); -}; - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/prelu.h b/lite/backends/x86/math/prelu.h deleted file mode 100644 index 049397c72c..0000000000 --- a/lite/backends/x86/math/prelu.h +++ /dev/null @@ -1,51 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "lite/backends/x86/math/math_function.h" -// #include "paddle/fluid/platform/cudnn_helper.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -// #ifdef PADDLE_WITH_CUDA -// template -// class PreluChannelWiseDirectCUDAFunctor { -// public: -// void operator()(cudaStream_t stream, const T *input, const T *alpha, -// T *output, std::vector input_shape); -// }; -// -// template -// class PreluElementWiseDirectCUDAFunctor { -// public: -// void operator()(cudaStream_t stream, const T *input, const T *alpha, -// T *output, std::vector input_shape); -// }; -// -// template -// class PreluScalarDirectCUDAFunctor { -// public: -// void operator()(cudaStream_t stream, const T *input, const T *alpha, -// T *output, std::vector input_shape); -// }; -// #endif - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/sample_prob.cc b/lite/backends/x86/math/sample_prob.cc deleted file mode 100644 index ecf1ca8e1a..0000000000 --- a/lite/backends/x86/math/sample_prob.cc +++ /dev/null @@ -1,28 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "lite/backends/x86/math/sample_prob.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -template class SampleWithProb; -template class SampleWithProb; - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/sample_prob.h b/lite/backends/x86/math/sample_prob.h deleted file mode 100644 index 5312b3df10..0000000000 --- a/lite/backends/x86/math/sample_prob.h +++ /dev/null @@ -1,128 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include "lite/backends/x86/math/sampler.h" -#include "lite/core/context.h" -#include "lite/core/tensor.h" -#include "lite/fluid/eigen.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -/* UNDERSTAND: utility function to adjust probability for unique sampling, -return whatever as it is if not using unique samping */ -template -static T adjust_prob(const T prob, const int num_samples, const int num_tries) { - if (num_samples == num_tries) { - return prob * num_samples; - } else { - return -expm1(num_tries * log1p(-prob)); - } -} - -template -class SampleWithProb { - public: - void operator()(const lite::Context& context, - const Sampler& sampler, - const std::size_t num_samples, - const lite::Tensor* L, - lite::Tensor* S, - lite::Tensor* P) { - // UNDERSTAND: dimension issues - const auto lbl_dim = L->dims(); - const int batch_size = lbl_dim[0]; - const int num_true = lbl_dim[1]; - const int num_sampled_classes = num_true + num_samples; - // std::vector ret_dim_vec = {batch_size, num_sampled_classes}; - // lite::DDim ret_dim(ret_dim_vec); - - // UNDERSTAND: raw data view - const int64_t* label_data = L->data(); - // int64_t* samples_data = - // S->mutable_data(ret_dim, Target); - // T* probabilities_data = P->mutable_data(ret_dim, Target); - S->Resize({batch_size, num_sampled_classes}); - auto* samples_data = S->mutable_data(Target); - P->Resize({batch_size, num_sampled_classes}); - auto* probabilities_data = P->mutable_data(Target); - - // temp sets for unique sampling - std::unordered_set tmp_samples; - int j = 0; // column index - // add true labels, not that efficient - while (j < num_true) { - for (int i = 0; i < batch_size; ++i) { - auto samples_index = i * num_sampled_classes + j; - auto v = label_data[i * num_true + j]; - samples_data[samples_index] = v; - probabilities_data[samples_index] = sampler.Probability(v); - } - ++j; - } - - // sample num_samles unique samples for an example, note that they are not - // all negative samples 
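adjust_prob above converts a single-draw probability into what sampled softmax needs: with no rejections (num_samples == num_tries) the expected count prob * num_samples, otherwise -expm1(num_tries * log1p(-prob)), i.e. 1 - (1 - prob)^num_tries, the probability of hitting the class at least once in num_tries draws, evaluated stably for tiny prob. A standalone numeric check of why the stable form matters:

#include <cmath>
#include <cstdio>

int main() {
  double p = 1e-12;
  int tries = 1000;
  // The naive form cancels catastrophically near 1; expm1/log1p does not.
  std::printf("naive : %.17g\n", 1.0 - std::pow(1.0 - p, tries));
  std::printf("stable: %.17g\n", -std::expm1(tries * std::log1p(-p)));
}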
- tmp_samples.clear(); - int num_tries = 0; - while (j < num_sampled_classes) { - ++num_tries; - auto v = sampler.Sample(); - auto insert_ok = tmp_samples.insert(v).second; - if (!insert_ok) { - continue; - } - auto p = sampler.Probability(v); - for (int i = 0; i < batch_size; ++i) { - auto samples_index = i * num_sampled_classes + j; - samples_data[samples_index] = v; - probabilities_data[samples_index] = p; - } - ++j; - } - - // compute Q(y|x), because of unique sampling, probabilities need to be - // adjusted - for (int k = 0; k < num_sampled_classes; ++k) { - for (int i = 0; i < batch_size; ++i) { - auto samples_index = i * num_sampled_classes + k; - probabilities_data[samples_index] = adjust_prob( - probabilities_data[samples_index], num_samples, num_tries); - } - } - } -}; - -// #ifdef PADDLE_WITH_CUDA -// template -// class GPUSampleWithProb { -// public: -// void operator()(const platform::CUDAlite::Context& context, const -// int seed, -// const int dict_size, const bool uniq, -// const std::size_t num_samples, const lite::Tensor* L, -// lite::Tensor* S, -// lite::Tensor* P); -// }; -// #endif -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/sampler.cc b/lite/backends/x86/math/sampler.cc deleted file mode 100644 index 1246806372..0000000000 --- a/lite/backends/x86/math/sampler.cc +++ /dev/null @@ -1,102 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "lite/backends/x86/math/sampler.h" -#include -#include -#include -#include -#include "lite/utils/cp_logging.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -Sampler::~Sampler() {} - -UniformSampler::UniformSampler(int64_t range, unsigned int seed) - : Sampler(range, seed), inv_range_(1.0 / (range + 1)) { - random_engine_ = std::make_shared(seed_); - dist_ = std::make_shared>(0, range); -} - -int64_t UniformSampler::Sample() const { return (*dist_)(*random_engine_); } - -float UniformSampler::Probability(int64_t value) const { return inv_range_; } - -LogUniformSampler::LogUniformSampler(int64_t range, unsigned int seed) - : Sampler(range, seed), log_range_(log(range + 1)) { - random_engine_ = std::make_shared(seed_); - dist_ = std::make_shared>(0, 1); -} - -int64_t LogUniformSampler::Sample() const { - // Got Log Uniform distribution from uniform distribution by - // inverse_transform_sampling method - // More details: - // https://wanghaoshuang.github.io/2017/11/Log-uniform-distribution-sampler/ - const int64_t value = - static_cast(exp((*dist_)(*random_engine_) * log_range_)) - 1; - // Mathematically, value should be <= range_, but might not be due to some - // floating point roundoff, so we mod by range_. 
- return value % range_; -} - -float LogUniformSampler::Probability(int64_t value) const { - // Given f(x) = 1/[(x+1) * log_range_] - // The value's probability is integral of f(x) from value to (value + 1) - // More details: - // https://wanghaoshuang.github.io/2017/11/Log-uniform-distribution-sampler - return (log((value + 2.0) / (value + 1.0))) / log_range_; -} - -CustomSampler::CustomSampler(int64_t range, - const float *probabilities, - const int *alias, - const float *alias_probabilities, - unsigned int seed) - : Sampler(range, seed) { - random_engine_ = std::make_shared(seed_); - real_dist_ = std::make_shared>(0, 1); - int_dist_ = std::make_shared>(0, range); - - alias_probs_ = alias_probabilities; - probs_ = probabilities; - alias_ = alias; -} - -int64_t CustomSampler::Sample() const { - auto index = (*int_dist_)(*random_engine_); - auto p = (*real_dist_)(*random_engine_); - if (p > alias_probs_[index]) { - int alias = alias_[index]; - - if (alias == exceptional_val) { - LOG(WARNING) << "WARNING: CustomSampler get alias " << exceptional_val; - return index; - } - - return alias; - } else { - return index; - } -} - -float CustomSampler::Probability(int64_t value) const { return probs_[value]; } - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/sampler.h b/lite/backends/x86/math/sampler.h deleted file mode 100644 index efd9e48e54..0000000000 --- a/lite/backends/x86/math/sampler.h +++ /dev/null @@ -1,131 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -// TODO(wanghaoshuang): Support for GPU - -/** -* Sample integers from [0, range). -*/ -class Sampler { - public: - explicit Sampler(int64_t range, unsigned int seed = 0UL) : range_(range) { - // PADDLE_ENFORCE_GT(range, 0, "Range should be greater than 0."); - if (seed == 0) { - std::random_device r; - seed_ = r(); - } else { - seed_ = seed; - } - } - - virtual ~Sampler(); - - // Sample a single value - virtual int64_t Sample() const = 0; - - // The probability that a single call to Sample() returns the given value. - virtual float Probability(int64_t value) const = 0; - - int64_t range() { return range_; } - - protected: - const int64_t range_; - unsigned int seed_; -}; - -/** - * Sample integers from [0, range). - * And the distribution function is: - * P(x) = 1 / range - */ -class UniformSampler : public Sampler { - public: - explicit UniformSampler(int64_t range, unsigned int seed = 0UL); - - ~UniformSampler() override {} - - int64_t Sample() const override; - - float Probability(int64_t value) const override; - - private: - const float inv_range_; - std::shared_ptr random_engine_; - std::shared_ptr> dist_; -}; - -/** - * Sample integers from [0, range). 
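CustomSampler::Sample above is one draw of the Walker/Vose alias method: pick a column uniformly, then let a biased coin against alias_probs_ choose between the column and its stored alias, which yields O(1) samples from an arbitrary discrete distribution once the two tables exist (building them happens outside this class). The same draw as a sketch over prebuilt tables:

#include <cstdint>
#include <random>
#include <vector>

int64_t alias_draw(const std::vector<float>& alias_probs,
                   const std::vector<int>& alias, std::mt19937_64* rng) {
  std::uniform_int_distribution<int64_t> pick_col(0, alias_probs.size() - 1);
  std::uniform_real_distribution<float> coin(0.f, 1.f);
  int64_t col = pick_col(*rng);
  // Keep the column with probability alias_probs[col], else take its alias.
  return coin(*rng) <= alias_probs[col] ? col : alias[col];
}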
- * And the distribution function is: - * P(x) = (1/ln(range+1)) * ln(1 + 1/(x + 1)) - */ -class LogUniformSampler : public Sampler { - public: - explicit LogUniformSampler(int64_t range, unsigned int seed = 0UL); - - ~LogUniformSampler() override {} - - int64_t Sample() const override; - - float Probability(int64_t value) const override; - - private: - const float log_range_; - std::shared_ptr random_engine_; - std::shared_ptr> dist_; -}; - -/** - * Sample integers from [0, range) from custom distribution. - */ -class CustomSampler : public Sampler { - public: - explicit CustomSampler(int64_t range, - const float* probabilities, - const int* alias, - const float* alias_probabilities, - unsigned int seed = 0UL); - - ~CustomSampler() override {} - - int64_t Sample() const override; - - float Probability(int64_t value) const override; - - private: - const float* alias_probs_; - const int* alias_; - const float* probs_; - const int exceptional_val = -1; - std::shared_ptr random_engine_; - std::shared_ptr> real_dist_; - std::shared_ptr> int_dist_; -}; - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/sequence2batch.cc b/lite/backends/x86/math/sequence2batch.cc deleted file mode 100644 index ff215781f1..0000000000 --- a/lite/backends/x86/math/sequence2batch.cc +++ /dev/null @@ -1,67 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "lite/backends/x86/math/sequence2batch.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -template -class CopyMatrixRowsFunctor { - public: - void operator()(const lite::Context& context, - const lite::Tensor& src, - std::vector index_lod, - lite::Tensor* dst, - bool is_src_index) { - size_t* index = index_lod.data(); - auto src_dims = src.dims(); - auto dst_dims = dst->dims(); - PADDLE_ENFORCE_EQ( - src_dims.size(), 2UL, "The src must be matrix with rank 2."); - PADDLE_ENFORCE_EQ( - dst_dims.size(), 2UL, "The dst must be matrix with rank 2."); - PADDLE_ENFORCE_EQ( - src_dims[1], dst_dims[1], "The width of src and dst must be same."); - auto height = dst_dims[0]; - auto width = dst_dims[1]; - auto* src_data = src.data(); - auto* dst_data = dst->mutable_data(); - const int sz = width * sizeof(T); - if (is_src_index) { - for (int i = 0; i < height; ++i) { - memcpy(dst_data + i * width, src_data + index[i] * width, sz); - } - } else { - for (int i = 0; i < height; ++i) { - memcpy(dst_data + index[i] * width, src_data + i * width, sz); - } - } - } -}; - -template class CopyMatrixRowsFunctor; -template class CopyMatrixRowsFunctor; - -template class LoDTensor2BatchFunctor; -template class LoDTensor2BatchFunctor; -template class Batch2LoDTensorFunctor; -template class Batch2LoDTensorFunctor; - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/sequence2batch.h b/lite/backends/x86/math/sequence2batch.h deleted file mode 100644 index 807558e9d8..0000000000 --- a/lite/backends/x86/math/sequence2batch.h +++ /dev/null @@ -1,190 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include - -#include "lite/core/context.h" -#include "lite/core/tensor.h" -#include "lite/fluid/eigen.h" -#include "lite/fluid/lod.h" -#include "lite/utils/paddle_enforce.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -template -using EigenMatrix = lite::fluid::EigenMatrix; - -template -class CopyMatrixRowsFunctor { - public: - // If is_src_index is true, - // copy the indexed rows of input src to the output dst. - // If is_src_index is false, - // copy the input src to the indexed rows of output dst. - // The indexed rows are based on the input index. - void operator()(const lite::Context& context, - const lite::Tensor& src, - std::vector index_lod, - lite::Tensor* dst, - bool is_src_index); -}; - -template -class LoDTensor2BatchFunctor { - // Calculate the length of each sequence and - // sort sequence index by the length. 
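-  // Each SeqInfo entry below is a (start, length, seq_idx) triple, and
-  // seq_info is sorted by length in descending order, so the longest
-  // sequence comes first.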
-  //  example: sequences = {s0, s1, s2}
-  //           s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2
-  //           seq_info[3] = {(4, 5, 1), (0, 4, 0), (9, 3, 2)}
-  //
-  struct SeqInfo {
-    SeqInfo(int start, int length, int seq_idx)
-        : start(start), length(length), seq_idx(seq_idx) {}
-    int start;
-    int length;
-    int seq_idx;
-  };
-
- public:
-  void operator()(const lite::Context<Target>& context,
-                  const lite::Tensor& lod_tensor,
-                  lite::Tensor* batch,
-                  bool is_cal_batch_lod,
-                  bool is_reverse = false) const {
-    if (!is_cal_batch_lod) {
-      auto lods = batch->lod();
-      PADDLE_ENFORCE_GT(lods.size(),
-                        2UL,
-                        "The LoD of LoDTensor should include at least 2-level "
-                        "sequence information.");
-      PADDLE_ENFORCE_EQ(
-          lods[1].size(),
-          static_cast<size_t>(lod_tensor.dims()[0]),
-          "The LoD information should be consistent with the dims.");
-      CopyMatrixRowsFunctor<Target, T> to_batch;
-      to_batch(context, lod_tensor, lods[1], batch, true);
-      return;
-    }
-
-    auto lods = lod_tensor.lod();
-    PADDLE_ENFORCE_EQ(lods.size(), 1UL, "Only support one level sequence now.");
-
-    const auto& lod = lods[0];
-
-    std::vector<SeqInfo> seq_info;
-    for (size_t seq_id = 0; seq_id < lod.size() - 1; ++seq_id) {
-      int length = lod[seq_id + 1] - lod[seq_id];
-      seq_info.emplace_back(lod[seq_id], length, seq_id);
-    }
-
-    std::sort(seq_info.begin(), seq_info.end(), [](SeqInfo a, SeqInfo b) {
-      return a.length > b.length;
-    });
-
-    // Calculate the start position of each batch.
-    // example: sequences = {s0, s1, s2}
-    //          s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2
-    //          max_seqlen = 5,
-    //          batchIndex = {b0, b1, b2, b3, b4}
-    //          b0: 1 0 2, b1: 1 0 2, b2: 1 0 2, b3: 1 0, b4: 1
-    //          batch_start_positions[6] = {0, 3, 6, 9, 11, 12}
-    //             batch_start_positions[0] = 0
-    //             batch_start_positions[1] = len(b0)
-    //             batch_start_positions[2] = len(b0) + len(b1)
-    //             batch_start_positions[3] = len(b0) + len(b1) + len(b2)
-    //             ...
-    //          seq2batch_idx[12] = {4, 0, 9,
-    //                               5, 1, 10,
-    //                               6, 2, 11,
-    //                               7, 3,
-    //                               8}
-    //          seq_order = {1, 0, 2}, the sort order.
-    //              where 1 is the second sequence,
-    //                    0 is the first sequence,
-    //                    2 is the third sequence.
-    // The max_seqlen represents batch size after rearranging the
-    // input LoDTensor. It is also the maximum length of input sequence.
-
-    lite::LoD batch_lods;
-    batch_lods.emplace_back(std::vector<size_t>{0});
-    batch_lods.emplace_back(std::vector<size_t>{0});
-    batch_lods.emplace_back(std::vector<size_t>{0});
-
-    // batch_lods[0] is the start positions for batch LoDTensor
-    int max_seqlen = seq_info[0].length;
-    batch_lods[0].resize(static_cast<size_t>(max_seqlen + 1));
-    // batch_lods[1] is the raw index in the input LoDTensor
-    batch_lods[1].resize(static_cast<size_t>(lod_tensor.dims()[0]));
-    // batch_lods[2] is the sort order for the input LoDTensor.
-    batch_lods[2].resize(seq_info.size());
-
-    size_t* batch_starts = batch_lods[0].data();
-    size_t* seq2batch_idx = batch_lods[1].data();
-    batch_starts[0] = 0;
-    for (int n = 0; n < max_seqlen; n++) {
-      auto batch_id = static_cast<int>(batch_starts[n]);
-      for (size_t i = 0; i < seq_info.size(); ++i) {
-        int seq_len = seq_info[i].length;
-        int start = seq_info[i].start;
-        if (n < seq_len) {
-          seq2batch_idx[batch_id] =
-              is_reverse ? start + seq_len - 1 - n : start + n;
-          batch_id++;
-        } else {
-          break;
-        }
-      }
-      batch_starts[n + 1] = static_cast<size_t>(batch_id);
-    }
-    size_t* seq_order = batch_lods[2].data();
-    for (size_t i = 0; i < seq_info.size(); ++i) {
-      seq_order[i] = seq_info[i].seq_idx;
-    }
-    batch->set_lod(batch_lods);
-
-    CopyMatrixRowsFunctor<Target, T> to_batch;
-    to_batch(context, lod_tensor, batch_lods[1], batch, true);
-  }
-};
-
-template <lite::TargetType Target, typename T>
-class Batch2LoDTensorFunctor {
- public:
-  void operator()(const lite::Context<Target>& context,
-                  const lite::Tensor& batch,
-                  lite::Tensor* lod_tensor) const {
-    auto in_lod = batch.lod();
-    PADDLE_ENFORCE_GT(in_lod.size(),
-                      2UL,
-                      "The LoD of LoDTensor should include at least 2-level "
-                      "sequence information.");
-    PADDLE_ENFORCE_EQ(
-        in_lod[1].size(),
-        static_cast<size_t>(lod_tensor->dims()[0]),
-        "The LoD information should be consistent with the dims.");
-    CopyMatrixRowsFunctor<Target, T> to_seq;
-    to_seq(context, batch, in_lod[1], lod_tensor, false);
-  }
-};
-
-}  // namespace math
-}  // namespace x86
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/x86/math/sequence_padding.cc b/lite/backends/x86/math/sequence_padding.cc
deleted file mode 100644
index fbb6c11a5f..0000000000
--- a/lite/backends/x86/math/sequence_padding.cc
+++ /dev/null
@@ -1,187 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "lite/backends/x86/math/sequence_padding.h"
-
-namespace paddle {
-namespace lite {
-namespace x86 {
-namespace math {
-
-template <typename T>
-void CopyValidData(lite::Tensor* dst_tensor,
-                   const lite::Tensor* src_tensor,
-                   const std::vector<size_t>& seq_offsets,
-                   int pad_seq_len,
-                   int step_width,
-                   bool norm_by_len,
-                   CopyType type,
-                   PadLayout layout) {
-  int seq_num = seq_offsets.size() - 1;
-  const T* src_data = src_tensor->data<T>();
-  T* dst_data = dst_tensor->mutable_data<T>();
-
-  int seq_cpy_gap = step_width;
-  int pad_cpy_gap =
-      layout == kBatchLengthWidth ? step_width : seq_num * step_width;
-  for (int seq_idx = 0; seq_idx < seq_num; ++seq_idx) {
-    int valid_seq_len = seq_offsets[seq_idx + 1] - seq_offsets[seq_idx];
-    PADDLE_ENFORCE_GE(
-        pad_seq_len,
-        valid_seq_len,
-        "The padded sequence length can not be less than its original length.");
-    int seq_data_offset = seq_offsets[seq_idx] * step_width;
-    int pad_data_offset = layout == kBatchLengthWidth
-                              ? seq_idx * pad_seq_len * step_width
-                              : seq_idx * step_width;
-    float scale = 1.0f / static_cast<float>(valid_seq_len);
-
-    for (int step_idx = 0; step_idx < valid_seq_len; ++step_idx) {
-      const T* src =
-          src_data + (type == kSeqToPad ? seq_data_offset : pad_data_offset);
-      T* dst =
-          dst_data + (type == kSeqToPad ?
pad_data_offset : seq_data_offset); - memcpy(dst, src, step_width * sizeof(T)); - if (norm_by_len) { - for (int i = 0; i < step_width; ++i) { - *(dst + i) *= scale; - } - } - seq_data_offset += seq_cpy_gap; - pad_data_offset += pad_cpy_gap; - } - } -} - -template -static void fast_mem_init(void* dest, - size_t dest_size, - const T* src, - size_t num_bytes) { - if (dest == nullptr || dest_size == 0 || src == nullptr) return; - - memcpy(dest, src, num_bytes); - - dest_size *= num_bytes; - while (dest_size > num_bytes) { - size_t remaining = dest_size - num_bytes; - size_t count = (remaining > num_bytes) ? num_bytes : remaining; - memcpy((unsigned char*)dest + num_bytes, dest, count); - num_bytes += count; - } -} - -template -class PaddingLoDTensorFunctor { - public: - void operator()(const lite::Context& context, - const lite::Tensor& seq_tensor, - lite::Tensor* pad_tensor, - const lite::Tensor& pad_value, - int pad_seq_len = -1, - int lod_level = 0, - bool norm_by_times = false, - const PadLayout layout = kBatchLengthWidth) { - auto seq_lod = seq_tensor.lod(); - const auto seq_offsets = lite::fluid::ToAbsOffset(seq_lod)[lod_level]; - const auto& seq_tensor_dims = seq_tensor.dims(); - const auto& pad_tensor_dims = pad_tensor->dims(); - if (pad_seq_len == -1) { - pad_seq_len = MaximumSequenceLength(seq_offsets); - } - int step_width = seq_tensor.numel() / seq_tensor_dims[0]; - - CheckDims(seq_tensor_dims, - pad_tensor_dims, - seq_offsets, - pad_seq_len, - step_width, - layout); - PADDLE_ENFORCE(pad_value.numel() == 1 || pad_value.numel() == step_width, - "The numel of 'pad_value' can only be 1 or be equal to the " - "'step_width'."); - - // fill padding value - T* pad_data = pad_tensor->mutable_data(); - const T* pad_value_data = pad_value.data(); - if (pad_value.numel() == 1) { - fast_mem_init( - pad_data, pad_tensor->numel(), pad_value_data, sizeof(T)); - } else { - for (int i = 0; i < pad_tensor->numel(); i += step_width) { - memcpy(pad_data + i, pad_value_data, step_width * sizeof(T)); - } - } - - CopyValidData(pad_tensor, - &seq_tensor, - seq_offsets, - pad_seq_len, - step_width, - norm_by_times, - kSeqToPad, - layout); - } -}; - -template -class UnpaddingLoDTensorFunctor { - public: - void operator()(const lite::Context& context, - const lite::Tensor& pad_tensor, - lite::Tensor* seq_tensor, - int pad_seq_len = -1, - int lod_level = 0, - bool norm_by_times = false, - const PadLayout layout = kBatchLengthWidth) { - auto seq_offsets = lite::fluid::ToAbsOffset(seq_tensor->lod())[lod_level]; - const auto& seq_tensor_dims = seq_tensor->dims(); - const auto& pad_tensor_dims = pad_tensor.dims(); - if (pad_seq_len == -1) { - pad_seq_len = MaximumSequenceLength(seq_offsets); - } - int step_width = seq_tensor->numel() / seq_tensor_dims[0]; - - CheckDims(seq_tensor_dims, - pad_tensor_dims, - seq_offsets, - pad_seq_len, - step_width, - layout); - - CopyValidData(seq_tensor, - &pad_tensor, - seq_offsets, - pad_seq_len, - step_width, - norm_by_times, - kPadToSeq, - layout); - } -}; - -template class PaddingLoDTensorFunctor; -template class PaddingLoDTensorFunctor; -template class PaddingLoDTensorFunctor; -template class PaddingLoDTensorFunctor; - -template class UnpaddingLoDTensorFunctor; -template class UnpaddingLoDTensorFunctor; -template class UnpaddingLoDTensorFunctor; -template class UnpaddingLoDTensorFunctor; - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/sequence_padding.h b/lite/backends/x86/math/sequence_padding.h 
deleted file mode 100644 index a3f4512042..0000000000 --- a/lite/backends/x86/math/sequence_padding.h +++ /dev/null @@ -1,114 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include "lite/core/context.h" -#include "lite/core/tensor.h" -#include "lite/fluid/lod.h" -#include "lite/utils/paddle_enforce.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -enum PadLayout { kBatchLengthWidth = 0, kLengthBatchWidth }; - -enum CopyType { kSeqToPad, kPadToSeq }; - -inline static size_t MaximumSequenceLength( - const std::vector& seq_offset) { - size_t seq_num = seq_offset.size() - 1; - size_t max_seq_len = 0; - for (size_t i = 0; i < seq_num; ++i) { - max_seq_len = std::max(max_seq_len, seq_offset[i + 1] - seq_offset[i]); - } - return max_seq_len; -} - -inline static void CheckDims(const lite::DDim& seq_tensor_dims, - const lite::DDim& pad_tensor_dims, - const std::vector& seq_offset, - int64_t padded_seq_len, - int64_t step_width, - const PadLayout& layout) { - PADDLE_ENFORCE_EQ(static_cast(seq_tensor_dims[0]), - seq_offset.back(), - "Value of 1st dimension of the sequence tensor should be " - "equal to sum of lengths of all sequences."); - - PADDLE_ENFORCE(seq_tensor_dims.size() + 1 == pad_tensor_dims.size() || - seq_tensor_dims.size() == pad_tensor_dims.size(), - "pad_tensor's rank should be 1 greater than seq_tensor's " - "rank, or be equal with it."); -} - -/* - * \brief Padding/Unpadding LoDTensor to/from normal Tensor of the shape - * [max_sequence_length, num_sequences, sequence_width]. - * - * Padding sequence: - * padding[i] = seq[lod[level][i]] - * Unpadding sequence: - * seq[lod[level][i]] = padding[i] - * - * All sequences will be padded to the same length and stored in a transposed - * shape. - * Example: - * seq (s0, s0, s0, s0; s1, s1; s2, s2, s2; s3) - * padding (s0, s1, s2, s3; s0, s1, s2, 0; s0, 0, s2, 0; s0, 0, 0, 0) - * - * \param context device context of this functor. - * \param seq LoDTensor which is stored in sequence format, the shape - * is [total_sequence_length, sequence_width] where - * total_sequence_length is the sum of all sequences' - * length. - * \param padding Tensor which is padded to the same length, the shape is - * [max_sequence_length, num_sequences, sequence_width]. - * \param norm_by_times whether dividing sequence's length. - * - * \note transposition is also done in this functor. 
- */
-template <lite::TargetType Target, typename T>
-class PaddingLoDTensorFunctor {
- public:
-  void operator()(const lite::Context<Target>& context,
-                  const lite::Tensor& seq_tensor,
-                  lite::Tensor* pad_tensor,
-                  const lite::Tensor& pad_value,
-                  int pad_seq_len = -1,
-                  int lod_level = 0,
-                  bool norm_by_times = false,
-                  const PadLayout layout = kBatchLengthWidth);
-};
-
-template <lite::TargetType Target, typename T>
-class UnpaddingLoDTensorFunctor {
- public:
-  void operator()(const lite::Context<Target>& context,
-                  const lite::Tensor& pad_tensor,
-                  lite::Tensor* seq_tensor,
-                  int pad_seq_len = -1,
-                  int lod_level = 0,
-                  bool norm_by_times = false,
-                  const PadLayout layout = kBatchLengthWidth);
-};
-
-}  // namespace math
-}  // namespace x86
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/x86/math/sequence_pooling.cc b/lite/backends/x86/math/sequence_pooling.cc
deleted file mode 100644
index 186b8b5543..0000000000
--- a/lite/backends/x86/math/sequence_pooling.cc
+++ /dev/null
@@ -1,406 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <string>
-
-#include "lite/backends/x86/jit/kernels.h"
-#include "lite/backends/x86/legacy_place.h"
-#include "lite/backends/x86/math/blas.h"
-#include "lite/backends/x86/math/math_function.h"
-#include "lite/backends/x86/math/sequence_pooling.h"
-#include "lite/fluid/eigen.h"
-
-namespace paddle {
-namespace lite {
-namespace x86 {
-namespace math {
-
-template <typename T,
-          int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = lite::fluid::EigenVector<T, MajorType, IndexType>;
-template <typename T,
-          int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = lite::fluid::EigenMatrix<T, MajorType, IndexType>;
-
-template <typename T, bool is_test>
-class MaxSeqPoolFunctor {
- public:
-  void operator()(const lite::X86Context& context,
-                  const lite::Tensor& input,
-                  T pad_value,
-                  lite::Tensor* output,
-                  lite::Tensor* index) {
-    auto in_dims = input.dims();
-    auto out_dims = output->dims();
-    auto idx_dims = index->dims();
-    PADDLE_ENFORCE_GT(in_dims.size(), 1);
-    PADDLE_ENFORCE_GT(out_dims.size(), 1);
-    for (int64_t i = 1; i < in_dims.size(); ++i) {
-      PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]);
-    }
-    PADDLE_ENFORCE_EQ(idx_dims, out_dims);
-
-    auto starts = input.lod()[0];
-    const T* in_data = input.data<T>();
-    T* out_data = output->mutable_data<T>();
-    int* max_index = index->mutable_data<int>();
-
-    int64_t num_seq = out_dims[0];
-    int64_t dim = output->numel() / num_seq;
-    for (int64_t i = 0; i < num_seq; ++i) {
-      if (starts[i] == starts[i + 1]) {
-        for (int64_t k = 0; k < dim; ++k) {
-          out_data[i * dim + k] = pad_value;
-          max_index[i * dim + k] = -1;
-        }
-        continue;
-      }
-      for (int64_t k = 0; k < dim; ++k) {
-        out_data[i * dim + k] = in_data[starts[i] * dim + k];
-        max_index[i * dim + k] = starts[i];
-      }
-      for (size_t j = starts[i] + 1; j < starts[i + 1]; ++j) {
-        for (int64_t k = 0; k < dim; ++k) {
-          if (in_data[j * dim + k] > out_data[i * dim + k]) {
-            out_data[i * dim + k] = in_data[j * dim + k];
-            max_index[i * dim + k] = j;
-          }
-        }
-      }
-    }
-  }
-};
-// Instantiation of Max Sequence Pooling for the test phase, e.g.
no need to fill -// index buffer -template -class MaxSeqPoolFunctor { - public: - void operator()(const lite::X86Context& context, - const lite::Tensor& input, - T pad_value, - lite::Tensor* output, - lite::Tensor* index) { - auto in_dims = input.dims(); - auto out_dims = output->dims(); - PADDLE_ENFORCE_GT(in_dims.size(), 1); - PADDLE_ENFORCE_GT(out_dims.size(), 1); - for (int64_t i = 1; i < in_dims.size(); ++i) { - PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]); - } - - auto starts = input.lod()[0]; - const T* in_data = input.data(); - T* out_data = output->mutable_data(); - - int64_t num_seq = out_dims[0]; - int64_t dim = output->numel() / num_seq; - for (int64_t i = 0; i < num_seq; ++i) { - if (starts[i] == starts[i + 1]) { - for (int64_t k = 0; k < dim; ++k) { - out_data[i * dim + k] = pad_value; - } - continue; - } - std::memcpy( - &out_data[i * dim], &in_data[starts[i] * dim], dim * sizeof(T)); - for (size_t j = starts[i] + 1; j < starts[i + 1]; ++j) { - for (int64_t k = 0; k < dim; ++k) { - if (in_data[j * dim + k] > out_data[i * dim + k]) { - out_data[i * dim + k] = in_data[j * dim + k]; - } - } - } - } - } -}; -template -class MaxSeqPoolGradFunctor { - public: - void operator()(const lite::X86Context& context, - const lite::Tensor& out_grad, - const lite::Tensor& index, - lite::Tensor* in_grad) { - auto og_dims = out_grad.dims(); - auto ig_dims = in_grad->dims(); - auto idx_dims = index.dims(); - PADDLE_ENFORCE_GT(og_dims.size(), 1); - PADDLE_ENFORCE_GT(ig_dims.size(), 1); - for (int64_t i = 1; i < og_dims.size(); ++i) { - PADDLE_ENFORCE_EQ(og_dims[i], ig_dims[i]); - } - PADDLE_ENFORCE_EQ(idx_dims, og_dims); - - const T* og_data = out_grad.data(); - const int* max_index = index.data(); - T* ig_data = in_grad->mutable_data(); - - SetConstant set_zero; - set_zero(context, in_grad, static_cast(0.0)); - int64_t num_seq = og_dims[0]; - int64_t dim = out_grad.numel() / num_seq; - for (int64_t i = 0; i < num_seq; ++i) { - for (int64_t j = 0; j < dim; ++j) { - int step_id = max_index[i * dim + j]; - if (step_id == -1) continue; - ig_data[step_id * dim + j] = og_data[i * dim + j]; - } - } - } -}; - -template -class LastSeqPoolFunctor { - public: - void operator()(const lite::X86Context& context, - const lite::Tensor& input, - T pad_value, - lite::Tensor* output) { - // Create pointers to input and output data - auto* in_data = input.data(); - auto* out_data = output->mutable_data(); - - // Calculate the size of each item in sequence - int64_t item_size = input.numel() / input.dims()[0]; - auto lod = input.lod()[0]; - int seq_num = static_cast(lod.size()) - 1; - for (int i = 0; i < seq_num; ++i) { - // Calculate the length of each sequence - int64_t seq_len = static_cast(lod[i + 1] - lod[i]); - if (seq_len == 0) { - for (int j = 0; j < item_size; ++j) { - out_data[j] = pad_value; - } - } else { - // Point to the begin of next sequence - in_data += seq_len * item_size; - // Copy the last item of sequence to output - std::memcpy(out_data, (in_data - item_size), item_size * sizeof(T)); - } - out_data += item_size; - } - } -}; - -template -class FirstSeqPoolFunctor { - public: - void operator()(const lite::X86Context& context, - const lite::Tensor& input, - T pad_value, - lite::Tensor* output) { - // Create pointers to input and output data - auto* in_data = input.data(); - auto* out_data = output->mutable_data(); - - // Calculate the size of each item in sequence - int64_t item_size = input.numel() / input.dims()[0]; - auto lod = input.lod()[0]; - int seq_num = static_cast(lod.size()) - 1; - for 
(int i = 0; i < seq_num; ++i) { - // Calculate the length of each sequence - int64_t seq_len = static_cast(lod[i + 1] - lod[i]); - if (seq_len == 0) { - for (int j = 0; j < item_size; ++j) { - out_data[j] = pad_value; - } - } else { - // Copy the first item of sequence to output - std::memcpy(out_data, in_data, item_size * sizeof(T)); - // Point to the next sequence - in_data += seq_len * item_size; - } - out_data += item_size; - } - } -}; - -template -class SumSeqPoolGradFunctor { - public: - void operator()(const lite::X86Context& context, - const lite::Tensor& out_grad, - lite::Tensor* in_grad) { - auto lod = in_grad->lod()[0]; - int64_t out_w = out_grad.numel() / out_grad.dims()[0]; - int64_t in_w = in_grad->numel() / in_grad->dims()[0]; - PADDLE_ENFORCE(in_w == out_w); - const T* out_g_data = out_grad.data(); - T* in_g_data = in_grad->mutable_data(TARGET(kX86)); - auto blas = math::GetBlas(context); - for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { - int64_t h = static_cast(lod[i + 1] - lod[i]); - if (h == 0) continue; - int64_t in_offset = lod[i] * in_w; - const T* out_pos = out_g_data + i * out_w; - T* in_pos = in_g_data + in_offset; - for (int r = 0; r != h; ++r) { - blas.VCOPY(in_w, out_pos, in_pos + r * in_w); - } - } - } -}; - -template -class SequencePoolFunctor { - public: - /* max pool has index output */ - void operator()(const lite::X86Context& context, - const std::string pooltype, - T pad_value, - const lite::Tensor& input, - lite::Tensor* output, - bool is_test, - lite::Tensor* index = nullptr) { - if (pooltype == "MAX") { - if (is_test) { - math::MaxSeqPoolFunctor max_pool; - max_pool(context, input, pad_value, output, index); - } else { - math::MaxSeqPoolFunctor max_pool; - max_pool(context, input, pad_value, output, index); - } - return; - } - if (pooltype == "LAST") { - math::LastSeqPoolFunctor last_pool; - last_pool(context, input, pad_value, output); - return; - } - if (pooltype == "FIRST") { - math::FirstSeqPoolFunctor first_pool; - first_pool(context, input, pad_value, output); - return; - } - - auto lod = input.lod()[0]; - if (pooltype == "SUM") { - const T* src = input.data(); - T* dst = output->mutable_data(TARGET(kX86)); - jit::seq_pool_attr_t attr( - static_cast(input.numel() / input.dims()[0]), - jit::SeqPoolType::kSum); - auto seqpool = - jit::KernelFuncs, lite::fluid::CPUPlace>::Cache() - .At(attr); - for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { - attr.h = static_cast(lod[i + 1] - lod[i]); - if (attr.h == 0) { - for (int j = 0; j < attr.w; ++j) { - dst[j] = pad_value; - } - } else { - seqpool(src, dst, &attr); - } - dst += attr.w; - src += attr.h * attr.w; - } - return; - } - auto eigen_device = lite::fluid::EigenDeviceType(); - for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { - Tensor out_t = output->Slice(i, i + 1); - int64_t w = input.numel() / input.dims()[0]; - if (lod[i] == lod[i + 1]) { - for (int j = 0; j < w; ++j) { - out_t.mutable_data()[j] = pad_value; - } - continue; - } - Tensor in_t = input.Slice(static_cast(lod[i]), - static_cast(lod[i + 1])); - int64_t h = static_cast(lod[i + 1] - lod[i]); - auto in_e = EigenMatrix::From(in_t, lite::DDim({h, w})); - auto out_e = EigenVector::Flatten(out_t); - if (pooltype == "AVERAGE") { - out_e.device(eigen_device) = in_e.mean(Eigen::array({{0}})); - } else if (pooltype == "SQRT") { - out_e.device(eigen_device) = in_e.sum(Eigen::array({{0}})) / - std::sqrt(static_cast(h)); - } else { - PADDLE_THROW("unsupported pooling pooltype"); - } - } - } -}; - -template -class 
SequencePoolGradFunctor { - public: - void operator()(const lite::X86Context& context, - const std::string pooltype, - const lite::Tensor& out_grad, - lite::Tensor* in_grad, - /* max pool has index */ - const lite::Tensor* index = nullptr) { - if (pooltype == "MAX") { - math::MaxSeqPoolGradFunctor max_pool_grad; - max_pool_grad(context, out_grad, *index, in_grad); - return; - } - - if (pooltype == "LAST" || pooltype == "FIRST") { - // set X@Grad be zero at first when pooltype is LAST/FIRST - math::SetConstant functor; - functor(context, in_grad, 0); - } - - if (pooltype == "SUM") { - math::SumSeqPoolGradFunctor sum_pool_grad; - sum_pool_grad(context, out_grad, in_grad); - return; - } - - auto lod = in_grad->lod()[0]; - - auto eigen_device = lite::fluid::EigenDeviceType(); - for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { - if (lod[i] == lod[i + 1]) continue; - auto in_g_t = in_grad->Slice(static_cast(lod[i]), - static_cast(lod[i + 1])); - auto out_g_t = out_grad.Slice(i, i + 1); - int64_t h = static_cast(lod[i + 1] - lod[i]); - int64_t w = in_grad->numel() / in_grad->dims()[0]; - auto in_g_e = EigenMatrix::From(in_g_t, DDim({h, w})); - auto out_g_e = EigenMatrix::From(out_g_t, DDim({1, w})); - auto out_g_e_v = EigenVector::Flatten(out_g_t); - Eigen::DSizes bcast(h, 1); - - if (pooltype == "AVERAGE") { - in_g_e.device(eigen_device) = - (out_g_e / static_cast(h)).broadcast(bcast); - } else if (pooltype == "SQRT") { - in_g_e.device(eigen_device) = - (out_g_e / std::sqrt(static_cast(h))).broadcast(bcast); - } else if (pooltype == "LAST") { - in_g_e.chip(h - 1, 0).device(eigen_device) = out_g_e_v; - } else if (pooltype == "FIRST") { - in_g_e.chip(0, 0).device(eigen_device) = out_g_e_v; - } else { - PADDLE_THROW("unsupported pooling pooltype"); - } - } - } -}; - -template class SequencePoolFunctor; -template class SequencePoolFunctor; -template class SequencePoolGradFunctor; -template class SequencePoolGradFunctor; - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/sequence_pooling.h b/lite/backends/x86/math/sequence_pooling.h deleted file mode 100644 index d1a9f88f62..0000000000 --- a/lite/backends/x86/math/sequence_pooling.h +++ /dev/null @@ -1,52 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
 */
-
-#pragma once
-#include <string>
-#include "lite/core/context.h"
-#include "lite/core/tensor.h"
-
-namespace paddle {
-namespace lite {
-namespace x86 {
-namespace math {
-
-template <lite::TargetType Target, typename T>
-class SequencePoolFunctor {
- public:
-  /* max pool has index output */
-  void operator()(const lite::Context<Target>& context,
-                  const std::string pooltype,
-                  T pad_value,
-                  const lite::Tensor& input,
-                  lite::Tensor* output,
-                  bool is_test = false,
-                  lite::Tensor* index = nullptr);
-};
-
-template <lite::TargetType Target, typename T>
-class SequencePoolGradFunctor {
- public:
-  void operator()(const lite::Context<Target>& context,
-                  const std::string pooltype,
-                  const lite::Tensor& out_grad,
-                  lite::Tensor* in_grad,
-                  /* max pool has index */
-                  const lite::Tensor* index = nullptr);
-};
-
-}  // namespace math
-}  // namespace x86
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/x86/math/sequence_pooling_test.cc b/lite/backends/x86/math/sequence_pooling_test.cc
deleted file mode 100644
index a730147673..0000000000
--- a/lite/backends/x86/math/sequence_pooling_test.cc
+++ /dev/null
@@ -1,130 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/math/sequence_pooling.h"
-#include <gtest/gtest.h>
-#include <vector>
-
-template <typename DeviceContext, typename Place, typename T>
-void TestSequencePoolingSum(const paddle::framework::LoD& lod) {
-  paddle::framework::LoDTensor cpu_out_grad;
-  paddle::framework::LoDTensor cpu_in_grad;
-  paddle::framework::LoDTensor out_grad;
-  paddle::framework::LoDTensor in_grad;
-  const size_t second_dim = 128u;
-
-  // construct out_grad's tensor in cpu
-  const size_t out_first_dim = lod[0].size() - 1;
-  auto out_dims = paddle::framework::make_ddim(
-      {static_cast<int64_t>(out_first_dim), static_cast<int64_t>(second_dim)});
-
-  cpu_out_grad.mutable_data<T>(out_dims, paddle::platform::CPUPlace());
-  for (int64_t i = 0; i < cpu_out_grad.numel(); ++i) {
-    cpu_out_grad.data<T>()[i] = static_cast<T>(i);
-  }
-
-  // copy to dst out_grad
-  auto* place = new Place();
-  DeviceContext* context = new DeviceContext(*place);
-  if (paddle::platform::is_cpu_place(*place)) {
-    out_grad = cpu_out_grad;
-  } else {
-    TensorCopySync(cpu_out_grad, *place, &out_grad);
-  }
-
-  // construct in_grad
-  in_grad.set_lod(lod);
-  auto in_dims = paddle::framework::make_ddim(
-      {static_cast<int64_t>(lod[0].back()), static_cast<int64_t>(second_dim)});
-  in_grad.mutable_data<T>(in_dims, context->GetPlace());
-
-  // check tensor construction result
-  PADDLE_ENFORCE_EQ(in_grad.dims().size(), out_grad.dims().size());
-  for (int64_t i = 1; i < out_grad.dims().size(); ++i) {
-    PADDLE_ENFORCE_EQ(in_grad.dims()[i], out_grad.dims()[i]);
-  }
-
-  // call functor
-  paddle::operators::math::SequencePoolGradFunctor<DeviceContext, T>()(
-      *context, "SUM", out_grad, &in_grad);
-
-  if (paddle::platform::is_cpu_place(*place)) {
-    cpu_in_grad = in_grad;
-  } else {
-    TensorCopySync(in_grad, paddle::platform::CPUPlace(), &cpu_in_grad);
-    cpu_in_grad.set_lod(in_grad.lod());
-  }
-
-  EXPECT_EQ(in_grad.numel(), static_cast<int64_t>(lod[0].back() * second_dim));
-  EXPECT_EQ(in_grad.lod(), lod);
-
-  if (paddle::platform::is_cpu_place(*place)) {
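-    // For SUM pooling, the gradient of every timestep inside sequence i is
-    // a copy of row i of out_grad, so each row of the sliced sub-tensor is
-    // compared against the matching out_grad row.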
for (size_t i = 0; i < in_grad.lod()[0].size() - 1; ++i) { - int64_t begin = in_grad.lod()[0][i]; - int64_t end = in_grad.lod()[0][i + 1]; - paddle::framework::Tensor tmp = in_grad.Slice(begin, end); - for (size_t j = 0; j != tmp.numel() / second_dim; ++j) { - for (int64_t m = 0; m != second_dim; ++m) { - EXPECT_EQ(tmp.data()[m + j * second_dim], - out_grad.data()[m + i * second_dim]); - } - } - } - } else { - for (size_t i = 0; i < cpu_in_grad.lod()[0].size() - 1; ++i) { - int64_t begin = cpu_in_grad.lod()[0][i]; - int64_t end = cpu_in_grad.lod()[0][i + 1]; - paddle::framework::Tensor tmp = cpu_in_grad.Slice(begin, end); - for (size_t j = 0; j != tmp.numel() / second_dim; ++j) { - for (int64_t m = 0; m != second_dim; ++m) { - EXPECT_EQ(tmp.data()[m + j * second_dim], - cpu_out_grad.data()[m + i * second_dim]); - } - } - } - } - - delete place; - delete context; -} - -TEST(SequencePoolingGrad, CPU_SUM) { - paddle::framework::LoD lod1; - lod1.push_back(std::vector{0, 10}); - TestSequencePoolingSum(lod1); - - paddle::framework::LoD lod2; - lod2.push_back(std::vector{0, 2, 7, 10}); - TestSequencePoolingSum(lod2); -} - -#ifdef PADDLE_WITH_CUDA -TEST(SequencePoolingGrad, CUDA_SUM) { - paddle::framework::LoD lod1; - lod1.push_back(std::vector{0, 10}); - TestSequencePoolingSum(lod1); - - paddle::framework::LoD lod2; - lod2.push_back(std::vector{0, 2, 7, 10}); - TestSequencePoolingSum(lod2); -} -#endif diff --git a/lite/backends/x86/math/sequence_scale.cc b/lite/backends/x86/math/sequence_scale.cc deleted file mode 100644 index fad0628de1..0000000000 --- a/lite/backends/x86/math/sequence_scale.cc +++ /dev/null @@ -1,51 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "lite/backends/x86/math/sequence_scale.h" -#include "lite/fluid/lod.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -template -class ScaleLoDTensorFunctor { - public: - void operator()(const lite::Context& context, - const T* scales, - lite::Tensor* seq) { - const size_t level = 0; - auto lod = seq->lod(); - const size_t num_seq = lod[level].size() - 1; - size_t seq_width = seq->dims()[1]; - lite::LoD abs_offset_lod = lite::fluid::ToAbsOffset(lod); - - T* seq_data = seq->mutable_data(lite::TargetType::kX86); - for (size_t i = 0; i < num_seq; ++i) { - for (size_t j = lod[level][i] * seq_width; - j < lod[level][i + 1] * seq_width; - ++j) { - seq_data[j] *= scales[i]; - } - } - } -}; - -template class ScaleLoDTensorFunctor; - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/sequence_scale.h b/lite/backends/x86/math/sequence_scale.h deleted file mode 100644 index 44c1a233d9..0000000000 --- a/lite/backends/x86/math/sequence_scale.h +++ /dev/null @@ -1,59 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "lite/core/context.h" -#include "lite/core/tensor.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -/* - * \brief Scale a sequence. - * - * All sequences will be padded to the same length and stored in a transposed - * shape. - * Example: - * Given: - * seq = (s0, s0, s0, s0; s1, s1; s2, s2, s2; s3) - * scales = (2, 3, 4, 5) - * then: - * result = (2*s0, 2*s0, 2*s0, 2*s0; 3*s1, 3*s1; 4*s2, 4*s2, 4*s2; 5*s3) - - * - * \param context Device context of this functor. - * \param seq LoDTensor which is stored in sequence format, the shape - * is [total_sequence_length, sequence_width] where - * total_sequence_length is the sum of all sequences' - * length. - * \param scales Array. The i-th sequence will be scaled by scales[i]. - * \param num_seq Number of sequence - * - */ - -template -class ScaleLoDTensorFunctor { - public: - void operator()(const lite::Context& context, - const T* scales, - lite::Tensor* seq); -}; - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/softmax.cc b/lite/backends/x86/math/softmax.cc deleted file mode 100644 index 1f7144dd8b..0000000000 --- a/lite/backends/x86/math/softmax.cc +++ /dev/null @@ -1,33 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "lite/backends/x86/math/softmax.h" -#include "lite/backends/x86/math/softmax_impl.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -template class SoftmaxFunctor; -template class SoftmaxFunctor; -template class SoftmaxFunctor; -template class SoftmaxFunctor; -template class SoftmaxGradFunctor; -template class SoftmaxGradFunctor; - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/softmax.h b/lite/backends/x86/math/softmax.h deleted file mode 100644 index 299ccef58a..0000000000 --- a/lite/backends/x86/math/softmax.h +++ /dev/null @@ -1,67 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "lite/core/context.h" -#include "lite/core/tensor.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -template -class SoftmaxFunctor { - public: - void operator()(const lite::Context& context, - const int axis_dim, - const lite::Tensor* X, - lite::Tensor* Y); -}; - -template -class SoftmaxGradFunctor { - public: - void operator()(const lite::Context& context, - const int axis_dim, - const lite::TensorLite* y, - const lite::TensorLite* y_grad, - lite::TensorLite* x_grad); -}; - -//#ifdef PADDLE_WITH_CUDA -// template -// class SoftmaxCUDNNFunctor { -// public: -// void operator()(const platform::CUDADeviceContext& context, -// const lite::TensorLite* X, lite::TensorLite* Y); -//}; -// -// template -// class SoftmaxGradCUDNNFunctor { -// public: -// void operator()(const platform::CUDADeviceContext& context, -// const lite::TensorLite* Y, const lite::TensorLite* y_grad, -// lite::TensorLite* x_grad); -//}; -// -//#endif - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/softmax_impl.h b/lite/backends/x86/math/softmax_impl.h deleted file mode 100644 index ae997a8680..0000000000 --- a/lite/backends/x86/math/softmax_impl.h +++ /dev/null @@ -1,245 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "lite/backends/x86/cpu_info.h" -#include "lite/backends/x86/jit/helper.h" -#include "lite/backends/x86/jit/kernel_base.h" -#include "lite/backends/x86/jit/kernels.h" -#include "lite/backends/x86/math/cpu_vec.h" -#include "lite/core/tensor.h" -#include "lite/fluid/eigen.h" - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { - -template -using EigenMatrix = lite::fluid::EigenMatrix; - -template -struct ValueClip { - HOSTDEVICE T operator()(const T& x) const { - const T kThreshold = static_cast(-64.); - return x < kThreshold ? 
kThreshold : x; - } -}; - -template -void SoftmaxEigen(const lite::Context& context, - const int axis_dim, - const lite::Tensor* X, - lite::Tensor* Y) { - constexpr int kBatchDim = 0; - constexpr int kClassDim = 1; - - auto logits = EigenMatrix::From(*X); - auto softmax = EigenMatrix::From(*Y); - - const int batch_size = logits.dimension(kBatchDim); - const int num_classes = logits.dimension(kClassDim); - const int num_remain = num_classes / axis_dim; - - Eigen::DSizes along_class(kClassDim); - Eigen::DSizes batch_by_one(batch_size, 1); - Eigen::DSizes one_by_class(1, num_classes); - Eigen::DSizes batch_axis_remain(batch_size, axis_dim, num_remain); - Eigen::DSizes one_axis(1, axis_dim); - - auto shifted_logits = (logits - - logits.maximum(along_class) - .eval() - .reshape(batch_by_one) - .broadcast(one_by_class)) - .unaryExpr(ValueClip()); - - softmax.device(typename lite::fluid::EigenDevice::Type()) = - shifted_logits.exp(); - softmax.device(typename lite::fluid::EigenDevice::Type()) = - (softmax * - softmax.reshape(batch_axis_remain) - .sum(along_class) - .inverse() - .eval() - .broadcast(one_axis)); -} - -template -void SoftmaxFunctor::operator()( - const lite::Context& context, - const int axis_dim, - const lite::Tensor* X, - lite::Tensor* Y) { - SoftmaxEigen, T, is_test>(context, axis_dim, X, Y); -} - -template -using enable_if_CPU = typename std::enable_if< - std::is_same, lite::X86Context>::value>::type; - -template -class SoftmaxFunctor> { - public: - void operator()(const lite::Context& context, - const int axis_dim, - const lite::Tensor* X, - lite::Tensor* Y) { - auto in_dims = X->dims(); - constexpr int kBatchDim = 0; - constexpr int kClassDim = 1; - - const int num_classes = in_dims[kClassDim]; - const int batch_size = in_dims[kBatchDim]; - const int num_remain = num_classes / axis_dim; - - if (num_remain == 1 && lite::x86::MayIUse(lite::x86::avx)) { - const T* in_data = X->data(); - auto* out_data = Y->mutable_data(); - for (int bs = 0; bs < batch_size; ++bs) { - T max_val = *std::max_element(in_data, in_data + num_classes); - max_val *= static_cast(-1); - vec_add_bias( - num_classes, max_val, in_data, out_data); - vec_clip( - num_classes, static_cast(-64), out_data, out_data); - vec_exp(num_classes, out_data, out_data); - - T sum = 0; - vec_sum(num_classes, out_data, &sum); - sum = static_cast(1) / sum; - vec_scal(num_classes, sum, out_data, out_data); - - in_data += num_classes; - out_data += num_classes; - } - } else { - SoftmaxEigen(context, axis_dim, X, Y); - } - } -}; - -template -class SoftmaxFunctor> { - public: - void operator()(const lite::Context& context, - const int axis_dim, - const lite::Tensor* X, - lite::Tensor* Y) { - auto in_dims = X->dims(); - const float* in_data = X->data(); - float* out_data = Y->mutable_data(); - const int kBatchDim = 0; - const int kClassDim = 1; - // 2D data. 
Batch x C - auto compute_softmax = - lite::jit::KernelFuncs, - fluid::CPUPlace>::Cache() - .At(in_dims[kClassDim]); - compute_softmax(in_data, - out_data, - in_dims[kClassDim], - in_dims[kBatchDim], - in_dims[kClassDim] / axis_dim); - } -}; - -template -void SoftmaxGradEigen(const lite::Context& context, - const int axis_dim, - const lite::Tensor* y, - const lite::Tensor* y_grad, - lite::Tensor* x_grad) { - auto softmax = EigenMatrix::From(*y); - auto softmax_grad = EigenMatrix::From(*y_grad); - auto logits_grad = EigenMatrix::From(*x_grad); - - constexpr int kBatchDim = 0; - constexpr int kClassDim = 1; - - const int batch_size = softmax.dimension(kBatchDim); - const int num_classes = softmax.dimension(kClassDim); - const int num_remain = num_classes / axis_dim; - - Eigen::DSizes along_class(kClassDim); - Eigen::DSizes batch_by_one(batch_size, 1); - Eigen::DSizes one_by_class(1, num_classes); - Eigen::DSizes batch_axis_remain(batch_size, axis_dim, num_remain); - Eigen::DSizes one_axis(1, axis_dim); - - auto dot = (softmax * softmax_grad) - .reshape(batch_axis_remain) - .sum(along_class) - .eval() - .broadcast(one_axis); - // logits_grad.device(*context.eigen_device()) = (softmax_grad - dot) * - // softmax; - logits_grad.device(typename lite::fluid::EigenDevice::Type()) = - (softmax_grad - dot) * softmax; -} - -template -void SoftmaxGradFunctor::operator()( - const lite::Context& context, - const int axis_dim, - const lite::Tensor* y, - const lite::Tensor* y_grad, - lite::Tensor* x_grad) { - SoftmaxGradEigen, T>( - context, axis_dim, y, y_grad, x_grad); -} - -template -class SoftmaxGradFunctor> { - public: - void operator()(const lite::Context& context, - const int axis_dim, - const lite::Tensor* y, - const lite::Tensor* y_grad, - lite::Tensor* x_grad) { - auto out_dims = y->dims(); - constexpr int kBatchDim = 0; - constexpr int kClassDim = 1; - const int num_classes = out_dims[kClassDim]; - const int batch_size = out_dims[kBatchDim]; - const int num_remain = num_classes / axis_dim; - - if (num_remain == 1 && lite::x86::MayIUse(lite::x86::avx)) { - const T* out_data = y->data(); - const T* out_grad = y_grad->data(); - T* in_grad = x_grad->mutable_data(); - for (int bs = 0; bs < batch_size; ++bs) { - T scalar; - vec_mul_reduce( - num_classes, out_grad, out_data, &scalar); - scalar *= static_cast(-1); - vec_add_bias(num_classes, scalar, out_grad, in_grad); - vec_mul(num_classes, out_data, in_grad, in_grad); - out_data += num_classes; - out_grad += num_classes; - in_grad += num_classes; - } - } else { - SoftmaxGradEigen(context, axis_dim, y, y_grad, x_grad); - } - } -}; - -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/tree2col.cc b/lite/backends/x86/math/tree2col.cc deleted file mode 100644 index 8a34bebef0..0000000000 --- a/lite/backends/x86/math/tree2col.cc +++ /dev/null @@ -1,204 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
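All of the softmax paths above (the Eigen path, the AVX-vectorized path, and the JIT kernel) compute the same numerically stable recipe for the axis_dim == num_classes case: subtract the per-row maximum before exponentiating (the Eigen and AVX paths additionally clip shifted logits at -64), then normalize by the row sum. A minimal standalone sketch of that recipe in plain C++; the function name and the flat [batch, classes] layout here are illustrative, not part of the deleted sources:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Row-wise numerically stable softmax over a flat [batch, classes] buffer,
// mirroring the max-subtraction + clip(-64) + exp + normalize steps above.
void softmax_2d(const float* in, float* out, int batch, int classes) {
  const float kThreshold = -64.f;  // same clip value as ValueClip above
  for (int b = 0; b < batch; ++b) {
    const float* row_in = in + b * classes;
    float* row_out = out + b * classes;
    float max_val = *std::max_element(row_in, row_in + classes);
    float sum = 0.f;
    for (int c = 0; c < classes; ++c) {
      float shifted = std::max(row_in[c] - max_val, kThreshold);
      row_out[c] = std::exp(shifted);
      sum += row_out[c];
    }
    for (int c = 0; c < classes; ++c) row_out[c] /= sum;
  }
}

int main() {
  std::vector<float> x = {1.f, 2.f, 3.f, 1000.f, 1001.f, 1002.f};
  std::vector<float> y(x.size());
  softmax_2d(x.data(), y.data(), /*batch=*/2, /*classes=*/3);
  // Both rows print the same distribution: large logits do not overflow.
  for (float v : y) std::printf("%.4f ", v);
  std::printf("\n");
  return 0;
}
```

Subtracting the row maximum keeps every exponent non-positive, so even the second row's huge logits cannot overflow; the clip only bounds how small the shifted logits may get.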
- -#include "lite/backends/x86/math/tree2col.h" -#include -#include - -namespace paddle { -namespace lite { -namespace x86 { -namespace math { -std::vector Tree2ColUtil::construct_patch( - size_t root, int max_depth, const std::vector> &tr) { - std::stack> stack; - std::unordered_map visited; - std::vector patch; - - stack.push(TreeNode(root, 1, 1, 0)); - patch.emplace_back(TreeNode(root, 1, 1, 0)); - visited[root] = true; - - while (!stack.empty()) { - TreeNode &u = stack.top(); - bool end = true; - size_t node = u.get_node(), sz = tr[node].size(); - visited[node] = true; - for (size_t i = 0; i < sz; i++) { - size_t v = tr[node][i]; - if (!visited[v] && static_cast(u.get_depth()) + 1 < max_depth) { - visited[v] = true; - stack.push(TreeNode(v, i, sz, u.get_depth() + 1)); - patch.push_back(TreeNode(v, i + 1, sz, u.get_depth() + 1)); - end = false; - } - } - if (end) { - stack.pop(); - } - } - return patch; -} - -void Tree2ColUtil::construct_tree(const lite::Tensor &EdgeSet, - std::vector> *tr, - size_t *node_count) { - auto edge_set_dims = EdgeSet.dims(); - PADDLE_ENFORCE_EQ(edge_set_dims[1], 2); - int64_t edge_count = EdgeSet.numel(); - - const int *edge_data = EdgeSet.data(); - - for (int64_t i = 0; i < edge_count; i += 2) { - int u = edge_data[i], v = edge_data[i + 1]; - if (u != 0 && v != 0) (*node_count)++; - } - (*node_count)++; - - tr->resize(static_cast(*node_count + 1)); - - for (int64_t i = 0; i < edge_count; i += 2) { - int u = edge_data[i], v = edge_data[i + 1]; - if (u != 0 && v != 0) { - tr->at(u).push_back(v); - } else { - break; - } - } -} - -template -class Tree2ColFunctor { - public: - void operator()(const lite::X86Context &context, - const lite::Tensor &EdgeSet, - const lite::Tensor &node_features, - lite::Tensor *patch, - int max_depth) { - std::vector> tr; - auto feature_dims = node_features.dims(); - math::SetConstant constant; - int64_t feature_size = feature_dims[1]; - size_t patch_elem_size = 3 * static_cast(feature_size); - size_t node_count = 0, patch_count = 0, patch_size; - Tree2ColUtil::construct_tree(EdgeSet, &tr, &node_count); - std::vector> processing_list; - for (size_t u = 1; u <= node_count; u++) { - std::vector temp_patch = - Tree2ColUtil::construct_patch(u, max_depth, tr); - if (!temp_patch.empty()) { - processing_list.emplace_back(temp_patch); - } - } - patch_size = processing_list.size(); - - // T *patch_data = - // patch->mutable_data({static_cast(patch_size), - // static_cast(patch_elem_size)}, - // cpu_place); - patch->Resize({static_cast(patch_size, patch_elem_size)}); - auto *patch_data = patch->mutable_data(lite::TargetType::kX86); - constant(context, patch, 0); - const T *features = node_features.data(); - - for (auto &patch_item : processing_list) { - size_t pointer_base = patch_count * patch_elem_size; - for (auto &v : patch_item) { - T eta_l = v.eta_l(max_depth), eta_r = v.eta_r(max_depth), - eta_t = v.eta_t(max_depth); - size_t id = v.get_node() - 1; - for (int i = 0; i < feature_size; i++) { - patch_data[pointer_base + i * 3] += - eta_l * features[id * feature_size + i]; - patch_data[pointer_base + i * 3 + 1] += - eta_r * features[id * feature_size + i]; - patch_data[pointer_base + i * 3 + 2] += - eta_t * features[id * feature_size + i]; - } - } - patch_count++; - } - patch->Resize({static_cast(patch_count), - static_cast(patch_elem_size)}); - } -}; -template -class Col2TreeFunctor { - public: - void operator()(const lite::X86Context &context, - const lite::Tensor &EdgeSet, - const lite::Tensor &out_grad, - lite::Tensor *in_grad, - int 
max_depth) { - std::vector> tr; - auto output_dims = out_grad.dims(); - // auto cpu_place = boost::get(context.GetPlace()); - math::SetConstant constant; - int64_t output_size = output_dims[1]; - size_t grad_elem_size = 3 * static_cast(output_size); - size_t node_count = 0, grad_count = 0; - Tree2ColUtil::construct_tree(EdgeSet, &tr, &node_count); - std::vector> processing_list; - std::vector> grad_list; - grad_list.resize(node_count); - for (size_t u = 1; u <= node_count; u++) { - std::vector tmp = - Tree2ColUtil::construct_patch(u, max_depth, tr); - if (!tmp.empty()) { - processing_list.push_back(tmp); - } - } - for (size_t patch_id = 0; patch_id < processing_list.size(); patch_id++) { - for (auto v : processing_list[patch_id]) { - grad_list[v.get_node() - 1].push_back(v.change_node(patch_id + 1)); - } - } - // T *grad_data = - // in_grad->mutable_data({static_cast(node_count), - // static_cast(grad_elem_size)}, - // cpu_place); - in_grad->Resize({static_cast(node_count), - static_cast(grad_elem_size)}); - auto *grad_data = in_grad->mutable_data(lite::TargetType::kX86); - - constant(context, in_grad, 0); - const T *out_g = out_grad.data(); - for (auto &patch_item : grad_list) { - size_t pointer_base = grad_count * grad_elem_size; - for (auto &v : patch_item) { - T eta_l = v.eta_l(max_depth), eta_r = v.eta_r(max_depth), - eta_t = v.eta_t(max_depth); - size_t id = v.get_node() - 1; - for (int i = 0; i < output_size; i++) { - grad_data[pointer_base + i * 3] += - eta_l * out_g[id * output_size + i]; - grad_data[pointer_base + i * 3 + 1] += - eta_r * out_g[id * output_size + i]; - grad_data[pointer_base + i * 3 + 2] += - eta_t * out_g[id * output_size + i]; - } - } - grad_count++; - } - } -}; - -template class Tree2ColFunctor; -template class Tree2ColFunctor; -template class Col2TreeFunctor; -template class Col2TreeFunctor; -} // namespace math -} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/math/tree2col.h b/lite/backends/x86/math/tree2col.h deleted file mode 100644 index 3a48c2f40a..0000000000 --- a/lite/backends/x86/math/tree2col.h +++ /dev/null @@ -1,95 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
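The patch construction above weights each node's feature vector by three coefficients, eta_t, eta_l, and eta_r, computed from the node's depth, its index among its siblings, and the sibling count pclen; these are the "top", "left", and "right" weights of tree-based convolution. A self-contained sketch that restates the TreeNode formulas from the header below as free functions and evaluates them for one node (the sample values are worked out by hand):

```cpp
#include <cstdio>

// The same weighting formulas as TreeNode::eta_t / eta_l / eta_r below,
// restated as free functions (illustrative sketch only).
double eta_t(double depth, double filter_depth) {
  return (filter_depth - depth) / filter_depth;
}

double eta_l(double depth, double index, double pclen, double filter_depth) {
  // An only child sits in the middle; otherwise interpolate by sibling index.
  double pos = (pclen == 1.0) ? 0.5 : (index - 1.0) / (pclen - 1.0);
  return (1.0 - eta_t(depth, filter_depth)) * pos;
}

double eta_r(double depth, double index, double pclen, double filter_depth) {
  return (1.0 - eta_t(depth, filter_depth)) *
         (1.0 - eta_l(depth, index, pclen, filter_depth));
}

int main() {
  // A node at depth 1, second of three siblings, with filter depth 2.
  std::printf("eta_t=%g eta_l=%g eta_r=%g\n",
              eta_t(1, 2),         // (2 - 1) / 2      = 0.5
              eta_l(1, 2, 3, 2),   // 0.5 * (1 / 2)    = 0.25
              eta_r(1, 2, 3, 2));  // 0.5 * (1 - 0.25) = 0.375
  return 0;
}
```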
-
-#pragma once
-
-#include <stack>
-#include <unordered_map>
-#include <vector>
-#include "lite/backends/x86/math/math_function.h"
-#include "lite/core/context.h"
-#include "lite/core/tensor.h"
-
-namespace paddle {
-namespace lite {
-namespace x86 {
-namespace math {
-class TreeNode {
- public:
-  size_t node;
-  explicit TreeNode(size_t node = 0,
-                    size_t index = 0,
-                    size_t pclen = 0,
-                    size_t depth = 0)
-      : node(node), index(index), pclen(pclen), depth(depth) {}
-  template <typename T>
-  T eta_t(T filter_depth) {
-    return ((filter_depth - this->depth) / filter_depth);
-  }
-  template <typename T>
-  T eta_l(T filter_depth) {
-    T temp;
-    if (this->pclen == 1) {
-      temp = 0.5;
-    } else {
-      temp = (this->index - 1.0) / (this->pclen - 1.0);
-    }
-    return (1.0 - this->eta_t<T>(filter_depth)) * temp;
-  }
-  template <typename T>
-  T eta_r(T filter_depth) {
-    return (1.0 - this->eta_t<T>(filter_depth)) *
-           (1.0 - this->eta_l<T>(filter_depth));
-  }
-  TreeNode change_node(size_t v) {
-    return TreeNode(v, this->index, this->pclen, this->depth);
-  }
-  size_t get_node() { return this->node; }
-  size_t get_depth() { return this->depth; }
-
- private:
-  size_t index, pclen, depth;
-};
-class Tree2ColUtil {
- public:
-  static std::vector<TreeNode> construct_patch(
-      size_t root, int max_depth, const std::vector<std::vector<size_t>> &tr);
-
-  static void construct_tree(const lite::Tensor &EdgeSet,
-                             std::vector<std::vector<size_t>> *tr,
-                             size_t *node_count);
-};
-
-template <lite::TargetType Target, typename T>
-class Tree2ColFunctor {
- public:
-  void operator()(const lite::Context<Target> &context,
-                  const lite::Tensor &EdgeSet,
-                  const lite::Tensor &node_features,
-                  lite::Tensor *patch,
-                  int max_depth);
-};
-template <lite::TargetType Target, typename T>
-class Col2TreeFunctor {
- public:
-  void operator()(const lite::Context<Target> &context,
-                  const lite::Tensor &EdgeSet,
-                  const lite::Tensor &out_grad,
-                  lite::Tensor *in_grad,
-                  int max_depth);
-};
-}  // namespace math
-}  // namespace x86
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/x86/math/unpooling.cc b/lite/backends/x86/math/unpooling.cc
deleted file mode 100644
index 568f9952ca..0000000000
--- a/lite/backends/x86/math/unpooling.cc
+++ /dev/null
@@ -1,96 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-#include "lite/backends/x86/math/unpooling.h"
-#include "lite/utils/paddle_enforce.h"
-
-namespace paddle {
-namespace lite {
-namespace x86 {
-namespace math {
-template <typename T>
-class Unpool2dMaxFunctor<lite::TargetType::kX86, T> {
- public:
-  void operator()(const lite::X86Context& context,
-                  const lite::Tensor& input,
-                  const lite::Tensor& indices,
-                  lite::Tensor* output) {
-    const int batch_size = input.dims()[0];
-    const int input_height = input.dims()[2];
-    const int input_width = input.dims()[3];
-    const int output_channels = output->dims()[1];
-    const int output_height = output->dims()[2];
-    const int output_width = output->dims()[3];
-    int input_feasize = input_height * input_width;
-    int output_feasize = output_height * output_width;
-    const T* input_data = input.data<T>();
-    const int* indices_data = indices.data<int>();
-    T* output_data = output->mutable_data<T>(lite::TargetType::kX86);
-    for (int b = 0; b < batch_size; ++b) {
-      for (int c = 0; c < output_channels; ++c) {
-        for (int i = 0; i < input_feasize; ++i) {
-          int index = indices_data[i];
-          PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!");
-          output_data[index] = input_data[i];
-        }
-        input_data += input_feasize;
-        indices_data += input_feasize;
-        output_data += output_feasize;
-      }
-    }
-  }
-};
-template <typename T>
-class Unpool2dMaxGradFunctor<lite::TargetType::kX86, T> {
- public:
-  void operator()(const lite::X86Context& context,
-                  const lite::Tensor& input,
-                  const lite::Tensor& indices,
-                  const lite::Tensor& output,
-                  const lite::Tensor& output_grad,
-                  lite::Tensor* input_grad) {
-    const int batch_size = input.dims()[0];
-    const int input_height = input.dims()[2];
-    const int input_width = input.dims()[3];
-    const int output_channels = output.dims()[1];
-    const int output_height = output.dims()[2];
-    const int output_width = output.dims()[3];
-    int input_feasize = input_height * input_width;
-    int output_feasize = output_height * output_width;
-    const int* indices_data = indices.data<int>();
-    const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad->mutable_data<T>(lite::TargetType::kX86);
-
-    for (int b = 0; b < batch_size; ++b) {
-      for (int c = 0; c < output_channels; ++c) {
-        for (int i = 0; i < input_feasize; ++i) {
-          int index = indices_data[i];
-          PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!");
-          input_grad_data[i] = output_grad_data[index];
-        }
-        input_grad_data += input_feasize;
-        indices_data += input_feasize;
-        output_grad_data += output_feasize;
-      }
-    }
-  }
-};
-template class Unpool2dMaxGradFunctor<lite::TargetType::kX86, float>;
-template class Unpool2dMaxGradFunctor<lite::TargetType::kX86, double>;
-template class Unpool2dMaxFunctor<lite::TargetType::kX86, float>;
-template class Unpool2dMaxFunctor<lite::TargetType::kX86, double>;
-}  // namespace math
-}  // namespace x86
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/x86/math/unpooling.h b/lite/backends/x86/math/unpooling.h
deleted file mode 100644
index 18948465f3..0000000000
--- a/lite/backends/x86/math/unpooling.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-#pragma once
-#include "lite/core/context.h"
-#include "lite/core/tensor.h"
-
-namespace paddle {
-namespace lite {
-namespace x86 {
-namespace math {
-template <lite::TargetType Target, typename T>
-class Unpool2dMaxFunctor {
- public:
-  void operator()(const lite::Context<Target>& context,
-                  const lite::Tensor& input,
-                  const lite::Tensor& indices,
-                  lite::Tensor* output);
-};
-template <lite::TargetType Target, typename T>
-class Unpool2dMaxGradFunctor {
- public:
-  void operator()(const lite::Context<Target>& context,
-                  const lite::Tensor& input,
-                  const lite::Tensor& indices,
-                  const lite::Tensor& output,
-                  const lite::Tensor& output_grad,
-                  lite::Tensor* input_grad);
-};
-}  // namespace math
-}  // namespace x86
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/x86/math/vol2col.cc b/lite/backends/x86/math/vol2col.cc
deleted file mode 100644
index 8fd5e8954e..0000000000
--- a/lite/backends/x86/math/vol2col.cc
+++ /dev/null
@@ -1,204 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "lite/backends/x86/math/vol2col.h"
-#include <vector>
-#include "lite/utils/paddle_enforce.h"
-
-namespace paddle {
-namespace lite {
-namespace x86 {
-namespace math {
-
-/*
- * vol = [input_channels, input_depth, input_height, input_width]
- * col =
- *   [input_channels, filter_depth, filter_height, filter_width,
- *    output_depth, output_height, output_width]
- */
-template <typename T>
-class Vol2ColFunctor<lite::TargetType::kX86, T> {
- public:
-  void operator()(const lite::X86Context& context,
-                  const lite::Tensor& vol,
-                  const std::vector<int>& dilations,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
-                  lite::Tensor* col) const {
-    PADDLE_ENFORCE(vol.dims().size() == 4);
-    PADDLE_ENFORCE(col->dims().size() == 7);
-
-    int input_channels = vol.dims()[0];
-    int input_depth = vol.dims()[1];
-    int input_height = vol.dims()[2];
-    int input_width = vol.dims()[3];
-    int filter_depth = col->dims()[1];
-    int filter_height = col->dims()[2];
-    int filter_width = col->dims()[3];
-    int output_depth = col->dims()[4];
-    int output_height = col->dims()[5];
-    int output_width = col->dims()[6];
-    int channels_col =
-        input_channels * filter_depth * filter_height * filter_width;
-
-    PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] -
-                       ((dilations[0] * (filter_depth - 1) + 1))) /
-                              strides[0] +
-                          1,
-                      output_depth,
-                      "input_depth and output_depth are "
-                      "mismatching.");
-    PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] -
-                       ((dilations[1] * (filter_height - 1) + 1))) /
-                              strides[1] +
-                          1,
-                      output_height,
-                      "input_height and output_height are "
-                      "mismatching.");
-    PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] -
-                       ((dilations[2] * (filter_width - 1) + 1))) /
-                              strides[2] +
-                          1,
-                      output_width,
-                      "input_width and output_width are "
-                      "mismatching.");
-
-    const T* vol_data = vol.data<T>();
-    T* col_data = col->mutable_data<T>();
-
-    for (int c = 0; c < channels_col; ++c) {
-      int w_offset = c % filter_width;
-      int h_offset = (c / filter_width) % filter_height;
-      int d_offset = (c / filter_width / filter_height) % filter_depth;
-      int c_in = c / filter_width / filter_height / filter_depth;
-      for (int d = 0; d < output_depth; ++d) {
-        int d_pad = d * strides[0] - paddings[0] + d_offset * dilations[0];
-        for (int h = 0; h < output_height; ++h) {
-          int h_pad = h * strides[1] - paddings[1] + h_offset * dilations[1];
-          for (int w = 0; w < output_width; ++w) {
-            int w_pad = w * strides[2] - paddings[2] + w_offset * dilations[2];
-
-            int col_idx =
-                ((c * output_depth + d) * output_height + h) * output_width +
-                w;
-            int vol_idx =
-                ((c_in * input_depth + d_pad) * input_height + h_pad) *
-                    input_width +
-                w_pad;
-            col_data[col_idx] =
-                (h_pad < 0 || h_pad >= input_height || w_pad < 0 ||
-                 w_pad >= input_width || d_pad < 0 || d_pad >= input_depth)
-                    ? static_cast<T>(0)
-                    : vol_data[vol_idx];
-          }
-        }
-      }
-    }
-  }
-};
-
-/*
- * vol = [input_channels, input_depth, input_height, input_width]
- * col =
- *   [input_channels, filter_depth, filter_height, filter_width,
- *    output_depth, output_height, output_width]
- */
-template <typename T>
-class Col2VolFunctor<lite::TargetType::kX86, T> {
- public:
-  void operator()(const lite::X86Context& context,
-                  const lite::Tensor& col,
-                  const std::vector<int>& dilations,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
-                  lite::Tensor* vol) const {
-    PADDLE_ENFORCE(vol->dims().size() == 4);
-    PADDLE_ENFORCE(col.dims().size() == 7);
-
-    int input_channels = vol->dims()[0];
-    int input_depth = vol->dims()[1];
-    int input_height = vol->dims()[2];
-    int input_width = vol->dims()[3];
-    int filter_depth = col.dims()[1];
-    int filter_height = col.dims()[2];
-    int filter_width = col.dims()[3];
-    int output_depth = col.dims()[4];
-    int output_height = col.dims()[5];
-    int output_width = col.dims()[6];
-    int channels_col =
-        input_channels * filter_depth * filter_height * filter_width;
-
-    PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] -
-                       ((dilations[0] * (filter_depth - 1) + 1))) /
-                              strides[0] +
-                          1,
-                      output_depth,
-                      "input_depth and output_depth are "
-                      "mismatching.");
-    PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] -
-                       ((dilations[1] * (filter_height - 1) + 1))) /
-                              strides[1] +
-                          1,
-                      output_height,
-                      "input_height and output_height are "
-                      "mismatching.");
-    PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] -
-                       ((dilations[2] * (filter_width - 1) + 1))) /
-                              strides[2] +
-                          1,
-                      output_width,
-                      "input_width and output_width are "
-                      "mismatching.");
-    T* vol_data = vol->mutable_data<T>();
-    const T* col_data = col.data<T>();
-
-    for (int c = 0; c < channels_col; ++c) {
-      int w_offset = c % filter_width;
-      int h_offset = (c / filter_width) % filter_height;
-      int d_offset = (c / filter_width / filter_height) % filter_depth;
-      int cIm = c / filter_width / filter_height / filter_depth;
-      for (int d = 0; d < output_depth; ++d) {
-        int d_pad = d * strides[0] - paddings[0] + d_offset * dilations[0];
-        for (int h = 0; h < output_height; ++h) {
-          int h_pad = h * strides[1] - paddings[1] + h_offset * dilations[1];
-          for (int w = 0; w < output_width; ++w) {
-            int w_pad = w * strides[2] - paddings[2] + w_offset * dilations[2];
-
-            if (h_pad >= 0 && h_pad < input_height && w_pad >= 0 &&
-                w_pad < input_width && d_pad >= 0 && d_pad < input_depth) {
-              int vol_idx =
-                  ((cIm * input_depth + d_pad) * input_height + h_pad) *
-                      input_width +
-                  w_pad;
-
-              int col_idx =
-                  ((c * output_depth + d) * output_height + h) * output_width +
-                  w;
-              vol_data[vol_idx] += col_data[col_idx];
-            }
-          }
-        }
-      }
-    }
-  }
-};
-
-template class Vol2ColFunctor<lite::TargetType::kX86, float>;
-template class Vol2ColFunctor<lite::TargetType::kX86, double>;
-template class Col2VolFunctor<lite::TargetType::kX86, float>;
-template class Col2VolFunctor<lite::TargetType::kX86, double>;
-
-}  // namespace math
-}  // namespace x86
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/x86/math/vol2col.h b/lite/backends/x86/math/vol2col.h
deleted file mode 100644
index 4583fde6b2..0000000000
--- a/lite/backends/x86/math/vol2col.h
+++ /dev/null
@@ -1,92 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "lite/core/context.h"
-#include "lite/core/tensor.h"
-
-namespace paddle {
-namespace lite {
-namespace x86 {
-namespace math {
-/*
- * \brief Converts the feature data of four dimensions (CDHW) into colData of
- *        seven dimensions in the Vol2ColFunctor calculation, and in the
- *        Col2VolFunctor calculation it is reversed.
- *
- * \param volData   Vol data.
- * \param volShape  The shape of volData,
- *                  [input_channels, input_depth, input_height, input_width].
- * \param colData   Column data.
- * \param colShape  The shape of colData.
- *
- * \param dilations    dilation data.
- * \param 3-dimension  [dilation_depth, dilation_height, dilation_width].
- *
- * \param strides      stride data.
- * \param 3-dimension  [stride_depth, stride_height, stride_width].
- *
- * \param paddings     padding data.
- * \param 3-dimension  [d_pad, h_pad, w_pad].
- *
- * The shape of colData is:
- * [input_channels, filter_depth, filter_height, filter_width, output_depth,
- *  output_height, output_width]
- * So, it is easy to reshape into a convolution matrix for a convolution
- * calculation based on matrix multiplication.
- * The shape of the convolution matrix is [height, width], where the height is
- * equal to input_channels * filter_depth * filter_height * filter_width, and
- * the width is equal to output_depth * output_height * output_width.
- *
- * Reshape:
- *     shape of colData          shape of convolution matrix
- *     [input_channels,
- *      filter_depth,
- *      filter_height,
- *      filter_width,     ======>      [height, width]
- *      output_depth,
- *      output_height,
- *      output_width]
- *
- * \note The caller needs to ensure that volShape.inputChannels is equal to
- *       colShape.inputChannels.
- */
-template <lite::TargetType Target, typename T>
-class Vol2ColFunctor {
- public:
-  void operator()(const lite::Context<Target>& context,
                  const lite::Tensor& vol,
-                  const std::vector<int>& dilations,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
-                  lite::Tensor* col) const;
-};
-
-template <lite::TargetType Target, typename T>
-class Col2VolFunctor {
- public:
-  void operator()(const lite::Context<Target>& context,
-                  const lite::Tensor& col,
-                  const std::vector<int>& dilations,
-                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings,
-                  lite::Tensor* vol) const;
-};
-
-}  // namespace math
-}  // namespace x86
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/x86/mklml.cc b/lite/backends/x86/mklml.cc
deleted file mode 100644
index 1c3c3c3bde..0000000000
--- a/lite/backends/x86/mklml.cc
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "lite/backends/x86/mklml.h"
-
-namespace paddle {
-namespace lite {
-namespace x86 {
-
-std::once_flag mklml_dso_flag;
-void* mklml_dso_handle = nullptr;
-
-#define DEFINE_WRAP(__name) DynLoad__##__name __name
-
-MKLML_ROUTINE_EACH(DEFINE_WRAP);
-
-}  // namespace x86
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/x86/mklml.h b/lite/backends/x86/mklml.h
deleted file mode 100644
index 753c42f295..0000000000
--- a/lite/backends/x86/mklml.h
+++ /dev/null
@@ -1,99 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <mkl.h>
-#include <mutex>  // NOLINT
-#include "lite/backends/x86/dynamic_loader.h"
-#include "lite/backends/x86/port.h"
-
-namespace paddle {
-namespace lite {
-namespace x86 {
-
-extern std::once_flag mklml_dso_flag;
-extern void* mklml_dso_handle;
-
-/**
- * The following macro definition can generate structs
- * (one for each function) that dynamically load the MKLML routine
- * via operator overloading.
- */
-#define DYNAMIC_LOAD_MKLML_WRAP(__name)                            \
-  struct DynLoad__##__name {                                       \
-    template <typename... Args>                                    \
-    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
-      using mklmlFunc = decltype(&::__name);                       \
-      std::call_once(mklml_dso_flag, []() {                        \
-        mklml_dso_handle = paddle::lite::x86::GetMKLMLDsoHandle(); \
-      });                                                          \
-      static void* p_##_name = dlsym(mklml_dso_handle, #__name);   \
-      return reinterpret_cast<mklmlFunc>(p_##_name)(args...);      \
-    }                                                              \
-  };                                                               \
-  extern DynLoad__##__name __name
-
-#define DECLARE_DYNAMIC_LOAD_MKLML_WRAP(__name) DYNAMIC_LOAD_MKLML_WRAP(__name)
-
-#define MKLML_ROUTINE_EACH(__macro) \
-  __macro(cblas_sgemm);             \
-  __macro(cblas_dgemm);             \
-  __macro(cblas_saxpy);             \
-  __macro(cblas_daxpy);             \
-  __macro(cblas_scopy);             \
-  __macro(cblas_dcopy);             \
-  __macro(cblas_sgemv);             \
-  __macro(cblas_dgemv);             \
-  __macro(cblas_sgemm_alloc);       \
-  __macro(cblas_dgemm_alloc);       \
-  __macro(cblas_sgemm_pack);        \
-  __macro(cblas_dgemm_pack);        \
-  __macro(cblas_sgemm_compute);     \
-  __macro(cblas_dgemm_compute);     \
-  __macro(cblas_sgemm_free);        \
-  __macro(cblas_dgemm_free);        \
-  __macro(cblas_sgemm_batch);       \
-  __macro(cblas_dgemm_batch);       \
-  __macro(cblas_sdot);              \
-  __macro(cblas_ddot);              \
-  __macro(cblas_sasum);             \
-  __macro(cblas_dasum);             \
-  __macro(cblas_isamax);            \
-  __macro(cblas_idamax);            \
-  __macro(cblas_sscal);             \
-  __macro(cblas_dscal);             \
-  __macro(vsAdd);                   \
-  __macro(vdAdd);                   \
-  __macro(vsMul);                   \
-  __macro(vdMul);                   \
-  __macro(vsExp);                   \
-  __macro(vdExp);                   \
-  __macro(vsSqr);                   \
-  __macro(vdSqr);                   \
-  __macro(vsPowx);                  \
-  __macro(vdPowx);                  \
-  __macro(vsInv);                   \
-  __macro(vdInv);                   \
-  __macro(vmsErf);                  \
-  __macro(vmdErf);                  \
-  __macro(MKL_Set_Num_Threads)
-
-MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP);
-
-#undef DYNAMIC_LOAD_MKLML_WRAP
-
-}  // namespace x86
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/x86/port.h b/lite/backends/x86/port.h
deleted file mode 100644
index c1b81159ac..0000000000
--- a/lite/backends/x86/port.h
+++ /dev/null
@@ -1,175 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
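Each entry in `MKLML_ROUTINE_EACH` above expands, via `DYNAMIC_LOAD_MKLML_WRAP`, into a functor object that lazily `dlopen`s the MKLML shared library on first use, caches the `dlsym` result, and forwards all arguments, so call sites read like ordinary CBLAS calls. A hedged usage sketch (assumes a build with MKLML enabled; the matrix sizes are illustrative):

```cpp
#include <vector>
#include "lite/backends/x86/mklml.h"

int main() {
  // 2x2 row-major SGEMM: C = 1.0 * A * B + 0.0 * C. The first call through
  // the wrapper resolves cblas_sgemm from libmklml; later calls go direct.
  std::vector<float> A{1, 2, 3, 4}, B{5, 6, 7, 8}, C(4, 0.f);
  paddle::lite::x86::cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                                 2, 2, 2, 1.f, A.data(), 2, B.data(), 2,
                                 0.f, C.data(), 2);
  return 0;  // C == {19, 22, 43, 50}
}
```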
-
-#pragma once
-
-#include <cstdio>
-#include <stdexcept>
-
-#include <memory>
-#include <string>
-#include <vector>
-
-#define GLOG_NO_ABBREVIATED_SEVERITIES  // msvc conflict logging with windows.h
-#include "glog/logging.h"
-
-#if !defined(_WIN32)
-#include <dlfcn.h>     // dladdr
-#include <execinfo.h>  // backtrace
-#include <sys/stat.h>
-#include <sys/time.h>
-#include <numeric>  // std::accumulate
-#else
-#define NOMINMAX  // msvc max/min macro conflict with std::min/max
-// solve static linking error in windows
-// https://github.com/google/glog/issues/301
-#define GOOGLE_GLOG_DLL_DECL
-#include <io.h>  // _popen, _pclose
-#include <stdio.h>
-#include <windows.h>
-#include <numeric>  // std::accumulate in msvc
-#ifndef S_ISDIR  // windows port for sys/stat.h
-#define S_ISDIR(mode) (((mode)&S_IFMT) == S_IFDIR)
-#endif  // S_ISDIR
-
-static void *dlsym(void *handle, const char *symbol_name) {
-  FARPROC found_symbol;
-  found_symbol = GetProcAddress((HMODULE)handle, symbol_name);
-
-  if (found_symbol == NULL) {
-    throw std::runtime_error(std::string(symbol_name) + " not found.");
-  }
-  return reinterpret_cast<void *>(found_symbol);
-}
-
-static void *dlopen(const char *filename, int flag) {
-  std::string file_name(filename);
-  HMODULE hModule = LoadLibrary(file_name.c_str());
-  if (!hModule) {
-    throw std::runtime_error(file_name + " not found.");
-  }
-  return reinterpret_cast<void *>(hModule);
-}
-
-static int gettimeofday(struct timeval *tp, void *tzp) {
-  time_t clock;
-  struct tm tm;
-  SYSTEMTIME wtm;
-
-  GetLocalTime(&wtm);
-  tm.tm_year = wtm.wYear - 1900;
-  tm.tm_mon = wtm.wMonth - 1;
-  tm.tm_mday = wtm.wDay;
-  tm.tm_hour = wtm.wHour;
-  tm.tm_min = wtm.wMinute;
-  tm.tm_sec = wtm.wSecond;
-  tm.tm_isdst = -1;
-  clock = mktime(&tm);
-  tp->tv_sec = clock;
-  tp->tv_usec = wtm.wMilliseconds * 1000;
-
-  return (0);
-}
-#endif  // !_WIN32
-
-static void ExecShellCommand(const std::string &cmd, std::string *message) {
-  char buffer[128];
-#if !defined(_WIN32)
-  std::shared_ptr<FILE> pipe(popen(cmd.c_str(), "r"), pclose);
-#else
-  std::shared_ptr<FILE> pipe(_popen(cmd.c_str(), "r"), _pclose);
-#endif  // _WIN32
-  if (!pipe) {
-    LOG(ERROR) << "error running command: " << cmd;
-    return;
-  }
-  while (!feof(pipe.get())) {
-    if (fgets(buffer, 128, pipe.get()) != nullptr) {
-      *message += buffer;
-    }
-  }
-}
-
-static bool PathExists(const std::string &path) {
-#if !defined(_WIN32)
-  struct stat statbuf;
-  if (stat(path.c_str(), &statbuf) != -1) {
-    if (S_ISDIR(statbuf.st_mode)) {
-      return true;
-    }
-  }
-#else
-  struct _stat statbuf;
-  if (_stat(path.c_str(), &statbuf) != -1) {
-    if (S_ISDIR(statbuf.st_mode)) {
-      return true;
-    }
-  }
-#endif  // !_WIN32
-  return false;
-}
-
-// TODO(yuyang18): If the functions below are needed by other files, move them
-// to paddle::filesystem namespace.
-#if !defined(_WIN32)
-constexpr char kSEP = '/';
-#else
-constexpr char kSEP = '\\';
-#endif  // _WIN32
-
-static bool FileExists(const std::string &filepath) {
-#if !defined(_WIN32)
-  struct stat buffer;
-  return (stat(filepath.c_str(), &buffer) == 0);
-#else
-  struct _stat buffer;
-  return (_stat(filepath.c_str(), &buffer) == 0);
-#endif  // !_WIN32
-}
-
-static std::string DirName(const std::string &filepath) {
-  auto pos = filepath.rfind(kSEP);
-  if (pos == std::string::npos) {
-    return "";
-  }
-  return filepath.substr(0, pos);
-}
-
-static void MkDir(const char *path) {
-  std::string path_error(path);
-  path_error += " mkdir failed!";
-#if !defined(_WIN32)
-  if (mkdir(path, 0755)) {
-    if (errno != EEXIST) {
-      throw std::runtime_error(path_error);
-    }
-  }
-#else
-  BOOL return_value = CreateDirectory(path, NULL);
-  if (!return_value) {
-    auto errorno = GetLastError();
-    if (errorno != ERROR_ALREADY_EXISTS) {
-      throw std::runtime_error(path_error);
-    }
-  }
-#endif  // !_WIN32
-}
-
-static void MkDirRecursively(const char *fullpath) {
-  if (*fullpath == '\0') return;  // empty string
-  if (FileExists(fullpath)) return;
-
-  MkDirRecursively(DirName(fullpath).c_str());
-  MkDir(fullpath);
-}
diff --git a/lite/backends/x86/target_wrapper.cc b/lite/backends/x86/target_wrapper.cc
deleted file mode 100644
index 34227abd98..0000000000
--- a/lite/backends/x86/target_wrapper.cc
+++ /dev/null
@@ -1,36 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/core/target_wrapper.h"
-#include <algorithm>
-#include "lite/backends/x86/target_wrapper.h"
-#include "lite/utils/all.h"
-
-namespace paddle {
-namespace lite {
-
-template <>
-void TargetWrapper<TARGET(kX86)>::MemcpySync(void *dst,
-                                             const void *src,
-                                             size_t size,
-                                             IoDirection dir) {
-  std::copy_n(reinterpret_cast<const char *>(src),
-              size,
-              reinterpret_cast<char *>(dst));
-}
-
-template class TargetWrapper<TARGET(kX86)>;
-
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/x86/target_wrapper.h b/lite/backends/x86/target_wrapper.h
deleted file mode 100644
index a57f51d8f1..0000000000
--- a/lite/backends/x86/target_wrapper.h
+++ /dev/null
@@ -1,22 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
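A note on `TargetWrapper<TARGET(kX86)>::MemcpySync` above: on x86 the host and the "device" share one address space, so every `IoDirection` degenerates to a plain byte copy. A small illustrative call site (hypothetical buffers; the `IoDirection::HtoD` member name is assumed from lite/core/target_wrapper.h):

```cpp
#include <cstring>
#include "lite/core/target_wrapper.h"

int main() {
  char src[16] = "hello, lite";
  char dst[16] = {0};
  // The direction argument exists for interface symmetry with CUDA/OpenCL
  // targets; the x86 specialization ignores it.
  paddle::lite::TargetWrapper<TARGET(kX86)>::MemcpySync(
      dst, src, sizeof(src), paddle::lite::IoDirection::HtoD);
  return std::strcmp(dst, src);  // 0 on success
}
```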
- -#pragma once -#include "lite/core/target_wrapper.h" - -namespace paddle { -namespace lite { -namespace x86 {} // namespace x86 -} // namespace lite -} // namespace paddle diff --git a/lite/backends/x86/warpctc_lib_path.h.in b/lite/backends/x86/warpctc_lib_path.h.in deleted file mode 100644 index dc5064f457..0000000000 --- a/lite/backends/x86/warpctc_lib_path.h.in +++ /dev/null @@ -1,17 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#define WARPCTC_LIB_PATH "@WARPCTC_INSTALL_DIR@/lib/" diff --git a/lite/core/CMakeLists.txt b/lite/core/CMakeLists.txt deleted file mode 100644 index ff80accbb7..0000000000 --- a/lite/core/CMakeLists.txt +++ /dev/null @@ -1,124 +0,0 @@ -if (WITH_TESTING) - lite_cc_library(lite_gtest_main SRCS lite_gtest_main.cc DEPS gtest gflags) -endif() -lite_cc_library(target_wrapper SRCS target_wrapper.cc - DEPS target_wrapper_host place - X86_DEPS target_wrapper_x86 - CUDA_DEPS target_wrapper_cuda - CL_DEPS cl_target_wrapper - FPGA_DEPS fpga_target_wrapper) - -lite_cc_library(memory SRCS memory.cc DEPS target_wrapper CL_DEPS cl_target_wrapper) - -set(tensor_extra_deps "") -if (LITE_WITH_FPGA) - set(tensor_extra_deps lite_tensor_fpga) -endif() -lite_cc_library(tensor SRCS tensor.cc DEPS memory ${tensor_extra_deps}) - - -if (NOT LITE_ON_TINY_PUBLISH) - proto_library(framework_proto SRCS framework.proto) -endif() - -if (LITE_WITH_X86) -lite_cc_library(variable SRCS variable.cc DEPS tensor) -lite_cc_library(types SRCS types.cc) -else() -lite_cc_library(variable SRCS variable.cc DEPS tensor) -lite_cc_library(types SRCS types.cc) -endif() -lite_cc_library(op_registry SRCS op_registry.cc DEPS kernel) -lite_cc_library(scope SRCS scope.cc DEPS tensor) -lite_cc_library(device_info SRCS device_info.cc DEPS tensor) - -if (LITE_WITH_ARM) -lite_cc_library(context SRCS context.cc DEPS tensor any device_info CL_DEPS cl_context gflags NPU_DEPS ${npu_ddk_libs}) -else() -lite_cc_library(context SRCS context.cc DEPS tensor any device_info eigen3 CL_DEPS cl_context gflags) -endif() - -#----------------------------------------------- NOT CHANGE ----------------------------------------------- -# A trick to generate the paddle_use_kernels.h -add_custom_command( - COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/parse_kernel_registry.py - ${kernels_src_list} - ${CMAKE_SOURCE_DIR}/lite/api/paddle_use_kernels.h - OUTPUT ${CMAKE_SOURCE_DIR}/lite/api/paddle_use_kernels.h - ) -# A trick to generate the paddle_use_ops.h -add_custom_command( - COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/parse_op_registry.py - ${ops_src_list} - ${CMAKE_SOURCE_DIR}/lite/api/paddle_use_ops.h - OUTPUT ${CMAKE_SOURCE_DIR}/lite/api/paddle_use_ops.h - ) -# generate fake kernels for memory_optimize_tool -add_custom_command( - COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/create_fake_kernel_registry.py - ${kernels_src_list} - ${CMAKE_BINARY_DIR}/all_kernel_faked.cc - OUTPUT ${CMAKE_BINARY_DIR}/all_kernel_faked.cc - ) 
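The `add_custom_command` "tricks" above exist because ops and kernels register themselves through static initializers that the linker would otherwise drop; the generated headers force every registration to be referenced and therefore linked in. The generated `paddle_use_kernels.h` is, in essence, a list of `USE_LITE_KERNEL` statements; a hypothetical excerpt is shown below (the real contents depend on `${kernels_src_list}` at build time):

```cpp
// paddle_use_kernels.h -- generated by parse_kernel_registry.py (illustrative)
#pragma once
#include "paddle_lite_factory_helper.h"

USE_LITE_KERNEL(fc, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(conv2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(softmax, kARM, kFloat, kNCHW, def);
```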
-add_custom_target(op_list_h DEPENDS ${CMAKE_SOURCE_DIR}/lite/api/paddle_use_ops.h) -add_custom_target(kernel_list_h DEPENDS ${CMAKE_SOURCE_DIR}/lite/api/paddle_use_kernels.h) -add_custom_target(all_kernel_faked_cc DEPENDS ${CMAKE_BINARY_DIR}/all_kernel_faked.cc) - -#----------------------------------------------- NOT CHANGE ----------------------------------------------- -lite_cc_library(kernel SRCS kernel.cc DEPS context type_system target_wrapper any op_params tensor - ) -lite_cc_library(op SRCS op_lite.cc DEPS scope op_registry target_wrapper kernel - cpp_op_desc tensor - ) - -add_dependencies(kernel kernel_list_h) -add_dependencies(op op_list_h) - - -lite_cc_library(type_system SRCS type_system.cc DEPS tensor target_wrapper) - -lite_cc_library(program SRCS program.cc - DEPS op kernel model_parser ${ops} ${cpp_wrapper} - PROFILE_DEPS basic_profiler) - -if (NOT LITE_ON_TINY_PUBLISH) - lite_cc_library(optimizer SRCS optimizer.cc DEPS mir_pass_manager model_parser program) - add_subdirectory(mir) - add_subdirectory(profile) - add_subdirectory(arena) -endif() - -# for mobile, unnecessary to compile the following testings. -if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) - return() -endif() - -# lite_cc_library(program_fake_utils SRCS program_fake_utils.cc DEPS mir_ssa_graph -# scope op_registry proto_desc op -# ${ops} -# ${host_kernels} -# ) - -lite_cc_test(test_scope SRCS scope_test.cc DEPS scope) -lite_cc_test(test_kernel SRCS kernel_test.cc DEPS kernel target_wrapper any) -lite_cc_test(test_op SRCS op_lite_test.cc DEPS op) -lite_cc_test(test_tensor SRCS lite_tensor_test.cc DEPS tensor) -lite_cc_test(test_type_system SRCS type_system_test.cc DEPS type_system utils) -#lite_cc_test(test_optimizer SRCS optimizer_test.cc DEPS mir_pass_manager program_fake_utils mir_passes optimizer fc_op) -lite_cc_test(test_types SRCS types_test.cc DEPS types) -lite_cc_test(test_memory SRCS memory_test.cc DEPS memory) -lite_cc_test(test_context SRCS context_test.cc DEPS context) - - -# # A trick to generate the paddle_use_kernels.h -# execute_process( -# COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/parse_kernel_registry.py -# ${kernels_src_list} -# ${CMAKE_SOURCE_DIR}/lite/api/paddle_use_kernels.h -# ) -# # A trick to generate the paddle_use_ops.h -# execute_process( -# COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/parse_op_registry.py -# ${ops_src_list} -# ${CMAKE_SOURCE_DIR}/lite/api/paddle_use_ops.h -# ) diff --git a/lite/core/arena/CMakeLists.txt b/lite/core/arena/CMakeLists.txt deleted file mode 100644 index 854d2f4172..0000000000 --- a/lite/core/arena/CMakeLists.txt +++ /dev/null @@ -1,10 +0,0 @@ -# To make sure the test framework is only actived in TESTING mode. -if(NOT WITH_TESTING) - return() -endif() - -lite_cc_library(arena_framework SRCS framework.cc DEPS program) - -if(NOT LITE_WITH_OPENCL AND (LITE_WITH_X86 OR LITE_WITH_ARM)) - lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${x86_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) -endif() diff --git a/lite/core/arena/framework.cc b/lite/core/arena/framework.cc deleted file mode 100644 index c59c078787..0000000000 --- a/lite/core/arena/framework.cc +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/core/arena/framework.h"
-#include "lite/core/context.h"
-
-namespace paddle {
-namespace lite {
-namespace arena {
-
-void TestCase::CreateInstruction() {
-  auto op = LiteOpRegistry::Global().Create(op_desc().Type());
-  CHECK(op) << "no op for " << op_desc().Type();
-  op->Attach(*op_desc_, inst_scope_);
-  auto kernels = op->CreateKernels({place_});
-  // filter out the target kernel
-  CHECK(!kernels.empty()) << "No kernel found for place "
-                          << place_.DebugString();
-  auto it = std::remove_if(
-      kernels.begin(), kernels.end(), [&](std::unique_ptr<KernelBase>& k) {
-        return k->alias() == alias_;
-      });
-  CHECK(it != kernels.end()) << "failed to create the kernel in "
-                             << place_.DebugString()
-                             << " with alias: " << alias_;
-  // prepare context
-  (*it)->SetContext(std::move(ctx_));
-  instruction_.reset(new Instruction(op, std::move(*it)));
-}
-
-void TestCase::PrepareInputsForInstruction() {
-  for (auto& arg : op_desc().InputArgumentNames()) {
-    for (auto& var : op_desc().Input(arg)) {
-      std::string kernel_key = instruction_->kernel()->key_with_alias();
-      const auto* param_type = ParamTypeRegistry::Global().RetrieveInArgument(
-          place_, kernel_key, arg);
-
-      const auto* inst_type = Type::GetTensorTy(TARGET(kHost));
-      CHECK(scope_->FindVar(var));
-      const auto* shared_tensor = scope_->FindTensor(var);
-      if (!TargetCompatibleTo(*inst_type, *param_type->type)) {
-        /// Create a tensor in the instruction's scope, alloc memory and then
-        /// copy data there.
-        auto* target_tensor = inst_scope_->NewTensor(var);
-        CHECK(!shared_tensor->dims().empty()) << "shared_tensor is empty yet";
-        target_tensor->Resize(shared_tensor->dims());
-        TargetCopy(param_type->type->target(),
-                   target_tensor->mutable_data(param_type->type->target(),
-                                               shared_tensor->memory_size()),
-                   shared_tensor->raw_data(),
-                   shared_tensor->memory_size());
-      }
-    }
-  }
-}
-
-}  // namespace arena
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/arena/framework.h b/lite/core/arena/framework.h
deleted file mode 100644
index 48a8571a19..0000000000
--- a/lite/core/arena/framework.h
+++ /dev/null
@@ -1,258 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
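`CreateInstruction` above relies on `std::remove_if` semantics: elements for which the predicate is false are compacted toward the front, and the returned iterator marks where the "removed" tail begins, which is why the code dereferences `*it` as the matching kernel. Note that the standard leaves the elements at and after that iterator in a valid but unspecified state, so this pattern deserves care. A tiny self-contained illustration of the partitioning (plain ints instead of kernels):

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> v{1, 5, 2, 5, 3};
  // Non-matching elements are shifted forward; `it` points at the first
  // slot past them, i.e. where the "removed" tail starts.
  auto it = std::remove_if(v.begin(), v.end(), [](int x) { return x == 5; });
  std::printf("%zu elements kept\n",
              static_cast<size_t>(it - v.begin()));  // 3 elements kept
  return 0;
}
```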
-
-#pragma once
-#include <gtest/gtest.h>
-#include <algorithm>
-#include <cmath>
-#include <chrono>  // NOLINT
-#include <cstring>
-#include <memory>
-#include <string>
-#include <utility>
-#include <vector>
-#include "lite/core/op_registry.h"
-#include "lite/core/program.h"
-#include "lite/core/scope.h"
-#include "lite/core/types.h"
-#include "lite/model_parser/cpp/op_desc.h"
-
-namespace paddle {
-namespace lite {
-namespace arena {
-
-/*
- * Init data and prepare the op.
- */
-class TestCase {
- public:
-  explicit TestCase(const Place& place, const std::string& alias)
-      : place_(place), scope_(new Scope), alias_(alias) {
-    ctx_ = ContextScheduler::Global().NewContext(place_.target);
-  }
-
-  void Prepare() {
-    PrepareScopes();
-    PrepareData();
-    op_desc_.reset(new cpp::OpDesc);
-    PrepareOpDesc(op_desc_.get());
-
-    PrepareOutputsForInstruction();
-    CreateInstruction();
-    PrepareInputsForInstruction();
-  }
-
-  /// Run the target instruction, that is, run the test operator.
-  void RunInstruction() { instruction_->Run(); }
-
-  KernelContext* context() { return ctx_.get(); }
-
-  /// The baseline should be implemented; it acts like an operator, taking
-  /// several tensors as input and producing several tensors as output.
-  virtual void RunBaseline(Scope* scope) = 0;
-
-  /// Check the precision of the output tensors. It will compare the same
-  /// tensor in two scopes, one from the instruction execution and the other
-  /// from the baseline.
-  template <typename T>
-  bool CheckPrecision(const std::string& var_name, float abs_error);
-
-  const cpp::OpDesc& op_desc() { return *op_desc_; }
-
-  // Check whether the output tensor is consistent with the output definition
-  // in the kernel registry.
-  void CheckKernelConsistWithDefinition() {}
-
-  Scope& scope() { return *scope_; }
-
-  Scope* baseline_scope() { return base_scope_; }
-  Scope* inst_scope() { return inst_scope_; }
-
- protected:
-  // Prepare inputs in scope() for the Tester.
-  virtual void PrepareData() = 0;
-
-  /// Prepare a tensor on the host. The tensors will be created in scope_.
-  /// Need to specify the targets other than X86 or ARM.
-  template <typename T>
-  void SetCommonTensor(const std::string& var_name,
-                       const DDim& ddim,
-                       const T* data,
-                       const LoD& lod = {}) {
-    auto* tensor = scope_->NewTensor(var_name);
-    tensor->Resize(ddim);
-    auto* d = tensor->mutable_data<T>();
-    memcpy(d, data, ddim.production() * sizeof(T));
-
-    // set lod
-    if (!lod.empty()) *tensor->mutable_lod() = lod;
-  }
-
-  // Prepare for the operator.
-  virtual void PrepareOpDesc(cpp::OpDesc* op_desc) = 0;
-
- public:
-  const Instruction& instruction() { return *instruction_; }
-
- private:
-  std::unique_ptr<KernelContext> ctx_;
-  void CreateInstruction();
-
-  void PrepareScopes() {
-    inst_scope_ = &scope_->NewScope();
-    base_scope_ = &scope_->NewScope();
-  }
-
-  // Check shape
-  // TODO(Superjomn) Move this method to utils or DDim?
-  bool ShapeEquals(const DDim& a, const DDim& b) {
-    if (a.size() != b.size()) return false;
-    for (int i = 0; i < a.size(); i++) {
-      if (a[i] != b[i]) return false;
-    }
-    return true;
-  }
-
-  /// Copy the input tensors to the target devices needed by the instruction.
-  void PrepareInputsForInstruction();
-
-  // Create output tensors and variables.
-  void PrepareOutputsForInstruction() {
-    for (auto x : op_desc().output_vars()) {
-      inst_scope_->NewTensor(x);
-      base_scope_->NewTensor(x);
-    }
-  }
-
- private:
-  std::shared_ptr<Scope> scope_;
-  // The workspace for the Instruction.
-  Scope* inst_scope_{};
-  // The workspace for the baseline implementation.
-  Scope* base_scope_{};
-  std::unique_ptr<cpp::OpDesc> op_desc_;
-  std::unique_ptr<Instruction> instruction_;
-  Place place_;
-  std::string alias_;
-};
-
-class Arena {
-  float abs_error_{};
-
- public:
-  Arena(std::unique_ptr<TestCase>&& tester,
-        const Place& place,
-        float abs_error = 1e-5)
-      : tester_(std::move(tester)), place_(place), abs_error_(abs_error) {
-    tester_->Prepare();
-  }
-
-  bool TestPrecision() {
-    tester_->RunBaseline(tester_->baseline_scope());
-    tester_->RunInstruction();
-
-    bool success = true;
-    for (auto& out : tester_->op_desc().OutputArgumentNames()) {
-      for (auto& var : tester_->op_desc().Output(out)) {
-        success = success && CompareTensor(out, var);
-      }
-    }
-    LOG(INFO) << "done";
-    return success;
-  }
-
-  void TestPerformance(int times = 100) {
-    auto timer = std::chrono::high_resolution_clock::now();
-    for (int i = 0; i < times; i++) {
-      tester_->RunInstruction();
-    }
-    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(
-        std::chrono::high_resolution_clock::now() - timer);
-    LOG(INFO) << "average duration: " << duration.count() << " ms";
-  }
-
- private:
-  // input_name: X
-  bool CompareTensor(const std::string& arg_name,
-                     const std::string& var_name) {
-    // get tensor type.
-    const Type* type =
-        tester_->instruction().kernel()->GetOutputDeclType(arg_name);
-
-    switch (type->precision()) {
-      case PRECISION(kFloat):
-        return tester_->CheckPrecision<float>(var_name, abs_error_);
-      case PRECISION(kInt8):
-        return tester_->CheckPrecision<int8_t>(var_name, abs_error_);
-      case PRECISION(kInt32):
-        return tester_->CheckPrecision<int32_t>(var_name, abs_error_);
-      case PRECISION(kBool):
-        return tester_->CheckPrecision<bool>(var_name, abs_error_);
-
-      default:
-        LOG(FATAL) << "not support type " << PrecisionToStr(type->precision());
-    }
-  }
-
- private:
-  std::unique_ptr<TestCase> tester_;
-  Place place_;
-};
-
-template <typename T>
-bool TestCase::CheckPrecision(const std::string& var_name, float abs_error) {
-  auto a_tensor = inst_scope_->FindTensor(var_name);
-  auto b_tensor = base_scope_->FindTensor(var_name);
-  CHECK(a_tensor);
-  CHECK(b_tensor);
-
-  CHECK(ShapeEquals(a_tensor->dims(), b_tensor->dims()));
-
-  CHECK(a_tensor->lod() == b_tensor->lod()) << "lod not match";
-
-  // The baseline should output in host devices.
-  CHECK(b_tensor->target() == TARGET(kHost) ||
-        b_tensor->target() == TARGET(kX86) ||
-        b_tensor->target() == TARGET(kARM));
-
-  const T* a_data{};
-  switch (a_tensor->target()) {
-    case TARGET(kX86):
-    case TARGET(kHost):
-    case TARGET(kARM):
-      a_data = static_cast<const T*>(a_tensor->raw_data());
-      break;
-
-    default:
-      // Before comparing, data needs to be copied from the `target` device
-      // to the host.
-      LOG(FATAL) << "Not supported";
-  }
-
-  CHECK(a_data);
-
-  const T* b_data = static_cast<const T*>(b_tensor->raw_data());
-
-  bool success = true;
-  for (int i = 0; i < a_tensor->dims().production(); i++) {
-    EXPECT_NEAR(a_data[i], b_data[i], abs_error);
-    if (fabsf(a_data[i] - b_data[i]) > abs_error) {
-      success = false;
-    }
-  }
-  return success;
-}
-
-}  // namespace arena
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/arena/framework_test.cc b/lite/core/arena/framework_test.cc
deleted file mode 100644
index 411ab26a71..0000000000
--- a/lite/core/arena/framework_test.cc
+++ /dev/null
@@ -1,83 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/arena/framework.h" -#include -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" - -namespace paddle { -namespace lite { - -class ScaleComputeTester : public arena::TestCase { - // common attributes for this op. - std::string input_ = "x"; - std::string output_ = "out"; - float scale_ = 1.2f; - float bias_ = 0.f; - DDim dims_{{3, 2, 10}}; - - public: - explicit ScaleComputeTester(const Place& place, const std::string& alias) - : TestCase(place, alias) {} - - void RunBaseline(Scope* scope) override { - auto* out = scope->NewTensor(output_); - CHECK(out); - out->Resize(dims_); - auto* out_data = out->mutable_data(); - - auto* x = scope->FindTensor(input_); - const auto* x_data = x->data(); - - for (int i = 0; i < dims_.production(); i++) { - out_data[i] = x_data[i] * scale_ + bias_; - } - } - - void PrepareOpDesc(cpp::OpDesc* op_desc) { - op_desc->SetType("scale"); - op_desc->SetInput("X", {input_}); - op_desc->SetOutput("Out", {output_}); - op_desc->SetAttr("scale", scale_); - op_desc->SetAttr("bias", bias_); - op_desc->SetAttr("bias_after_scale", false); - } - - void PrepareData() override { - std::vector data(dims_.production()); - - for (int i = 0; i < dims_.production(); i++) { - data[i] = i * 1.1; - } - - SetCommonTensor(input_, dims_, data.data()); - } -}; - -TEST(scale, basic) { -#ifdef LITE_WITH_X86 - Place place(TARGET(kX86)); -#endif -#ifdef LITE_WITH_ARM - Place place(TARGET(kARM)); -#endif - std::unique_ptr tester(new ScaleComputeTester(place, "def")); - arena::Arena arena(std::move(tester), place); - - arena.TestPrecision(); -} - -} // namespace lite -} // namespace paddle diff --git a/lite/core/context.cc b/lite/core/context.cc deleted file mode 100644 index 948aac0c79..0000000000 --- a/lite/core/context.cc +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/context.h" - -#ifdef LITE_WITH_OPENCL -DEFINE_string(cl_path, "/data/local/tmp/opencl", "The OpenCL kernels path."); -#endif - -namespace paddle { -namespace lite {} // namespace lite -} // namespace paddle diff --git a/lite/core/context.h b/lite/core/context.h deleted file mode 100644 index bac0e3a627..0000000000 --- a/lite/core/context.h +++ /dev/null @@ -1,400 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "lite/utils/any.h" -#ifdef LITE_WITH_CUDA -#include "lite/backends/cuda/blas.h" -#include "lite/backends/cuda/cuda_utils.h" -#endif -#ifdef LITE_WITH_OPENCL -#include -#include -#include "lite/backends/opencl/cl_context.h" -#include "lite/backends/opencl/cl_runtime.h" -#endif -#ifdef LITE_WITH_NPU -#include "lite/backends/npu/npu_helper.h" -#endif - -#include -#include -#include -#include -#include -#include -#include "lite/core/device_info.h" -#include "lite/core/target_wrapper.h" -#include "lite/core/tensor.h" -#include "lite/utils/all.h" - -#ifdef LITE_WITH_OPENCL -DECLARE_string(cl_path); -#endif - -namespace paddle { -namespace lite { - -template -class Context; - -using HostContext = Context; -using X86Context = Context; -using CUDAContext = Context; -using ARMContext = Context; -using NPUContext = Context; -using OpenCLContext = Context; -using FPGAContext = Context; - -template <> -class Context { - public: - // NOTE: InitOnce should only be used by ContextScheduler - void InitOnce() {} - - void CopySharedTo(HostContext* ctx) {} - - std::string name() const { return "HostContext"; } -}; - -#ifdef LITE_WITH_NPU -template <> -class Context { - public: - Context() {} - explicit Context(const NPUContext& ctx); - // NOTE: InitOnce should only be used by ContextScheduler - void InitOnce() {} - void CopySharedTo(NPUContext* ctx) {} - - NPUContext& operator=(const NPUContext& ctx) {} - std::string name() const { return "NPUContext"; } - hiai::AiModelMngerClient* client(const std::string& model_name) const { - return npu::DeviceInfo::Global().client(model_name); - } -}; -#endif - -#ifdef LITE_WITH_ARM -template <> -class Context { - public: - Context() {} - explicit Context(const ARMContext& ctx); - - ARMContext& operator=(const ARMContext& ctx) {} - - // NOTE: InitOnce should only be used by ContextScheduler - void InitOnce() { DeviceInfo::Init(); } - - void CopySharedTo(ARMContext* ctx) {} - - void SetRunMode(lite_api::PowerMode mode, int threads) { - return DeviceInfo::Global().SetRunMode(mode, threads); - } - void SetCache(int l1size, int l2size, int l3size) { - return DeviceInfo::Global().SetCache(l1size, l2size, l3size); - } - void SetArch(ARMArch arch) { return DeviceInfo::Global().SetArch(arch); } - - lite_api::PowerMode mode() const { return DeviceInfo::Global().mode(); } - int threads() const { return DeviceInfo::Global().threads(); } - ARMArch arch() const { return DeviceInfo::Global().arch(); } - int l1_cache_size() const { return DeviceInfo::Global().l1_cache_size(); } - int l2_cache_size() const { return DeviceInfo::Global().l2_cache_size(); } - int l3_cache_size() const { return DeviceInfo::Global().l3_cache_size(); } - int llc_size() const { return DeviceInfo::Global().llc_size(); } - bool has_dot() const { return DeviceInfo::Global().has_dot(); } - bool has_fp16() const { return DeviceInfo::Global().has_fp16(); } - - template - T* workspace_data() { - return DeviceInfo::Global().workspace_data(); - } - - bool ExtendWorkspace(size_t size) { - return DeviceInfo::Global().ExtendWorkspace(size); - } - - std::string name() const { 
return "ARMContext"; } -}; -#endif - -#ifdef LITE_WITH_FPGA -// TODO(tianxiaogang): add needed implementation to context -template <> -class Context { - public: - Context() {} - void InitOnce() {} - - FPGAContext& operator=(const FPGAContext& ctx) {} - - void CopySharedTo(FPGAContext* ctx) {} - - std::string name() const { return "FPGAContext"; } -}; -#endif - -#ifdef LITE_WITH_CUDA -// Only works with CUDA kernels. -template <> -class Context { - public: - typename Env::Devs& devs = - Env::Global(); - // NOTE: InitOnce should only be used by ContextScheduler - void InitOnce() { - cublas_fp32_ = std::make_shared>(); - } - void Init(int dev_id, int exec_stream_id = 0, int io_stream_id = 0) { - CHECK_GT(devs.size(), 0) - << "Env is not initialized or current target is not exit!"; - if (dev_id >= devs.size()) { - LOG(WARNING) << "device index exceeds the number of devices, set to " - "default device(0)!"; - device_id_ = 0; - } else { - device_id_ = dev_id; - } - if (io_stream_id >= devs[dev_id].max_stream()) { - LOG(WARNING) << "data stream index exceeds the maximum stream number, " - "set to default stream(0)!"; - io_stream_id = 0; - } - if (exec_stream_id >= devs[dev_id].max_stream()) { - LOG(WARNING) << "exec stream index exceeds the maximum stream number, " - "set to default stream(0)!"; - exec_stream_id = 0; - } - - exec_stream_ = devs[dev_id].exec_streams()[exec_stream_id]; - io_stream_ = devs[dev_id].io_streams()[io_stream_id]; - - exec_stream_id_ = exec_stream_id; - io_stream_id_ = io_stream_id; - } - void CopySharedTo(CUDAContext* ctx) { - CHECK(ctx); - CHECK(cublas_fp32_) << "cublas_fp32 should be set first"; - ctx->cublas_fp32_ = cublas_fp32_; - } - - const cudaStream_t exec_stream() { return exec_stream_; } - void SetExecStream(cudaStream_t stream) { exec_stream_ = stream; } - - const cudaStream_t io_stream() { return io_stream_; } - void SetIoStream(cudaStream_t stream) { io_stream_ = stream; } - - std::shared_ptr> cublas_fp32() { return cublas_fp32_; } - void SetCuBlasFP32(std::shared_ptr> cublas_fp32) { - cublas_fp32_ = cublas_fp32; - } - - const std::vector& input_events() { return input_events_; } - void SetInputEvents(const std::vector& input_events) { - input_events_.clear(); - input_events_.assign(input_events.begin(), input_events.end()); - } - - const std::vector& output_events() { return output_events_; } - void SetOutputEvents(const std::vector& output_events) { - output_events_.clear(); - output_events_.assign(output_events.begin(), output_events.end()); - } - - std::string name() const { return "CUDAContext"; } - - private: - int device_id_; - // overall information - int exec_stream_id_; - int io_stream_id_; - cudaStream_t exec_stream_; - cudaStream_t io_stream_; - - // not thread-safe, should allocate for each thread. 
- std::shared_ptr> cublas_fp32_; - - // kernel information - std::vector input_events_; - std::vector output_events_; -}; -#endif - -#ifdef LITE_WITH_X86 -template <> -class Context { - public: - Context() {} - - Context(Context&& ctx) {} - - // NOTE: InitOnce should only be used by ContextScheduler - void InitOnce() {} - - void CopySharedTo(X86Context* ctx) {} - - std::string name() const { return "X86Context"; } - - private: - // overall information - // - // kernel information -}; -#endif - -#ifdef LITE_WITH_OPENCL -template <> -class Context { - std::shared_ptr cl_context_; - using WaitListType = - std::unordered_map(nullptr)), - std::shared_ptr>; - std::shared_ptr cl_wait_list_; - - public: - CLContext* cl_context() { return cl_context_.get(); } - WaitListType* cl_wait_list() { return cl_wait_list_.get(); } - - void InitOnce() { - // Init cl runtime. - CHECK(CLRuntime::Global()->IsInitSuccess()) << "OpenCL runtime init failed"; - CLRuntime::Global()->set_cl_path(FLAGS_cl_path); - - cl_context_ = std::make_shared(); - cl_wait_list_ = std::make_shared(); - } - - void CopySharedTo(OpenCLContext* ctx) { - ctx->cl_context_ = cl_context_; - ctx->cl_wait_list_ = cl_wait_list_; - } -}; -#endif - -// Context for running a kernel. -// Holds the necessary resource and information. -class KernelContext { - public: - template - ContextT& As() { - if (!ctx_.valid()) { - ctx_.set(); - } - return *ctx_.get_mutable(); - } - - private: - Any ctx_; -}; - -// The ContextScheduler helps to assign different context for each kernel. -class ContextScheduler { - public: - static ContextScheduler& Global() { - static auto* x = new ContextScheduler; - return *x; - } - - std::unique_ptr NewContext(TargetType target) { - std::unique_ptr ctx(new KernelContext); - switch (target) { - case TARGET(kHost): - kernel_contexts_[TargetType::kHost].As().CopySharedTo( - &ctx->As()); - break; -#ifdef LITE_WITH_X86 - case TARGET(kX86): - kernel_contexts_[TargetType::kX86].As().CopySharedTo( - &ctx->As()); - break; -#endif -#ifdef LITE_WITH_CUDA - case TARGET(kCUDA): { - int dev_id = TargetWrapper::GetCurDevice(); - auto& context = ctx->As(); - context.Init(dev_id); - kernel_contexts_[TargetType::kCUDA].As().CopySharedTo( - &context); - } break; -#endif -#ifdef LITE_WITH_ARM - case TARGET(kARM): - kernel_contexts_[TargetType::kARM].As().CopySharedTo( - &ctx->As()); - break; -#endif -#ifdef LITE_WITH_NPU - case TARGET(kNPU): - kernel_contexts_[TargetType::kNPU].As().CopySharedTo( - &ctx->As()); - break; -#endif -#ifdef LITE_WITH_OPENCL - case TARGET(kOpenCL): - kernel_contexts_[TargetType::kOpenCL].As().CopySharedTo( - &ctx->As()); - break; -#endif -#ifdef LITE_WITH_FPGA - case TARGET(kFPGA): - kernel_contexts_[TargetType::kFPGA].As().CopySharedTo( - &ctx->As()); - break; -#endif - default: -#ifndef LITE_ON_MODEL_OPTIMIZE_TOOL - LOG(FATAL) << "unsupported target " << TargetToStr(target); -#endif - break; - } - return ctx; - } - - private: - template - void InitContext() { - kernel_contexts_[Type].As().InitOnce(); - } - - ContextScheduler() { - InitContext(); -#ifdef LITE_WITH_X86 - InitContext(); -#endif -#ifdef LITE_WITH_CUDA - InitContext(); -#endif -#ifdef LITE_WITH_ARM - InitContext(); -#endif -#ifdef LITE_WITH_OPENCL - InitContext(); -#endif -#ifdef LITE_WITH_FPGA - InitContext(); -#endif -#ifdef LITE_WITH_NPU - InitContext(); -#endif - } - - private: - std::map kernel_contexts_; -}; - -} // namespace lite -} // namespace paddle diff --git a/lite/core/context_test.cc b/lite/core/context_test.cc deleted file mode 100644 
index 80b642bfad..0000000000
--- a/lite/core/context_test.cc
+++ /dev/null
@@ -1,51 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/core/context.h"
-#include <gtest/gtest.h>
-
-namespace paddle {
-namespace lite {
-
-// #ifdef LITE_WITH_X86
-// TEST(ContextScheduler, NewContext) {
-//   auto ctx1_p = ContextScheduler::Global().NewContext(TargetType::kX86);
-//   auto ctx2_p = ContextScheduler::Global().NewContext(TargetType::kX86);
-//   ASSERT_FALSE(ctx1_p.get() == ctx2_p.get());
-
-//   auto& ctx1 = ctx1_p->As<X86Context>();
-//   auto& ctx2 = ctx2_p->As<X86Context>();
-
-//   ASSERT_EQ(ctx1.name(), "X86Context");
-//   ASSERT_EQ(ctx2.name(), "X86Context");
-
-//   ASSERT_FALSE(ctx1.x86_device_context() == nullptr ||
-//                ctx2.x86_device_context() == nullptr);
-//   ASSERT_FALSE(ctx1.x86_execution_context() == nullptr ||
-//                ctx2.x86_execution_context() == nullptr);
-
-//   ASSERT_TRUE(ctx1.x86_device_context() != ctx2.x86_device_context());
-//   ASSERT_TRUE(ctx1.x86_execution_context() != ctx2.x86_execution_context());
-
-//   using device_ctx_t = ::paddle::platform::CPUDeviceContext;
-//   using exec_ctx_t = ::paddle::framework::ExecutionContext;
-//   auto* device_ctx = new device_ctx_t;
-//   ctx1.SetX86DeviceContext(std::unique_ptr<device_ctx_t>(device_ctx));
-//   ctx1.SetX86ExecutionContext(
-//       std::unique_ptr<exec_ctx_t>(new exec_ctx_t(*device_ctx)));
-// }
-// #endif
-
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/device_info.cc b/lite/core/device_info.cc
deleted file mode 100644
index de53d9ba67..0000000000
--- a/lite/core/device_info.cc
+++ /dev/null
@@ -1,1151 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Parts of the following code in this file refs to
-// https://github.com/Tencent/ncnn/blob/master/src/cpu.cpp
-// Tencent is pleased to support the open source community by making ncnn
-// available.
-//
-// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
-//
-// Licensed under the BSD 3-Clause License (the "License"); you may not use this
-// file except in compliance with the License. You may obtain a copy of the
-// License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the
-// License for the specific language governing permissions and limitations under
-// the License.
-
-#ifdef LITE_WITH_LINUX
-#include <sys/syscall.h>
-#include <unistd.h>
-#endif
-#if __APPLE__
-#include "TargetConditionals.h"
-#if LITE_WITH_IPHONE
-#include <mach/machine.h>
-#include <sys/sysctl.h>
-#include <sys/types.h>
-#endif  // LITE_WITH_IPHONE
-#endif  // __APPLE__
-
-#ifdef ARM_WITH_OMP
-#include <omp.h>
-#endif
-
-#include <algorithm>
-#include <limits>
-#include "lite/core/device_info.h"
-
-namespace paddle {
-namespace lite {
-
-#ifdef LITE_WITH_ARM
-
-#ifdef TARGET_IOS
-const int DEFAULT_L1_CACHE_SIZE = 64 * 1024;
-const int DEFAULT_L2_CACHE_SIZE = 2048 * 1024;
-const int DEFAULT_L3_CACHE_SIZE = 0;
-#else
-const int DEFAULT_L1_CACHE_SIZE = 32 * 1024;
-const int DEFAULT_L2_CACHE_SIZE = 512 * 1024;
-const int DEFAULT_L3_CACHE_SIZE = 0;
-#endif
-
-int get_cpu_num() {
-#ifdef LITE_WITH_LINUX
-  // get cpu count from /sys/devices/system/cpu/cpu%d/uevent
-  int max_cpu_num = 20;
-  int cpu_num = 0;
-  for (int i = 0; i < max_cpu_num; ++i) {
-    char path[256];
-    snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/uevent", i);
-    FILE* fp = fopen(path, "rb");
-    if (!fp) {
-      break;
-    }
-    cpu_num++;
-    fclose(fp);
-  }
-  if (cpu_num < 1) {
-    cpu_num = 1;
-  }
-  return cpu_num;
-#elif defined(TARGET_IOS)
-  int cpu_num = 0;
-  size_t len = sizeof(cpu_num);
-  sysctlbyname("hw.ncpu", &cpu_num, &len, NULL, 0);
-  if (cpu_num < 1) {
-    cpu_num = 1;
-  }
-  return cpu_num;
-#else
-  return 1;
-#endif
-}
-
-size_t get_mem_size() {
-#ifdef LITE_WITH_LINUX
-  // get total memory size (in kB) from /proc/meminfo
-  FILE* fp = fopen("/proc/meminfo", "rb");
-  if (!fp) {
-    return 1;
-  }
-  size_t memsize = 0;
-  char line[1024];
-  while (!feof(fp)) {
-    char* s = fgets(line, 1024, fp);
-    if (!s) {
-      break;
-    }
-    sscanf(s, "MemTotal: %zu kB", &memsize);
-  }
-  fclose(fp);
-  return memsize;
-#elif defined(TARGET_IOS)
-  // to be implemented
-  printf("not implemented, set to default 4GB\n");
-  return 4096 * 1024;
-#endif
-  return 0;
-}
-
-void get_cpu_arch(std::vector<ARMArch>* archs, const int cpu_num) {
-  archs->resize(cpu_num);
-  for (int i = 0; i < cpu_num; ++i) {
-    archs->at(i) = kARMArch_UNKOWN;
-  }
-#ifdef LITE_WITH_LINUX
-  //! get CPU ARCH
-  FILE* fp = fopen("/proc/cpuinfo", "rb");
-  if (!fp) {
-    return;
-  }
-  int cpu_idx = 0;
-  char line[1024];
-  while (!feof(fp)) {
-    char* s = fgets(line, 1024, fp);
-    if (!s) {
-      break;
-    }
-    if (strstr(line, "part") != NULL) {
-      ARMArch arch_type = kARMArch_UNKOWN;
-      int arch_id = 0;
-      sscanf(s, "CPU part\t: %x", &arch_id);
-      switch (arch_id) {
-        case 0xd03:
-          arch_type = kA53;
-          break;
-        case 0xd05:
-          arch_type = kA55;
-          break;
-        case 0xd07:
-          arch_type = kA57;
-          break;
-        case 0xd08:
-          arch_type = kA72;
-          break;
-        case 0xd09:
-          arch_type = kA73;
-          break;
-        case 0xd0a:
-          arch_type = kA75;
-          break;
-        case 0xd40:
-          arch_type = kA76;
-          break;
-        case 0x804:
-          // 855
-          arch_type = kA76;
-          break;
-        case 0x805:
-          // 855
-          arch_type = kA55;
-          break;
-        case 0x802:
-          // 845
-          arch_type = kA75;
-          break;
-        case 0x803:
-          // 845
-          arch_type = kA55;
-          break;
-        case 0x801:
-          // 835
-          arch_type = kA73;
-          break;
-        case 0x800:
-          // 835
-          arch_type = kA73;
-          break;
-        case 0x205:
-          // 820
-          arch_type = kA72;
-          break;
-        default:
-          LOG(ERROR) << "Unknown cpu arch: " << arch_id;
-      }
-      archs->at(cpu_idx) = arch_type;
-      cpu_idx++;
-    }
-  }
-  fclose(fp);
-  for (; cpu_idx > 0 && cpu_idx < cpu_num; ++cpu_idx) {
-    archs->at(cpu_idx) = archs->at(cpu_idx - 1);
-  }
-#elif defined(TARGET_IOS)
-  for (int i = 0; i < cpu_num; ++i) {
-    archs->at(i) = kAPPLE;
-  }
-#endif
-}
-
-#ifdef LITE_WITH_LINUX
-
-std::string get_cpu_name() {
-  FILE* fp = fopen("/proc/cpuinfo", "rb");
-  if (!fp) {
-    return "";
-  }
-  char line[1024];
-  while (!feof(fp)) {
-    char* s = fgets(line, 1024, fp);
-    if (!s) {
-      break;
-    }
-    if (strstr(line, "Hardware") != NULL) {
-      fclose(fp);
-      return std::string(line);
-    }
-  }
-  fclose(fp);
-  return "";
-}
-
-void get_cpu_max_min_freq(int cpu_id, int* max_freq, int* min_freq) {
-  *max_freq = 0;
-  *min_freq = 0;
-  // first try, for all possible cpu
-  char path[256];
-  snprintf(path,
-           sizeof(path),
-           "/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state",
-           cpu_id);
-  FILE* fp = fopen(path, "rb");
-  if (!fp) {
-    // second try, for online cpu
-    snprintf(path,
-             sizeof(path),
-             "/sys/devices/system/cpu/cpu%d/cpufreq/stats/time_in_state",
-             cpu_id);
-    fp = fopen(path, "rb");
-    if (!fp) {
-      // third try, for online cpu
-      // get max_freq
-      snprintf(path,
-               sizeof(path),
-               "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq",
-               cpu_id);
-      fp = fopen(path, "rb");
-      if (!fp) {
-        return;
-      }
-      fscanf(fp, "%d", max_freq);
-      fclose(fp);
-      // get min_freq
-      snprintf(path,
-               sizeof(path),
-               "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_min_freq",
-               cpu_id);
-      fp = fopen(path, "rb");
-      if (!fp) {
-        return;
-      }
-      fscanf(fp, "%d", min_freq);
-      fclose(fp);
-      return;
-    }
-  }
-  *min_freq = std::numeric_limits<int>::max();
-  while (!feof(fp)) {
-    int freq = 0;
-    int nscan = fscanf(fp, "%d %*d", &freq);
-    if (nscan != 1) {
-      break;
-    }
-    if (freq > *max_freq) {
-      *max_freq = freq;
-    }
-    if (freq < *min_freq) {
-      *min_freq = freq;
-    }
-  }
-  fclose(fp);
-}
-
-void sort_cpuid_by_max_freq(const std::vector<int>& max_freqs,
-                            std::vector<int>* cpu_ids,
-                            std::vector<int>* cluster_ids) {
-  int cpu_num = max_freqs.size();
-  if (cpu_num == 0) {
-    return;
-  }
-  cpu_ids->resize(cpu_num);
-  cluster_ids->resize(cpu_num);
-  for (int i = 0; i < cpu_num; i++) {
-    cpu_ids->at(i) = i;
-  }
-  // sort cpuid as big core first
-  // simple bubble sort
-  for (int i = 0; i < cpu_num; i++) {
-    for (int j = i + 1; j < cpu_num; j++) {
-      if (max_freqs[i] < max_freqs[j]) {
-        // swap
-        int tmp = cpu_ids->at(i);
-        cpu_ids->at(i) = cpu_ids->at(j);
-        cpu_ids->at(j) = tmp;
-      }
-    }
-  }
-  // SMP
-  int mid_max_freq =
-      (max_freqs[cpu_ids->at(0)] + max_freqs[cpu_ids->at(cpu_num - 1)]) / 2;
-
-  for (int i = 0; i < cpu_num; i++) {
-    cpu_ids->at(i) = i;
-    if (max_freqs[i] >= mid_max_freq) {
-      cluster_ids->at(i) = 0;
-    } else {
-      cluster_ids->at(i) = 1;
-    }
-  }
-}
-
-void get_cpu_cache_size(int cpu_id,
-                        int* l1_cache_size,
-                        int* l2_cache_size,
-                        int* l3_cache_size) {
-  int max_cache_idx_num = 10;
-  *l1_cache_size = DEFAULT_L1_CACHE_SIZE;
-  *l2_cache_size = DEFAULT_L2_CACHE_SIZE;
-  *l3_cache_size = DEFAULT_L3_CACHE_SIZE;
-  for (int i = 0; i < max_cache_idx_num; i++) {
-    char path[256];
-    snprintf(path,
-             sizeof(path),
-             "/sys/devices/system/cpu/cpu%d/cache/index%d/level",
-             cpu_id,
-             i);
-    FILE* fp = fopen(path, "rb");
-    if (fp) {
-      int level = -1;
-      fscanf(fp, "%d", &level);
-      fclose(fp);
-      snprintf(path,
-               sizeof(path),
-               "/sys/devices/system/cpu/cpu%d/cache/index%d/size",
-               cpu_id,
-               i);
-      fp = fopen(path, "rb");
-      if (fp) {
-        int size = -1;
-        fscanf(fp, "%d", &size);
-        fclose(fp);
-        if (size >= 0) {
-          if (level == 1) {
-            *l1_cache_size = size * 1024;
-          } else if (level == 2) {
-            *l2_cache_size = size * 1024;
-          } else if (level == 3) {
-            *l3_cache_size = size * 1024;
-          }
-        }
-      }
-    }
-  }
-}
-
-bool check_cpu_online(const std::vector<int>& cpu_ids) {
-  if (cpu_ids.size() == 0) {
-    return false;
-  }
-  char path[256];
-  bool all_online = true;
-  for (int i = 0; i < cpu_ids.size(); ++i) {
-    snprintf(
-        path, sizeof(path), "/sys/devices/system/cpu/cpu%d/online", cpu_ids[i]);
-    FILE* fp = fopen(path, "rb");
-    int is_online = 0;
-    if (fp) {
-      fscanf(fp, "%d", &is_online);
-      fclose(fp);
-    } else {
-      LOG(ERROR) << "Failed to query the online status of CPU id:"
-                 << cpu_ids[i];
-    }
-    if (is_online == 0) {
-      all_online = false;
-      LOG(ERROR) << "CPU id:" << cpu_ids[i] << " is offline";
-    }
-  }
-  return all_online;
-}
-
-int set_sched_affinity(const std::vector<int>& cpu_ids) {
-// #define CPU_SETSIZE 1024
-// #define __NCPUBITS  (8 * sizeof (unsigned long))
-// typedef struct
-// {
-//    unsigned long __bits[CPU_SETSIZE / __NCPUBITS];
-// } cpu_set_t;
-
-// set affinity for thread
-#ifdef __GLIBC__
-  pid_t pid = syscall(SYS_gettid);
-#else
-  pid_t pid = gettid();
-#endif
-  cpu_set_t mask;
-  CPU_ZERO(&mask);
-  for (int i = 0; i < cpu_ids.size(); ++i) {
-    CPU_SET(cpu_ids[i], &mask);
-  }
-  int syscallret = syscall(__NR_sched_setaffinity, pid, sizeof(mask), &mask);
-  if (syscallret) {
-    return -1;
-  }
-  return 0;
-}
-
-bool bind_threads(const std::vector<int> cpu_ids) {
-#ifdef ARM_WITH_OMP
-  int thread_num = cpu_ids.size();
-  omp_set_num_threads(thread_num);
-  std::vector<int> ssarets;
-  for (int i = 0; i < thread_num; ++i) {
-    ssarets.push_back(0);
-  }
-#pragma omp parallel for
-  for (int i = 0; i < thread_num; i++) {
-    ssarets[i] = set_sched_affinity(cpu_ids);
-  }
-  for (int i = 0; i < thread_num; i++) {
-    if (ssarets[i] != 0) {
-      LOG(ERROR) << "Set cpu affinity failed, core id: " << cpu_ids[i];
-      return false;
-    }
-  }
-#else   // ARM_WITH_OMP
-  std::vector<int> first_cpu_id;
-  first_cpu_id.push_back(cpu_ids[0]);
-  int ssaret = set_sched_affinity(first_cpu_id);
-  if (ssaret != 0) {
-    LOG(ERROR) << "Set cpu affinity failed, core id: " << cpu_ids[0];
-    return false;
-  }
-#endif  // ARM_WITH_OMP
-  return true;
-}
-
-#endif  // LITE_WITH_LINUX
-
-void DeviceInfo::SetDotInfo(int argc, ...)
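// [Editor's annotation, not in the original file] SetDotInfo below and its
// siblings (SetFP16Info, SetFP32Info, SetCacheInfo, SetArchInfo) share one
// variadic convention: argc == 1 means a single value applied uniformly to
// every core, while argc == 2 means a (big-core value, little-core value)
// pair. For example, SetArchInfo(2, kA76, kA55) tags the big cluster as
// Cortex-A76 and the little cluster as Cortex-A55, as in the Snapdragon 855
// branch of SetCPUInfoByName further down.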
{ - va_list arg_ptr; - va_start(arg_ptr, argc); - dot_.resize(core_num_); - if (argc == 1) { - bool flag = va_arg(arg_ptr, int) > 0; - for (int i = 0; i < core_num_; ++i) { - dot_[i] = flag; - } - } else { - bool flag_big_core = va_arg(arg_ptr, int) > 0; - bool flag_little_core = va_arg(arg_ptr, int) > 0; - int big_core_num = big_core_ids_.size(); - int little_core_num = little_core_ids_.size(); - for (int i = 0; i < big_core_num; ++i) { - dot_[big_core_ids_[i]] = flag_big_core; - } - for (int i = 0; i < little_core_num; ++i) { - dot_[little_core_ids_[i]] = flag_little_core; - } - } - va_end(arg_ptr); -} - -void DeviceInfo::SetFP16Info(int argc, ...) { - va_list arg_ptr; - va_start(arg_ptr, argc); - fp16_.resize(core_num_); - if (argc == 1) { - bool flag = va_arg(arg_ptr, int) > 0; - for (int i = 0; i < core_num_; ++i) { - fp16_[i] = flag; - } - } else { - bool flag_big_core = va_arg(arg_ptr, int) > 0; - bool flag_little_core = va_arg(arg_ptr, int) > 0; - int big_core_num = big_core_ids_.size(); - int little_core_num = little_core_ids_.size(); - for (int i = 0; i < big_core_num; ++i) { - fp16_[big_core_ids_[i]] = flag_big_core; - } - for (int i = 0; i < little_core_num; ++i) { - fp16_[little_core_ids_[i]] = flag_little_core; - } - } - va_end(arg_ptr); -} - -void DeviceInfo::SetFP32Info(int argc, ...) { - va_list arg_ptr; - va_start(arg_ptr, argc); - fp32_.resize(core_num_); - if (argc == 1) { - bool flag = va_arg(arg_ptr, int) > 0; - for (int i = 0; i < core_num_; ++i) { - fp32_[i] = flag; - } - } else { - bool flag_big_core = va_arg(arg_ptr, int) > 0; - bool flag_little_core = va_arg(arg_ptr, int) > 0; - int big_core_num = big_core_ids_.size(); - int little_core_num = little_core_ids_.size(); - for (int i = 0; i < big_core_num; ++i) { - fp32_[big_core_ids_[i]] = flag_big_core; - } - for (int i = 0; i < little_core_num; ++i) { - fp32_[little_core_ids_[i]] = flag_little_core; - } - } - va_end(arg_ptr); -} - -// cache_id : 0 -> L1, 1 -> L2, 2 -> L3 -void DeviceInfo::SetCacheInfo(int cache_id, int argc, ...) { - va_list arg_ptr; - va_start(arg_ptr, argc); - std::vector* cache; - switch (cache_id) { - case 0: - cache = &L1_cache_; - break; - case 1: - cache = &L2_cache_; - break; - case 2: - cache = &L3_cache_; - break; - default: - break; - } - cache->resize(core_num_); - if (argc == 1) { - int cache_size = va_arg(arg_ptr, int); - for (int i = 0; i < core_num_; ++i) { - (*cache)[i] = cache_size; - } - } else { - int big_core_num = big_core_ids_.size(); - int little_core_num = little_core_ids_.size(); - int big_core_cache_size = va_arg(arg_ptr, int); - int little_core_cache_size = va_arg(arg_ptr, int); - for (int i = 0; i < big_core_num; ++i) { - (*cache)[big_core_ids_[i]] = big_core_cache_size; - } - for (int i = 0; i < little_core_num; ++i) { - (*cache)[little_core_ids_[i]] = little_core_cache_size; - } - } - va_end(arg_ptr); -} - -void DeviceInfo::SetArchInfo(int argc, ...) 
{ - va_list arg_ptr; - va_start(arg_ptr, argc); - archs_.resize(core_num_); - if (argc == 1) { - ARMArch arch = (ARMArch)va_arg(arg_ptr, int); - for (int i = 0; i < core_num_; ++i) { - archs_[i] = arch; - } - } else { - ARMArch big_core_arch = (ARMArch)va_arg(arg_ptr, int); - ARMArch little_core_arch = (ARMArch)va_arg(arg_ptr, int); - int big_core_num = big_core_ids_.size(); - int little_core_num = little_core_ids_.size(); - for (int i = 0; i < big_core_num; ++i) { - archs_[big_core_ids_[i]] = big_core_arch; - } - for (int i = 0; i < little_core_num; ++i) { - archs_[little_core_ids_[i]] = little_core_arch; - } - } - va_end(arg_ptr); -} - -bool DeviceInfo::SetCPUInfoByName() { - /* Snapdragon */ - if (dev_name_.find("SM8150") != std::string::npos) { // 855 - core_num_ = 8; - core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - big_core_ids_ = {4, 5, 6, 7}; - little_core_ids_ = {0, 1, 2, 3}; - cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; - SetArchInfo(2, kA76, kA55); - SetCacheInfo(0, 2, 64 * 1024, 32 * 1024); - SetCacheInfo(1, 2, 256 * 1024, 128 * 1024); - SetCacheInfo(2, 1, 2048 * 1024); - SetFP16Info(1, 1); - SetDotInfo(1, 1); - return true; - } else if (dev_name_.find("SDM845") != std::string::npos) { // 845 - core_num_ = 8; - core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - big_core_ids_ = {4, 5, 6, 7}; - little_core_ids_ = {0, 1, 2, 3}; - cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; - SetArchInfo(2, kA75, kA55); - SetCacheInfo(0, 2, 64 * 1024, 32 * 1024); - SetCacheInfo(1, 2, 256 * 1024, 128 * 1024); - SetCacheInfo(2, 1, 2048 * 1024); - SetFP16Info(1, 1); - return true; - } else if (dev_name_.find("SDM710") != std::string::npos) { // 710 - core_num_ = 8; - core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - big_core_ids_ = {6, 7}; - little_core_ids_ = {0, 1, 2, 3, 4, 5}; - cluster_ids_ = {1, 1, 1, 1, 1, 1, 0, 0}; - SetArchInfo(2, kA75, kA55); - SetCacheInfo(0, 2, 64 * 1024, 32 * 1024); - SetCacheInfo(1, 2, 256 * 1024, 128 * 1024); - SetCacheInfo(2, 1, 1024 * 1024); - return true; - } else if (dev_name_.find("MSM8998") != std::string::npos) { // 835 - core_num_ = 8; - core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - big_core_ids_ = {4, 5, 6, 7}; - little_core_ids_ = {0, 1, 2, 3}; - cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; - SetArchInfo(2, kA73, kA53); - SetCacheInfo(0, 2, 64 * 1024, 32 * 1024); - SetCacheInfo(1, - 2, - 1024 * 1024, - /*real cache size is 2M, while that will get bad performace - on conv3x3s1 or gemm, set to 1M or 512K*/ - 1024 * 1024); - return true; - } else if (dev_name_.find("MSM8996") != std::string::npos) { // 820 - core_num_ = 4; - core_ids_ = {0, 1, 2, 3}; - big_core_ids_ = {2, 3}; - little_core_ids_ = {0, 1}; - cluster_ids_ = {1, 1, 0, 0}; - SetArchInfo(1, kA72); - SetCacheInfo(0, 1, 24 * 1024); - SetCacheInfo(1, 2, 1024 * 1024, 512 * 1024); - return true; - } else if (dev_name_.find("SDM660") != std::string::npos || - dev_name_.find("SDM636") != std::string::npos) { // 660, 636 - core_num_ = 8; - core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - big_core_ids_ = {4, 5, 6, 7}; - little_core_ids_ = {0, 1, 2, 3}; - cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; - SetArchInfo(1, kA73); - SetCacheInfo(0, 2, 64 * 1024, 32 * 1024); - SetCacheInfo(1, 1, 1024 * 1024); - return true; - } else if (dev_name_.find("MSM8976") != std::string::npos) { // 652,653 - core_num_ = 8; - core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - big_core_ids_ = {4, 5, 6, 7}; - little_core_ids_ = {0, 1, 2, 3}; - cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; - SetArchInfo(2, kA72, kA53); - SetCacheInfo(0, 1, 32 * 1024); - SetCacheInfo(1, 2, 1024 * 1024, 512 * 1024); - return 
true; - } else if (dev_name_.find("MSM8953") != std::string::npos) { // 625 - core_num_ = 8; - core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - big_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - little_core_ids_ = {}; - cluster_ids_ = {0, 0, 0, 0, 0, 0, 0, 0}; - SetArchInfo(1, kA53); - SetCacheInfo(0, 1, 32 * 1024); - SetCacheInfo(1, 1, 1024 * 1024); - return true; - } else if (dev_name_.find("MSM8939") != std::string::npos) { // 615 - core_num_ = 8; - core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - big_core_ids_ = {0, 1, 2, 3}; - little_core_ids_ = {4, 5, 6, 7}; - cluster_ids_ = {0, 0, 0, 0, 1, 1, 1, 1}; - SetArchInfo(1, kA53); - SetCacheInfo(0, 1, 32 * 1024); - SetCacheInfo(1, 2, 512 * 1024, 256 * 1024); - return true; - /* MediaTek */ - } else if (dev_name_.find("MT6797") != - std::string::npos) { // X20/X23/X25/X27 - core_num_ = 10; - core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; - big_core_ids_ = {8, 9}; - little_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - cluster_ids_ = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0}; - SetArchInfo(2, kA72, kA53); - SetCacheInfo(0, 1, 32 * 1024); - SetCacheInfo(1, 2, 1024 * 1024, 512 * 1024); - return true; - } else if (dev_name_.find("MT6799") != std::string::npos) { // X30 - core_num_ = 10; - core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; - big_core_ids_ = {8, 9}; - little_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - cluster_ids_ = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0}; - SetArchInfo(2, kA73, kA53); - return true; - } else if (dev_name_.find("MT6795") != std::string::npos || - dev_name_.find("MT6762") != std::string::npos || - dev_name_.find("MT6755T") != std::string::npos || - dev_name_.find("MT6755S") != std::string::npos || - dev_name_.find("MT6753") != std::string::npos || - dev_name_.find("MT6752") != std::string::npos || - dev_name_.find("MT6750") != std::string::npos) { - // X10, P22, P15/P18, MT6753, MT6752/MT6752M, MT6750 - core_num_ = 8; - core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - big_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - little_core_ids_ = {}; - cluster_ids_ = {0, 0, 0, 0, 0, 0, 0, 0}; - SetArchInfo(1, kA53); - return true; - } else if (dev_name_.find("MT6758") != std::string::npos || - dev_name_.find("MT6757") != std::string::npos || - dev_name_.find("MT6763") != std::string::npos || - dev_name_.find("MT6755M") != std::string::npos || - dev_name_.find("MT6755") != - std::string::npos) { // P30, P20/P25, P23, P10 - core_num_ = 8; - core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - big_core_ids_ = {4, 5, 6, 7}; - little_core_ids_ = {0, 1, 2, 3}; - cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; - SetArchInfo(1, kA53); - return true; - } else if (dev_name_.find("MT6771") != std::string::npos) { // P60 - core_num_ = 8; - core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - big_core_ids_ = {4, 5, 6, 7}; - little_core_ids_ = {0, 1, 2, 3}; - cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; - SetArchInfo(2, kA73, kA53); - return true; - } else if (dev_name_.find("MT6765") != std::string::npos || - dev_name_.find("MT6739") != std::string::npos || - dev_name_.find("MT6738") != std::string::npos || - dev_name_.find("MT6737") != - std::string::npos) { // A22, MT6739, MT6738, MT6767 - core_num_ = 4; - core_ids_ = {0, 1, 2, 3}; - big_core_ids_ = {0, 1, 2, 3}; - little_core_ids_ = {}; - cluster_ids_ = {0, 0, 0, 0}; - SetArchInfo(1, kA53); - return true; - } else if (dev_name_.find("KIRIN980") != std::string::npos) { // Kirin 980 - core_num_ = 8; - core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - big_core_ids_ = {4, 5, 6, 7}; - little_core_ids_ = {0, 1, 2, 3}; - cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; - SetArchInfo(2, kA76, kA55); - SetCacheInfo(0, 2, 64 * 
1024, 32 * 1024);
-    SetCacheInfo(1, 2, 512 * 1024, 128 * 1024);
-    SetCacheInfo(2, 1, 4096 * 1024);
-    SetFP16Info(1, 1);
-    SetDotInfo(1, 1);
-    return true;
-  }
-  return false;
-}
-
-void DeviceInfo::SetCPUInfoByProb() {
-#ifdef LITE_WITH_LINUX
-  // get big.LITTLE cores by sorting CPU frequency
-  sort_cpuid_by_max_freq(max_freqs_, &core_ids_, &cluster_ids_);
-  big_core_ids_.clear();
-  little_core_ids_.clear();
-  for (int i = 0; i < cluster_ids_.size(); ++i) {
-    if (cluster_ids_[i] == 0) {
-      big_core_ids_.push_back(core_ids_[i]);
-    } else {
-      little_core_ids_.push_back(core_ids_[i]);
-    }
-  }
-  // get l1, l2, l3 cache size for each core
-  for (int i = 0; i < core_num_; i++) {
-    get_cpu_cache_size(i, &(L1_cache_[i]), &(L2_cache_[i]), &(L3_cache_[i]));
-  }
-#endif  // LITE_WITH_LINUX
-}
-
-void DeviceInfo::RequestPowerFullMode(int thread_num) {
-  int big_core_size = big_core_ids_.size();
-  int little_core_size = little_core_ids_.size();
-  active_ids_.clear();
-  for (int i = 0; i < thread_num; ++i) {
-    if (i < big_core_size) {
-      active_ids_.push_back(big_core_ids_[i]);
-    } else if (i < big_core_size + little_core_size) {
-      active_ids_.push_back(little_core_ids_[i - big_core_size]);
-    }
-  }
-  mode_ = lite_api::PowerMode::LITE_POWER_FULL;
-}
-
-void DeviceInfo::RequestPowerHighMode(int thread_num) {
-  int big_core_size = big_core_ids_.size();
-  int little_core_size = little_core_ids_.size();
-  active_ids_.clear();
-  if (big_core_size > 0) {
-    mode_ = lite_api::PowerMode::LITE_POWER_HIGH;
-    if (thread_num > big_core_size) {
-      LOG(ERROR) << "Request thread num: " << thread_num
-                 << ", exceed the big cores size: " << big_core_size
-                 << ", truncate thread num to " << big_core_size;
-      active_ids_ = big_core_ids_;
-    } else {
-      for (int i = 0; i < thread_num; ++i) {
-        active_ids_.push_back(big_core_ids_[i]);
-      }
-    }
-  } else {
-    mode_ = lite_api::PowerMode::LITE_POWER_LOW;
-    LOG(ERROR) << "HIGH POWER MODE is not supported, switch to little cores.";
-    if (thread_num > little_core_size) {
-      active_ids_ = little_core_ids_;
-    } else {
-      for (int i = 0; i < thread_num; ++i) {
-        active_ids_.push_back(little_core_ids_[i]);
-      }
-    }
-  }
-}
-
-void DeviceInfo::RequestPowerLowMode(int thread_num) {
-  int big_core_size = big_core_ids_.size();
-  int little_core_size = little_core_ids_.size();
-  active_ids_.clear();
-  if (little_core_size > 0) {
-    mode_ = lite_api::PowerMode::LITE_POWER_LOW;
-    if (thread_num > little_core_size) {
-      LOG(WARNING) << "Request thread num: " << thread_num
-                   << ", exceed the little cores size: " << little_core_size
-                   << ", truncate thread num to " << little_core_size;
-      active_ids_ = little_core_ids_;
-    } else {
-      for (int i = 0; i < thread_num; i++) {
-        active_ids_.push_back(little_core_ids_[i]);
-      }
-    }
-  } else {
-    mode_ = lite_api::PowerMode::LITE_POWER_HIGH;
-    LOG(WARNING) << "LOW POWER MODE is not supported, switch to big cores";
-    if (thread_num > big_core_size) {
-      active_ids_ = big_core_ids_;
-    } else {
-      for (int i = 0; i < thread_num; i++) {
-        active_ids_.push_back(big_core_ids_[i]);
-      }
-    }
-  }
-}
-
-void DeviceInfo::RequestPowerNoBindMode(int thread_num) {
-  active_ids_.clear();
-  if (thread_num > core_ids_.size()) {
-    active_ids_ = core_ids_;
-  } else {
-    active_ids_.resize(thread_num);
-    for (int i = 0; i < thread_num; ++i) {
-      if (i < big_core_ids_.size()) {
-        active_ids_[i] = big_core_ids_[i];
-      } else {
-        active_ids_[i] = little_core_ids_[i - big_core_ids_.size()];
-      }
-    }
-  }
-  mode_ = lite_api::PowerMode::LITE_POWER_NO_BIND;
-}
-
-void DeviceInfo::RequestPowerRandHighMode(int shift_num, int thread_num) {
-  int big_core_size = big_core_ids_.size();
-  int little_core_size = little_core_ids_.size();
-  active_ids_.clear();
-  if (big_core_size > 0) {
-    mode_ = lite_api::PowerMode::LITE_POWER_RAND_HIGH;
-    if (thread_num > big_core_size) {
-      LOG(WARNING) << "Request thread num: " << thread_num
-                   << ", exceed the big cores size: " << big_core_size
-                   << ", truncate thread num to " << big_core_size;
-      active_ids_ = big_core_ids_;
-    } else {
-      for (int i = 0; i < thread_num; ++i) {
-        active_ids_.push_back(big_core_ids_[(i + shift_num) % big_core_size]);
-      }
-    }
-  } else {
-    mode_ = lite_api::PowerMode::LITE_POWER_LOW;
-    LOG(WARNING) << "HIGH POWER MODE is not supported, switch to little cores.";
-    if (thread_num > little_core_size) {
-      active_ids_ = little_core_ids_;
-    } else {
-      for (int i = 0; i < thread_num; ++i) {
-        active_ids_.push_back(little_core_ids_[i]);
-      }
-    }
-  }
-}
-
-void DeviceInfo::RequestPowerRandLowMode(int shift_num, int thread_num) {
-  int big_core_size = big_core_ids_.size();
-  int little_core_size = little_core_ids_.size();
-  active_ids_.clear();
-  if (little_core_size > 0) {
-    mode_ = lite_api::PowerMode::LITE_POWER_RAND_LOW;
-    if (thread_num > little_core_size) {
-      LOG(WARNING) << "Request thread num: " << thread_num
-                   << ", exceed the little cores size: " << little_core_size
-                   << ", truncate thread num to " << little_core_size;
-      active_ids_ = little_core_ids_;
-    } else {
-      for (int i = 0; i < thread_num; ++i) {
-        active_ids_.push_back(
-            little_core_ids_[(i + shift_num) % little_core_size]);
-      }
-    }
-  } else {
-    mode_ = lite_api::PowerMode::LITE_POWER_HIGH;
-    LOG(WARNING) << "LOW POWER MODE is not supported, switch to big cores.";
-    if (thread_num > big_core_size) {
-      active_ids_ = big_core_ids_;
-    } else {
-      for (int i = 0; i < thread_num; ++i) {
-        active_ids_.push_back(big_core_ids_[i]);
-      }
-    }
-  }
-}
-
-int DeviceInfo::Setup() {
-  core_num_ = get_cpu_num();
-  mem_size_ = get_mem_size();
-  get_cpu_arch(&archs_, core_num_);
-  // set default CPU info
-  SetCacheInfo(0, 1, DEFAULT_L1_CACHE_SIZE);
-  SetCacheInfo(1, 1, DEFAULT_L2_CACHE_SIZE);
-  SetCacheInfo(2, 1, DEFAULT_L3_CACHE_SIZE);
-  SetFP32Info(1, 1);
-  SetFP16Info(1, 0);
-  SetDotInfo(1, 0);
-  max_freqs_.resize(core_num_);
-  min_freqs_.resize(core_num_);
-#ifdef LITE_WITH_LINUX
-  // get max&min freq
-  for (int i = 0; i < core_num_; ++i) {
-    int max_freq, min_freq;
-    get_cpu_max_min_freq(i, &max_freq, &min_freq);
-    max_freqs_[i] = max_freq / 1000;
-    min_freqs_[i] = min_freq / 1000;
-  }
-  // get cache size and big.LITTLE core ids
-  dev_name_ = get_cpu_name();
-  if (!SetCPUInfoByName()) {
-    SetCPUInfoByProb();
-  }
-  core_ids_.resize(core_num_);
-  cluster_ids_.resize(core_num_);
-  for (int i = 0; i < core_num_; ++i) {
-    max_freqs_[i] = 1000000;
-    min_freqs_[i] = 1000000;
-    cluster_ids_[i] = 0;
-  }
-#else
-#ifdef TARGET_IOS
-  dev_name_ = "Apple";
-#else
-  dev_name_ = "Unknown";
-#endif
-  core_ids_.resize(core_num_);
-  cluster_ids_.resize(core_num_);
-  big_core_ids_.resize(core_num_);
-  for (int i = 0; i < core_num_; ++i) {
-    max_freqs_[i] = 1000000;
-    min_freqs_[i] = 1000000;
-    cluster_ids_[i] = 0;
-    core_ids_[i] = i;
-    big_core_ids_[i] = i;
-  }
-#endif
-  // output info
-  LOG(INFO) << "ARM multiprocessors name: " << dev_name_;
-  LOG(INFO) << "ARM multiprocessors number: " << core_num_;
-  for (int i = 0; i < core_num_; ++i) {
-    LOG(INFO) << "ARM multiprocessors ID: " << core_ids_[i]
-              << ", max freq: " << max_freqs_[i]
-              << ", min freq: " << min_freqs_[i]
-              << ", cluster ID: " << cluster_ids_[core_ids_[i]]
-              << ", CPU ARCH: A" << archs_[i];
-  }
-  LOG(INFO) << "L1 DataCache size is: ";
-  for (int i = 0; i < core_num_; ++i) {
-    LOG(INFO) << L1_cache_[i] / 1024 << " KB";
-  }
-  LOG(INFO) << "L2 Cache size is: ";
-  for (int i = 0; i < core_num_; ++i) {
-    LOG(INFO) << L2_cache_[i] / 1024 << " KB";
-  }
-  LOG(INFO) << "L3 Cache size is: ";
-  for (int i = 0; i < core_num_; ++i) {
-    LOG(INFO) << L3_cache_[i] / 1024 << " KB";
-  }
-  LOG(INFO) << "Total memory: " << mem_size_ << "KB";
-  // set default run mode
-  SetRunMode(lite_api::PowerMode::LITE_POWER_NO_BIND,
-             1);  // use single thread by default
-  return 0;
-}
-
-void DeviceInfo::SetRunMode(lite_api::PowerMode mode, int thread_num) {
-#ifdef ARM_WITH_OMP
-  thread_num = std::min(thread_num, core_num_);
-#else
-  thread_num = 1;  // force thread_num to 1 if OpenMP is disabled
-#endif
-#ifdef LITE_WITH_LINUX
-  int big_core_size = big_core_ids_.size();
-  int little_core_size = little_core_ids_.size();
-  int big_little_core_size = big_core_size + little_core_size;
-  thread_num = std::min(thread_num, big_little_core_size);
-  count_++;
-  int shift_num = (count_ / 10) % big_core_size;
-  switch (mode) {
-    case lite_api::LITE_POWER_FULL:
-      RequestPowerFullMode(thread_num);
-      break;
-    case lite_api::LITE_POWER_HIGH:
-      RequestPowerHighMode(thread_num);
-      break;
-    case lite_api::LITE_POWER_LOW:
-      RequestPowerLowMode(thread_num);
-      break;
-    case lite_api::LITE_POWER_NO_BIND:
-      RequestPowerNoBindMode(thread_num);
-      break;
-    case lite_api::LITE_POWER_RAND_HIGH:
-      RequestPowerRandHighMode(shift_num, thread_num);
-      break;
-    case lite_api::LITE_POWER_RAND_LOW:
-      RequestPowerRandLowMode(shift_num, thread_num);
-      break;
-    default:
-      LOG(FATAL) << "Unsupported power mode: " << mode;
-      break;
-  }
-  if (active_ids_.empty()) {
-    active_ids_.push_back(0);
-  }
-#ifdef ARM_WITH_OMP
-  omp_set_num_threads(active_ids_.size());
-#endif
-  if (mode_ != lite_api::LITE_POWER_NO_BIND) {
-    if (check_cpu_online(active_ids_)) {
-      bind_threads(active_ids_);
-    } else {
-      LOG(WARNING) << "Some cores are offline, switch to NO BIND MODE";
-      mode_ = lite_api::LITE_POWER_NO_BIND;
-    }
-  }
-#else   // LITE_WITH_LINUX
-  // only LITE_POWER_NO_BIND is supported in other OS
-  RequestPowerNoBindMode(thread_num);
-#ifdef ARM_WITH_OMP
-  omp_set_num_threads(active_ids_.size());
-#endif
-#endif  // LITE_WITH_LINUX
-  //! alloc memory for sgemm in this context
-  workspace_.Resize({llc_size()});
-  workspace_.mutable_data<float>();
-  arch_ = archs_[active_ids_[0]];
-}
-
-void DeviceInfo::SetCache(int l1size, int l2size, int l3size) {
-  SetCacheInfo(0, 1, l1size);
-  SetCacheInfo(1, 1, l2size);
-  SetCacheInfo(2, 1, l3size);
-  workspace_.Resize({2 * (l1size + l2size)});
-}
-
-bool DeviceInfo::ExtendWorkspace(int size) {
-  workspace_.Resize({size + llc_size()});
-  workspace_.mutable_data<float>();
-  return true;
-}
-
-#endif  // LITE_WITH_ARM
-
-#ifdef LITE_WITH_CUDA
-
-void Device<TARGET(kCUDA)>::Init() {
-  GetInfo();
-  CreateStream();
-}
-
-void Device<TARGET(kCUDA)>::GetInfo() {
-  cudaGetDeviceProperties(&device_prop_, idx_);
-  cudaRuntimeGetVersion(&runtime_version_);
-  sm_version_ = (device_prop_.major << 8 | device_prop_.minor);
-  has_hmma_ =
-      (sm_version_ == 0x0700 || sm_version_ == 0x0702 || sm_version_ == 0x0705);
-  has_fp16_ = (sm_version_ == 0x0602 || sm_version_ == 0x0600 ||
-               sm_version_ == 0x0503 || has_hmma_);
-  has_imma_ = (sm_version_ == 0x0702 || sm_version_ == 0x0705);
-  has_int8_ = (sm_version_ == 0x0601 || sm_version_ == 0x0700 || has_imma_);
-}
-
-void Device<TARGET(kCUDA)>::CreateStream() {
-  exec_stream_.clear();
-  io_stream_.clear();
-  for (int i = 0; i < max_stream_; i++) {
-    cudaStream_t exec_stream;
-    cudaStream_t io_stream;
-    cudaStreamCreate(&exec_stream);
-    cudaStreamCreate(&io_stream);
-    exec_stream_.push_back(exec_stream);
-    io_stream_.push_back(io_stream);
-  }
-}
-
-#endif
-
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/device_info.h b/lite/core/device_info.h
deleted file mode 100644
index 96f4680135..0000000000
--- a/lite/core/device_info.h
+++ /dev/null
@@ -1,209 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <cstdarg>
-#include <string>
-#include <vector>
-#include "lite/core/tensor.h"
-#include "lite/utils/cp_logging.h"
-
-namespace paddle {
-namespace lite {
-
-#ifdef LITE_WITH_ARM
-
-typedef enum {
-  kAPPLE = 0,
-  kA53 = 53,
-  kA55 = 55,
-  kA57 = 57,
-  kA72 = 72,
-  kA73 = 73,
-  kA75 = 75,
-  kA76 = 76,
-  kARMArch_UNKOWN = -1
-} ARMArch;
-
-class DeviceInfo {
- public:
-  static DeviceInfo& Global() {
-    static auto* x = new DeviceInfo;
-    return *x;
-  }
-
-  static int Init() {
-    static int ret = Global().Setup();
-    return ret;
-  }
-
-  int Setup();
-
-  void SetRunMode(lite_api::PowerMode mode, int thread_num);
-  void SetCache(int l1size, int l2size, int l3size);
-  void SetArch(ARMArch arch) { arch_ = arch; }
-
-  lite_api::PowerMode mode() const { return mode_; }
-  int threads() const { return active_ids_.size(); }
-  ARMArch arch() const { return arch_; }
-  int l1_cache_size() const { return L1_cache_[active_ids_[0]]; }
-  int l2_cache_size() const { return L2_cache_[active_ids_[0]]; }
-  int l3_cache_size() const { return L3_cache_[active_ids_[0]]; }
-  int llc_size() const {
-    auto size = L3_cache_[active_ids_[0]] > 0 ? L3_cache_[active_ids_[0]]
-                                              : L2_cache_[active_ids_[0]];
-    return size > 0 ? size : 512 * 1024;
-  }
-  bool has_dot() const { return dot_[active_ids_[0]]; }
-  bool has_fp16() const { return fp16_[active_ids_[0]]; }
-
-  template <typename T>
-  T* workspace_data() {
-    return reinterpret_cast<T*>(workspace_.mutable_data<float>());
-  }
-  bool ExtendWorkspace(int size);
-
- private:
-  int core_num_;
-  std::vector<int> max_freqs_;
-  std::vector<int> min_freqs_;
-  int mem_size_;
-  std::string dev_name_;
-
-  std::vector<int> L1_cache_;
-  std::vector<int> L2_cache_;
-  std::vector<int> L3_cache_;
-  std::vector<int> core_ids_;
-  std::vector<int> big_core_ids_;
-  std::vector<int> little_core_ids_;
-  std::vector<int> cluster_ids_;
-  std::vector<ARMArch> archs_;
-  std::vector<bool> fp32_;
-  std::vector<bool> fp16_;
-  std::vector<bool> dot_;
-
-  ARMArch arch_;
-  // LITE_POWER_HIGH stands for using big cores,
-  // LITE_POWER_LOW stands for using little cores,
-  // LITE_POWER_FULL stands for using all cores
-  lite_api::PowerMode mode_;
-  std::vector<int> active_ids_;
-  TensorLite workspace_;
-  int64_t count_{0};
-
-  void SetDotInfo(int argc, ...);
-  void SetFP16Info(int argc, ...);
-  void SetFP32Info(int argc, ...);
-  void SetCacheInfo(int cache_id, int argc, ...);
-  void SetArchInfo(int argc, ...);
-  bool SetCPUInfoByName();
-  void SetCPUInfoByProb();
-  void RequestPowerFullMode(int thread_num);
-  void RequestPowerHighMode(int thread_num);
-  void RequestPowerLowMode(int thread_num);
-  void RequestPowerNoBindMode(int thread_num);
-  void RequestPowerRandHighMode(int shift_num, int thread_num);
-  void RequestPowerRandLowMode(int shift_num, int thread_num);
-
-  DeviceInfo() = default;
-};
-
-#endif  // LITE_WITH_ARM
-
-template <TargetType Type>
-class Device;
-
-template <TargetType Type>
-class Env {
- public:
-  typedef TargetWrapper<Type> API;
-  typedef std::vector<Device<Type>> Devs;
-  static Devs& Global() {
-    static Devs* devs = new Devs();
-    return *devs;
-  }
-  static void Init(int max_stream = 4) {
-    Devs& devs = Global();
-    if (devs.size() > 0) {
-      return;
-    }
-    int count = 0;
-    // Get device count
-    count = API::num_devices();
-    if (count == 0) {
-      CHECK(false) << "No device found!";
-    } else {
-      LOG(INFO) << "Found " << count << " device(s)";
-    }
-    // create all devices
-    for (int i = 0; i < count; i++) {
-      auto dev = Device<Type>(i, max_stream);
-      dev.Init();
-      devs.push_back(dev);
-    }
-    LOG(INFO) << "dev size = " << devs.size();
-  }
-};
-
-#ifdef LITE_WITH_CUDA
-template <>
-class Device<TARGET(kCUDA)> {
- public:
-  Device(int dev_id, int max_stream = 1)
-      : idx_(dev_id), max_stream_(max_stream) {}
-  void Init();
-
-  int id() { return idx_; }
-  int max_stream() { return max_stream_; }
-  void SetId(int idx) { idx_ = idx; }
-  std::string name() { return device_prop_.name; }
-  int core_num() { return device_prop_.multiProcessorCount; }
-  float max_memory() { return device_prop_.totalGlobalMem / 1048576.; }
-  std::vector<cudaStream_t> exec_streams() { return exec_stream_; }
-  std::vector<cudaStream_t> io_streams() { return io_stream_; }
-
-  int sm_version() { return sm_version_; }
-  bool has_fp16() { return has_fp16_; }
-  bool has_int8() { return has_int8_; }
-  bool has_hmma() { return has_hmma_; }
-  bool has_imma() { return has_imma_; }
-  int runtime_version() { return runtime_version_; }
-
- private:
-  void CreateStream();
-  void GetInfo();
-
- private:
-  int max_stream_;
-  int idx_{0};
-  cudaDeviceProp device_prop_;
-  std::string device_name_;
-  float max_memory_;
-
-  int sm_version_;
-  bool has_fp16_;
-  bool has_int8_;
-  bool has_hmma_;
-  bool has_imma_;
-  int runtime_version_;
-  std::vector<cudaStream_t> exec_stream_;
-  std::vector<cudaStream_t> io_stream_;
-};
-
-template class Env<TARGET(kCUDA)>;
-#endif
-
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/framework.proto b/lite/core/framework.proto
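Editor's sketch (not part of the original patch): typical process-level setup
with the two facilities above. The surrounding main() is illustrative only;
all of the called names come from the headers shown here.

  #include "lite/core/device_info.h"

  int main() {
    using namespace paddle::lite;
  #ifdef LITE_WITH_ARM
    // Probe core count, frequencies, caches and big.LITTLE topology once.
    DeviceInfo::Init();
    // Bind two worker threads to the big cluster; this also sizes the shared
    // sgemm workspace from llc_size().
    DeviceInfo::Global().SetRunMode(lite_api::PowerMode::LITE_POWER_HIGH, 2);
    LOG(INFO) << "threads: " << DeviceInfo::Global().threads();
  #endif
  #ifdef LITE_WITH_CUDA
    // Enumerate CUDA devices and pre-create exec/io streams on each one.
    Env<TARGET(kCUDA)>::Init();
  #endif
    return 0;
  }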
deleted file mode 100644
index 6c60a041a1..0000000000
--- a/lite/core/framework.proto
+++ /dev/null
@@ -1,188 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-syntax = "proto2";
-// option optimize_for = LITE_RUNTIME;
-package paddle.framework.proto;
-
-// Any incompatible changes to ProgramDesc and its dependencies should
-// raise the version defined in version.h.
-//
-// Serialization and Deserialization codes should be modified in a way
-// that supports old versions following the version and compatibility policy.
-message Version { optional int64 version = 1 [ default = 0 ]; }
-
-enum AttrType {
-  INT = 0;
-  FLOAT = 1;
-  STRING = 2;
-  INTS = 3;
-  FLOATS = 4;
-  STRINGS = 5;
-  BOOLEAN = 6;
-  BOOLEANS = 7;
-  BLOCK = 8;
-  LONG = 9;
-  BLOCKS = 10;
-  LONGS = 11;
-}
-
-// OpDesc describes an instance of a C++ framework::OperatorBase
-// derived class type.
-message OpDesc {
-
-  message Attr {
-    required string name = 1;
-    required AttrType type = 2;
-    optional int32 i = 3;
-    optional float f = 4;
-    optional string s = 5;
-    repeated int32 ints = 6;
-    repeated float floats = 7;
-    repeated string strings = 8;
-    optional bool b = 10;
-    repeated bool bools = 11;
-    optional int32 block_idx = 12;
-    optional int64 l = 13;
-    repeated int32 blocks_idx = 14;
-    repeated int64 longs = 15;
-  };
-
-  message Var {
-    required string parameter = 1;
-    repeated string arguments = 2;
-  };
-
-  required string type = 3;
-  repeated Var inputs = 1;
-  repeated Var outputs = 2;
-  repeated Attr attrs = 4;
-  optional bool is_target = 5 [ default = false ];
-};
-
-// OpProto describes a C++ framework::OperatorBase derived class.
-message OpProto {
-
-  // VarProto describes the C++ type framework::Variable.
-  message Var {
-    required string name = 1;
-    required string comment = 2;
-
-    optional bool duplicable = 3 [ default = false ];
-    optional bool intermediate = 4 [ default = false ];
-    optional bool dispensable = 5 [ default = false ];
-  }
-
-  // AttrProto describes the C++ type Attribute.
-  message Attr {
-    required string name = 1;
-    required AttrType type = 2;
-    required string comment = 3;
-    // If that attribute is generated, it means the Paddle third
-    // language binding has responsibility to fill that
-    // attribute. End-User should not set that attribute.
-    optional bool generated = 4 [ default = false ];
-  }
-
-  required string type = 1;
-  repeated Var inputs = 2;
-  repeated Var outputs = 3;
-  repeated Attr attrs = 4;
-  required string comment = 5;
-}
-
-message VarType {
-  enum Type {
-    // Pod Types
-    BOOL = 0;
-    INT16 = 1;
-    INT32 = 2;
-    INT64 = 3;
-    FP16 = 4;
-    FP32 = 5;
-    FP64 = 6;
-    // Tensor<size_t> is used in C++.
- SIZE_T = 19; - UINT8 = 20; - INT8 = 21; - - // Other types that may need additional descriptions - LOD_TENSOR = 7; - SELECTED_ROWS = 8; - FEED_MINIBATCH = 9; - FETCH_LIST = 10; - STEP_SCOPES = 11; - LOD_RANK_TABLE = 12; - LOD_TENSOR_ARRAY = 13; - PLACE_LIST = 14; - READER = 15; - // Any runtime decided variable type is raw - // raw variables should manage their own allocations - // in operators like nccl_op - RAW = 17; - TUPLE = 18; - } - - required Type type = 1; - - message TensorDesc { - // Should only be PODType. Is enforced in C++ - required Type data_type = 1; - repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480] - } - optional TensorDesc selected_rows = 2; - - message LoDTensorDesc { - required TensorDesc tensor = 1; - optional int32 lod_level = 2 [ default = 0 ]; - } - optional LoDTensorDesc lod_tensor = 3; - - message LoDTensorArrayDesc { - required TensorDesc tensor = 1; - optional int32 lod_level = 2 [ default = 0 ]; - } - optional LoDTensorArrayDesc tensor_array = 4; - - message ReaderDesc { repeated LoDTensorDesc lod_tensor = 1; } - optional ReaderDesc reader = 5; - - message Tuple { repeated Type element_type = 1; } - optional Tuple tuple = 7; -} - -message VarDesc { - required string name = 1; - required VarType type = 2; - optional bool persistable = 3 [ default = false ]; -} - -message BlockDesc { - required int32 idx = 1; - required int32 parent_idx = 2; - repeated VarDesc vars = 3; - repeated OpDesc ops = 4; - optional int32 forward_block_idx = 5 [ default = -1 ]; -} - -// Please refer to -// https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md -// for more details. -// TODO(panyx0718): A model can have multiple programs. Need a -// way to distinguish them. Maybe ID or name? -message ProgramDesc { - repeated BlockDesc blocks = 1; - - optional Version version = 2; -} diff --git a/lite/core/kernel.cc b/lite/core/kernel.cc deleted file mode 100644 index 7ec718cb38..0000000000 --- a/lite/core/kernel.cc +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
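Editor's sketch (not part of the original patch): the framework.proto schema
above is what a serialized Paddle `__model__` file contains. A hedged sketch of
walking one with the generated protobuf bindings; it assumes protoc has been
run on framework.proto to produce framework.pb.h, and omits error handling.

  #include <cstdio>
  #include <fstream>
  #include <string>
  #include "framework.pb.h"  // assumed output of protoc on framework.proto

  void DumpOps(const std::string& model_path) {
    paddle::framework::proto::ProgramDesc program;
    std::ifstream fin(model_path, std::ios::binary);
    program.ParseFromIstream(&fin);  // proto2 binary wire format
    // Block 0 is the entry block; ops are stored in execution order.
    for (const auto& op : program.blocks(0).ops()) {
      std::printf("op: %s\n", op.type().c_str());
    }
  }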
- -#include "lite/core/kernel.h" -#include -#include "lite/utils/string.h" - -namespace paddle { -namespace lite { - -std::string KernelBase::summary() const { - STL::stringstream ss; - ss << op_type() << ":" << TargetToStr(target()) << "/" - << PrecisionToStr(precision()) << "/" << DataLayoutToStr(layout()) << "(" - << alias() << ")"; - return ss.str(); -} - -const Type *KernelBase::GetInputDeclType(const std::string &arg_name) const { - CHECK(!op_type_.empty()) << "op_type should be set first"; - const auto *type = ParamTypeRegistry::Global().RetrieveInArgument( - place(), GenParamTypeKey(), arg_name); - CHECK(type) << "no type registered for kernel [" << op_type_ - << "] input argument [" << arg_name << "]" - << " with key " << GenParamTypeKey(); - return type->type; -} - -const Type *KernelBase::GetOutputDeclType(const std::string &arg_name) const { - CHECK(!op_type_.empty()) << "op_type should be set first"; - const auto *type = ParamTypeRegistry::Global().RetrieveOutArgument( - place(), GenParamTypeKey(), arg_name); - CHECK(type) << "no type registered for kernel [" << GenParamTypeKey() - << "] output argument [" << arg_name << "]"; - return type->type; -} - -std::string KernelBase::GenParamTypeKey() const { - STL::stringstream ss; - ss << op_type() << "/" << alias_; - return ss.str(); -} - -void KernelBase::ParseKernelType(const std::string &kernel_type, - std::string *op_type, - std::string *alias, - Place *place) { - auto parts = Split(kernel_type, "/"); - CHECK_EQ(parts.size(), 5); - *op_type = parts[0]; - *alias = parts[1]; - - std::string target, precision, layout; - - target = parts[2]; - precision = parts[3]; - layout = parts[4]; - - place->target = static_cast(std::atoi(target.c_str())); - place->precision = static_cast(std::atoi(precision.c_str())); - place->layout = static_cast(std::atoi(layout.c_str())); -} - -std::string KernelBase::SerializeKernelType(const std::string &op_type, - const std::string &alias, - const Place &place) { - STL::stringstream ss; - ss << op_type << "/"; - ss << alias << "/"; - // We serialize the place value not the string representation here for - // easier deserialization. - ss << static_cast(place.target) << "/"; - ss << static_cast(place.precision) << "/"; - ss << static_cast(place.layout); - return ss.str(); -} - -bool ParamTypeRegistry::KeyCmp::operator()( - const ParamTypeRegistry::key_t &a, - const ParamTypeRegistry::key_t &b) const { - return a.hash() < b.hash(); -} - -STL::ostream &operator<<(STL::ostream &os, - const ParamTypeRegistry::KernelIdTy &other) { - std::string io_s = other.io == ParamTypeRegistry::IO::kInput ? "in" : "out"; - os << other.kernel_type << ":" << other.arg_name << ":" << io_s << ":" - << other.place.DebugString(); - return os; -} - -} // namespace lite -} // namespace paddle diff --git a/lite/core/kernel.h b/lite/core/kernel.h deleted file mode 100644 index 92eca6af54..0000000000 --- a/lite/core/kernel.h +++ /dev/null @@ -1,189 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
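Editor's sketch (not part of the original patch): SerializeKernelType in
kernel.cc above writes the Place enums numerically so that ParseKernelType can
round-trip them; the expected key below matches the assertion in kernel_test.cc
later in this patch.

  #include <string>
  #include "lite/core/kernel.h"

  void KernelTypeRoundTripDemo() {
    using namespace paddle::lite;
    Place place(TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
    // "fc/def/1/1/1": op type, alias, then numeric target/precision/layout.
    std::string key = KernelBase::SerializeKernelType("fc", "def", place);
    std::string op_type, alias;
    Place parsed;
    KernelBase::ParseKernelType(key, &op_type, &alias, &parsed);
    // parsed now equals place; op_type == "fc", alias == "def".
  }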
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <functional>
-#include <map>
-#include <memory>
-#include <set>
-#include <string>
-#include <vector>
-#include "lite/backends/arm/math/type_trans.h"
-#include "lite/core/context.h"
-#include "lite/core/target_wrapper.h"
-#include "lite/core/type_system.h"
-#include "lite/core/types.h"
-#include "lite/core/workspace.h"
-#include "lite/operators/op_params.h"
-#include "lite/utils/all.h"
-#include "lite/utils/replace_stl/stream.h"
-
-namespace paddle {
-namespace lite {
-
-// A base class with virtual functions to unify all the kernel implementations
-// on different targets.
-class KernelBase {
- public:
-  // type_infer_handler is used to infer an output type by considering the
-  // input types in the type system.
-  using type_infer_handler_t = std::function<const Type*(
-      const std::map<std::string, const Type*>& input_types,
-      const std::string& out_arg)>;
-
- protected:
-  /// Run some initialization before `Run`; it is invoked after `SetParam` and
-  /// `SetContext`, so that both param_ and context_ are valid.
-  virtual void PrepareForRun() {}
-
-  /// Run the kernel. Before Run, both the param_ and context_ should be valid.
-  virtual void Run() = 0;
-
- public:
-  void Launch() {
-    if (is_first_epoch_) {
-      PrepareForRun();
-      is_first_epoch_ = false;
-    }
-
-    // Reset the workspace so that every kernel in the same thread shares the
-    // temporary memory.
-    WorkSpace::Global_Host().AllocReset();
-#if defined(LITE_WITH_X86)
-    WorkSpace::Global_X86().AllocReset();
-#endif
-#if defined(LITE_WITH_CUDA)
-    WorkSpace::Global_CUDA().AllocReset();
-#endif
-    Run();
-  }
-
-  void SetContext(std::unique_ptr<KernelContext>&& ctx) {
-    ctx_ = std::move(ctx);
-  }
-  template <typename T>
-  void SetParam(T param) {
-    param_.set<T>(param);
-  }
-  template <typename P>
-  P& Param() const {
-    return *param_.get_mutable<P>();
-  }
-
-  // This is used in the kernels that take 'kAny' places and infer the
-  // output place. For `ScaleCompute` and `IoCopyCompute`, their input types are
-  // declared as 'kAny' in some Place field, and the output is also `kAny`, but
-  // in real execution, when some non-kAny type is taken as input, the
-  // output's kAny fields can be determined. For example, when the
-  // `ScaleCompute` takes `TensorFp32NCHWTy` as input, its output should also be
-  // `TensorFp32NCHWTy`. This type inference rule is different for each kernel,
-  // so we make it a virtual method.
-  // One can customize this handler to make a specific type inference rule for
-  // a kernel, or leave the default to force the kernel to use the system's
-  // type-inference rules.
-  virtual std::unique_ptr<type_infer_handler_t> GetTypeInferHandler() {
-    return nullptr;
-  }
-
-  void set_op_type(const std::string& type) { op_type_ = type; }
-  const std::string& op_type() const { return op_type_; }
-
-  // Get input declaration Type.
-  const Type* GetInputDeclType(const std::string& arg_name) const;
-
-  // Get output declaration Type.
-  const Type* GetOutputDeclType(const std::string& arg_name) const;
-
-  void set_alias(const std::string& x) { alias_ = x; }
-  const std::string& alias() const { return alias_; }
-
-  virtual Place place() const = 0;
-  virtual TargetType target() const = 0;
-  virtual PrecisionType precision() const = 0;
-  virtual DataLayoutType layout() const = 0;
-  const KernelContext* context() const { return ctx_.get(); }
-  KernelContext* mutable_context() { return ctx_.get(); }
-  virtual std::string name() const = 0;
-
-  // Short human-readable document.
-  std::string summary() const;
-  // Long human-readable document.
-  virtual std::string doc() const { return ""; }
-  // Generate the key of the parameter type.
-  std::string GenParamTypeKey() const;
-
-  // Used to serialize the kernel.
-  std::string SerializedKernelType() const {
-    return SerializeKernelType(op_type(), alias(), place());
-  }
-
-  static std::string SerializeKernelType(const std::string& op_type,
-                                         const std::string& alias,
-                                         const Place& place);
-
-  static void ParseKernelType(const std::string& kernel_type,
-                              std::string* op_type,
-                              std::string* alias,
-                              Place* place);
-
-  std::string key_with_alias() const { return op_type() + "/" + alias(); }
-
-  virtual ~KernelBase() = default;
-  void Torch() {}
-
- protected:
-  std::unique_ptr<KernelContext> ctx_{nullptr};
-  mutable operators::param_t param_;
-  // The corresponding op type.
-  std::string op_type_{};
-  // The extra identity to help differentiate a specific kernel; op_type_ +
-  // alias_ is the unique ID for the kernel.
-  std::string alias_{};
-  bool is_first_epoch_{true};
-};
-
-// Light-weight kernel implementation.
-// The OpKernel is designed to implement the specific algorithm on a target
-// device.
-// TODO(Superjomn) Consider to add a Platform type to differentiate CUDNN,
-// MKLDNN, plain CUDA C implementations.
-template <TargetType Target,
-          PrecisionType Precision,
-          DataLayoutType DataLayout = DATALAYOUT(kNCHW)>
-class KernelLite : public KernelBase {
- public:
-  // Run the kernel.
-  virtual void Run() { CHECK(false) << "Not Implemented"; }
-
-  TargetType target() const override { return Target; }
-  PrecisionType precision() const override { return Precision; }
-  DataLayoutType layout() const override { return DataLayout; }
-  Place place() const override { return Place{Target, Precision, DataLayout}; }
-  std::string name() const override;
-
-  void Touch() {}
-
-  KernelLite() = default;
-  virtual ~KernelLite() = default;
-};
-
-template <TargetType Target,
-          PrecisionType Precision,
-          DataLayoutType DataLayout>
-std::string KernelLite<Target, Precision, DataLayout>::name() const {
-  return op_type() + ":" + TargetToStr(Target) + "/" +
-         PrecisionToStr(Precision) + "/" + DataLayoutToStr(DataLayout);
-}
-
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/kernel_test.cc b/lite/core/kernel_test.cc
deleted file mode 100644
index 8ad8b47744..0000000000
--- a/lite/core/kernel_test.cc
+++ /dev/null
@@ -1,63 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/core/kernel.h"
-#include <gtest/gtest.h>
-#include "lite/core/op_lite.h"
-
-namespace paddle {
-namespace lite {
-namespace core {
-
-int test_code{-1};
-class SomeKernel : public KernelLite<TARGET(kHost), PRECISION(kFloat)> {
- public:
-  void Run() override {
-    LOG(INFO) << "SomeKernel executed";
-    LOG(INFO) << Param<operators::FcParam>().in_num_col_dims;
-    test_code = Param<operators::FcParam>().in_num_col_dims;
-  }
-
-  TargetType target() const override { return TARGET(kHost); }
-  PrecisionType precision() const override { return PRECISION(kFloat); }
-};
-
-TEST(Kernel, test) {
-  SomeKernel kernel;
-  operators::FcParam param;
-  param.in_num_col_dims = 100;
-  kernel.SetParam(param);
-  kernel.Run();
-  ASSERT_EQ(test_code, 100);
-}
-
-TEST(Kernel, kernel_type) {
-  const std::string op_type = "fc";
-  const std::string alias = "def";
-  Place place(TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
-  auto kernel_type = KernelBase::SerializeKernelType(op_type, alias, place);
-  LOG(INFO) << "kernel_type: " << kernel_type;
-  ASSERT_EQ(kernel_type, "fc/def/1/1/1");
-
-  std::string op_type1, alias1;
-  Place place1;
-  KernelBase::ParseKernelType(kernel_type, &op_type1, &alias1, &place1);
-  ASSERT_EQ(op_type, op_type1);
-  ASSERT_EQ(alias, alias1);
-  ASSERT_EQ(place, place1);
-}
-
-}  // namespace core
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/lite.map b/lite/core/lite.map
deleted file mode 100644
index 31adae4219..0000000000
--- a/lite/core/lite.map
+++ /dev/null
@@ -1,6 +0,0 @@
-{
-  global:
-    *paddle*;
-  local:
-    *;
-};
diff --git a/lite/core/lite_gtest_main.cc b/lite/core/lite_gtest_main.cc
deleted file mode 100644
index 9784fc7994..0000000000
--- a/lite/core/lite_gtest_main.cc
+++ /dev/null
@@ -1,23 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
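Editor's sketch (not part of the original patch): in production code Launch()
is the entry point, and PrepareForRun() fires exactly once before the first
Run(). A hedged sketch of a kernel using that hook; HypotheticalHostKernel and
its use of FcParam are made-up for illustration.

  #include "lite/core/kernel.h"

  namespace paddle {
  namespace lite {

  class HypotheticalHostKernel
      : public KernelLite<TARGET(kHost), PRECISION(kFloat)> {
   public:
    void PrepareForRun() override {
      // Runs once, after SetParam/SetContext: pre-compute packed weights,
      // workspace sizes, etc., so Run() stays on the fast path.
    }
    void Run() override {
      // Per-invocation compute; Param<operators::FcParam>() is valid here.
    }
  };

  }  // namespace lite
  }  // namespace paddle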
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <gflags/gflags.h>
-#include <gtest/gtest.h>
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  google::ParseCommandLineFlags(&argc, &argv, false);
-
-  return RUN_ALL_TESTS();
-}
diff --git a/lite/core/lite_tensor_test.cc b/lite/core/lite_tensor_test.cc
deleted file mode 100644
index d667a9f885..0000000000
--- a/lite/core/lite_tensor_test.cc
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <gtest/gtest.h>
-#include "lite/core/tensor.h"
-
-namespace paddle {
-namespace lite {
-
-TEST(tensor, test) {
-  TensorLite tensor;
-  DDimLite ddim({1, 8});
-  tensor.Resize(ddim);
-
-  for (int i = 0; i < 8; i++) {
-    tensor.mutable_data<int>()[i] = i;
-  }
-}
-
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/memory.cc b/lite/core/memory.cc
deleted file mode 100644
index 463e10b9f9..0000000000
--- a/lite/core/memory.cc
+++ /dev/null
@@ -1,109 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
- -#include "lite/core/memory.h" - -namespace paddle { -namespace lite { - -void* TargetMalloc(TargetType target, size_t size) { - void* data{nullptr}; - switch (target) { - case TargetType::kHost: - case TargetType::kX86: - case TargetType::kARM: - data = TargetWrapper::Malloc(size); - break; -#ifdef LITE_WITH_CUDA - case TargetType::kCUDA: - data = TargetWrapper::Malloc(size); - break; -#endif // LITE_WITH_CUDA -#ifdef LITE_WITH_OPENCL - case TargetType::kOpenCL: - data = TargetWrapperCL::Malloc(size); - break; -#endif // LITE_WITH_OPENCL -#ifdef LITE_WITH_FPGA - case TargetType::kFPGA: - data = TargetWrapper::Malloc(size); - break; -#endif // LITE_WITH_OPENCL - default: - LOG(FATAL) << "Unknown supported target " << TargetToStr(target); - } - return data; -} - -void TargetFree(TargetType target, void* data) { - switch (target) { - case TargetType::kHost: - case TargetType::kX86: - case TargetType::kARM: - TargetWrapper::Free(data); - break; - -#ifdef LITE_WITH_CUDA - case TargetType::kCUDA: - TargetWrapper::Free(data); - break; -#endif // LITE_WITH_CUDA -#ifdef LITE_WITH_OPENCL - case TargetType::kOpenCL: - TargetWrapperCL::Free(data); - break; -#endif // LITE_WITH_OPENCL -#ifdef LITE_WITH_FPGA - case TargetType::kFPGA: - TargetWrapper::Free(data); - break; -#endif // LITE_WITH_CUDA - default: - LOG(FATAL) << "Unknown type"; - } -} - -void TargetCopy(TargetType target, void* dst, const void* src, size_t size) { - switch (target) { - case TargetType::kHost: - case TargetType::kX86: - case TargetType::kARM: - TargetWrapper::MemcpySync( - dst, src, size, IoDirection::DtoD); - break; - -#ifdef LITE_WITH_CUDA - case TargetType::kCUDA: - TargetWrapper::MemcpySync( - dst, src, size, IoDirection::DtoD); - break; -#endif -#ifdef LITE_WITH_FPGA - case TargetType::kFPGA: - TargetWrapper::MemcpySync( - dst, src, size, IoDirection::DtoD); - break; -#endif -#ifdef LITE_WITH_OPENCL - case TargetType::kOpenCL: - TargetWrapperCL::MemcpySync(dst, src, size, IoDirection::DtoD); - break; -#endif // LITE_WITH_OPENCL - default: - LOG(FATAL) << "unsupported type"; - } -} - -} // namespace lite -} // namespace paddle diff --git a/lite/core/memory.h b/lite/core/memory.h deleted file mode 100644 index 31d7fd34e1..0000000000 --- a/lite/core/memory.h +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include "lite/api/paddle_place.h" -#include "lite/core/target_wrapper.h" -#include "lite/utils/macros.h" - -#ifdef LITE_WITH_OPENCL -#include "lite/backends/opencl/target_wrapper.h" -#endif // LITE_WITH_OPENCL - -#ifdef LITE_WITH_CUDA -#include "lite/backends/cuda/target_wrapper.h" -#endif // LITE_WITH_CUDA - -namespace paddle { -namespace lite { - -// Malloc memory for a specific Target. All the targets should be an element in -// the `switch` here. -LITE_API void* TargetMalloc(TargetType target, size_t size); - -// Free memory for a specific Target. 
-
-template <TargetType Target>
-void CopySync(void* dst, const void* src, size_t size, IoDirection dir) {
-  switch (Target) {
-    case TARGET(kX86):
-    case TARGET(kHost):
-    case TARGET(kARM):
-      TargetWrapper<TARGET(kHost)>::MemcpySync(
-          dst, src, size, IoDirection::HtoH);
-      break;
-#ifdef LITE_WITH_CUDA
-    case TARGET(kCUDA):
-      TargetWrapperCuda::MemcpySync(dst, src, size, dir);
-      break;
-#endif
-#ifdef LITE_WITH_OPENCL
-    case TARGET(kOpenCL):
-      TargetWrapperCL::MemcpySync(dst, src, size, dir);
-      break;
-#endif  // LITE_WITH_OPENCL
-#ifdef LITE_WITH_FPGA
-    case TARGET(kFPGA):
-      TargetWrapper<TARGET(kFPGA)>::MemcpySync(dst, src, size, dir);
-      break;
-#endif
-  }
-}
-
-// Memory buffer manager.
-class Buffer {
- public:
-  Buffer() = default;
-  Buffer(TargetType target, size_t size) : space_(size), target_(target) {}
-
-  void* data() const { return data_; }
-  TargetType target() const { return target_; }
-  size_t space() const { return space_; }
-
-  void ResetLazy(TargetType target, size_t size) {
-    if (target != target_ || space_ < size) {
-      Free();
-      data_ = TargetMalloc(target, size);
-      target_ = target;
-      space_ = size;
-    }
-  }
-
-  void ResizeLazy(size_t size) { ResetLazy(target_, size); }
-
-  void Free() {
-    if (space_ > 0) {
-      TargetFree(target_, data_);
-    }
-    target_ = TargetType::kHost;
-    space_ = 0;
-  }
-
-  void CopyDataFrom(const Buffer& other, size_t nbytes) {
-    target_ = other.target_;
-    ResizeLazy(nbytes);
-    // TODO(Superjomn) support copy between different targets.
-    TargetCopy(target_, data_, other.data_, nbytes);
-  }
-
-  ~Buffer() { Free(); }
-
- private:
-  // The size of the memory actually allocated.
-  size_t space_{0};
-  void* data_{nullptr};
-  TargetType target_{TargetType::kHost};
-};
-
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/memory_test.cc b/lite/core/memory_test.cc
deleted file mode 100644
index cd9062afca..0000000000
--- a/lite/core/memory_test.cc
+++ /dev/null
@@ -1,34 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
- -#include "lite/core/memory.h" -#include - -namespace paddle { -namespace lite { - -TEST(memory, test) { - auto* buf = TargetMalloc(TARGET(kX86), 10); - ASSERT_TRUE(buf); - TargetFree(TARGET(kX86), buf); - -#ifdef LITE_WITH_CUDA - auto* buf_cuda = TargetMalloc(TARGET(kCUDA), 10); - ASSERT_TRUE(buf_cuda); - TargetFree(TARGET(kCUDA), buf_cuda); -#endif -} - -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/CMakeLists.txt b/lite/core/mir/CMakeLists.txt deleted file mode 100644 index 6dfc2bd295..0000000000 --- a/lite/core/mir/CMakeLists.txt +++ /dev/null @@ -1,109 +0,0 @@ -lite_cc_library(mir_node SRCS node.cc DEPS kernel) -lite_cc_library(mir_ssa_graph SRCS ssa_graph.cc DEPS mir_node program) -lite_cc_library(mir_pass SRCS pass.cc DEPS mir_ssa_graph) -lite_cc_library(mir_pass_manager SRCS pass_manager.cc DEPS mir_pass mir_ssa_graph mir_passes) -lite_cc_library(mir_pass_registry SRCS pass_registry.cc DEPS mir_pass_manager) - -add_subdirectory(fusion) -add_subdirectory(elimination) -add_subdirectory(subgraph) - -lite_cc_library(mir_passes - SRCS - fusion/fc_fuse_pass.cc - fusion/shuffle_channel_fuse_pass.cc - fusion/transpose_softmax_transpose_fuse_pass.cc - fusion/interpolate_fuse_pass.cc - fusion/conv_elementwise_fuse_pass.cc - fusion/conv_activation_fuse_pass.cc - fusion/conv_bn_fuse_pass.cc - fusion/elementwise_add_activation_fuse_pass.cc - fusion/quant_dequant_fuse_pass.cc - elimination/identity_scale_eliminate_pass.cc - static_kernel_pick_pass.cc - variable_place_inference_pass.cc - type_target_cast_pass.cc - type_layout_cast_pass.cc - type_precision_cast_pass.cc - io_copy_kernel_pick_pass.cc - graph_visualize_pass.cc - generate_program_pass.cc - argument_type_display_pass.cc - demo_pass.cc - runtime_context_assign_pass.cc - DEPS mir_pass types context ${mir_fusers} ${subgraph_passes}) - -# lite_cc_test(test_ssa_graph SRCS ssa_graph_test.cc DEPS - #mir_ssa_graph scope op - #fc_op - #${host_kernels} - #mir_passes - #mir_pass_manager - #program_fake_utils - #) -# lite_cc_test(test_variable_place_infrence_pass SRCS variable_place_inference_pass_test.cc -# DEPS -# mul_op -# feed_op -# fetch_op -# io_copy_op -# ${host_kernels} -# mir_passes -# mir_pass_manager -# optimizer -# program_fake_utils -# target_wrapper_host -# PROFILE_DEPS basic_profiler -# CUDA_DEPS target_wrapper_cuda kernels_cuda -# ARM_DEPS mul_compute_arm -# X86_DEPS mul_compute_x86 -# ) - -set(pattern_deps mir_node mir_ssa_graph op) -if (WITH_TESTING) - list(APPEND pattern_deps gtest) -endif() -lite_cc_library(pattern_matcher SRCS pattern_matcher.cc DEPS ${pattern_deps}) -lite_cc_test(test_pattern_matcher SRCS pattern_matcher_test.cc DEPS pattern_matcher) - -lite_cc_library(pattern_matcher_high_api SRCS pattern_matcher_high_api.cc DEPS pattern_matcher) - - -# for mobile, unnecessary to compile the following testings. -if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) - return() -endif() -lite_cc_test(test_mir_pass_manager SRCS pass_manager_test.cc DEPS mir_pass_manager mir_passes) - - -# TODO(wz) replace framework/proto to lite proto. -if (NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) - # it depends on the fluid/framework/proto, that is too heavy for mobile execution. - # TODO(wz) enable it latter. 
- # lite_cc_test(test_pattern_matcher_high_api SRCS pattern_matcher_high_api_test.cc DEPS - # pattern_matcher_high_api proto_desc mir_pass_manager fc_op mul_op elementwise_ops - # mir_passes compatible_pb program ${ops}) -endif() - -message(STATUS "----> Ops lite: ${ops}") -message(STATUS "----> Host kernels: ${host_kernels}") -message(STATUS "----> X86 kernels: ${x86_kernels}") - -# lite_cc_test(test_lite_fc_fuse SRCS fusion/fc_fuse_pass_test.cc -# DEPS cxx_api mir_passes -# ${ops} ${host_kernels} ${x86_kernels} ${arm_kernels} -# ARGS --model_dir=${LITE_MODEL_DIR}/lite_fc_model -# --optimized_model=${LITE_MODEL_DIR}/lite_fc_model_opt SERIAL) - -# lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "lite_fc_model.tar.gz") -# add_dependencies(test_lite_fc_fuse extern_lite_download_lite_fc_model_tar_gz) - - -# lite_cc_test(test_lite_conv_elementwise_add_activation_fuse -# SRCS fusion/conv_elementwise_add_activation_fuse_pass_test.cc -# DEPS cxx_api mir_passes -# ${ops} ${host_kernels} ${x86_kernels}) -# lite_cc_test(test_lite_elementwise_add_activation_fuse -# SRCS fusion/elementwise_add_activation_fuse_pass_test.cc -# DEPS cxx_api mir_passes -# ${ops} ${host_kernels} ${x86_kernels}) diff --git a/lite/core/mir/argument_type_display_pass.cc b/lite/core/mir/argument_type_display_pass.cc deleted file mode 100644 index ea44245225..0000000000 --- a/lite/core/mir/argument_type_display_pass.cc +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/mir/pass.h" -#include "lite/core/mir/pass_registry.h" - -namespace paddle { -namespace lite { -namespace mir { - -class ArgumentTypeDisplayPass : public DebugPass { - public: - void Apply(const std::unique_ptr& graph) override { - VLOG(3) << "== Argument types =="; - for (auto& node : graph->mutable_nodes()) { - if (!node.IsArg()) continue; - - auto* type = node.AsArg().type; - if (type) { - VLOG(3) << "* ARG " << node.AsArg().name << " type: " << *type; - } else { - VLOG(3) << "* ARG " << node.AsArg().name << " type: UNK"; - } - } - VLOG(3) << "---------------------"; - } -}; - -} // namespace mir -} // namespace lite -} // namespace paddle - -REGISTER_MIR_PASS(argument_type_display_pass, - paddle::lite::mir::ArgumentTypeDisplayPass) - .SetTargets({TARGET(kAny)}); diff --git a/lite/core/mir/demo_pass.cc b/lite/core/mir/demo_pass.cc deleted file mode 100644 index b92a2b0751..0000000000 --- a/lite/core/mir/demo_pass.cc +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/mir/pass.h" -#include "lite/core/mir/pass_registry.h" - -namespace paddle { -namespace lite { -namespace mir { - -class DemoPass : public mir::DebugPass { - public: - void Apply(const std::unique_ptr &graph) override {} -}; - -/* -bool RegisterDemoPass() { - return PassManager::Global().AddNewPass("demo", new DemoPass); -} - */ - -} // namespace mir -} // namespace lite -} // namespace paddle - -REGISTER_MIR_PASS(demo, paddle::lite::mir::DemoPass).SetTargets({TARGET(kAny)}); diff --git a/lite/core/mir/dot.h b/lite/core/mir/dot.h deleted file mode 100644 index df70565c07..0000000000 --- a/lite/core/mir/dot.h +++ /dev/null @@ -1,167 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -/* - * This file implements some helper classes and methods for DOT programming - * support. It will give a visualization of the graph and that helps to debug - * the logics of each Pass. - */ -#pragma once - -#include -#include -#include -#include "lite/utils/cp_logging.h" -#include "lite/utils/replace_stl/stream.h" -#include "lite/utils/string.h" - -namespace paddle { -namespace inference { -namespace analysis { - -static size_t dot_node_counter{0}; - -/* - * A Dot template that helps to build a DOT graph definition. - */ -class Dot { - public: - struct Attr { - std::string key; - std::string value; - - Attr(const std::string& key, const std::string& value) - : key(key), value(value) {} - - std::string repr() const { - STL::stringstream ss; - ss << key << "=" << '"' << value << '"'; - return ss.str(); - } - }; - - struct Node { - std::string name; - std::vector attrs; - - Node(const std::string& name, const std::vector& attrs) - : name(name), attrs(attrs) { - STL::stringstream ss; - ss << "node_" << dot_node_counter++; - id_ = ss.str(); - } - - std::string id() const { return id_; } - - std::string repr() const { - STL::stringstream ss; - CHECK(!name.empty()); - ss << id_; - if (attrs.empty()) { - ss << "[label=" << '"' << name << '"' << "]"; - return ss.str(); - } - for (size_t i = 0; i < attrs.size(); i++) { - if (i == 0) { - ss << "[label=" << '"' << name << '"' << " "; - } - ss << attrs[i].repr(); - ss << ((i < attrs.size() - 1) ? 
" " : "]"); - } - return ss.str(); - } - - private: - std::string id_; - }; - - struct Edge { - std::string source; - std::string target; - std::vector attrs; - - Edge(const std::string& source, - const std::string& target, - const std::vector& attrs) - : source(source), target(target), attrs(attrs) {} - - std::string repr() const { - STL::stringstream ss; - CHECK(!source.empty()); - CHECK(!target.empty()); - ss << source << "->" << target; - for (size_t i = 0; i < attrs.size(); i++) { - if (i == 0) { - ss << "["; - } - ss << attrs[i].repr(); - ss << ((i < attrs.size() - 1) ? " " : "]"); - } - return ss.str(); - } - }; - - Dot() = default; - - explicit Dot(const std::vector& attrs) : attrs_(attrs) {} - - void AddNode(const std::string& id, - const std::vector& attrs, - std::string label = "") { - CHECK(!nodes_.count(id)) << "duplicate Node '" << id << "'"; - if (label.empty()) label = id; - nodes_.emplace(id, Node{label, attrs}); - } - - void AddEdge(const std::string& source, - const std::string& target, - const std::vector& attrs) { - CHECK(!source.empty()); - CHECK(!target.empty()); - auto sid = nodes_.at(source).id(); - auto tid = nodes_.at(target).id(); - edges_.emplace_back(sid, tid, attrs); - } - - // Compile to DOT language codes. - std::string Build() const { - STL::stringstream ss; - const std::string indent = " "; - ss << "digraph G {" << '\n'; - - // Add graph attrs - for (const auto& attr : attrs_) { - ss << indent << attr.repr() << '\n'; - } - // add nodes - for (auto& item : nodes_) { - ss << indent << item.second.repr() << '\n'; - } - // add edges - for (auto& edge : edges_) { - ss << indent << edge.repr() << '\n'; - } - ss << "} // end G"; - return ss.str(); - } - - private: - std::unordered_map nodes_; - std::vector edges_; - std::vector attrs_; -}; - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/lite/core/mir/elimination/CMakeLists.txt b/lite/core/mir/elimination/CMakeLists.txt deleted file mode 100644 index 9b6598630b..0000000000 --- a/lite/core/mir/elimination/CMakeLists.txt +++ /dev/null @@ -1,10 +0,0 @@ -if (NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) - # NOTE disabled for the proto_desc is not valid yet. - # TODO(Superjomn) enable them if valid latter. - # lite_cc_test(test_identity_scale_eliminate_pass - # SRCS identity_scale_eliminate_pass_test.cc - # DEPS mir_passes program proto_desc cpp_op_desc - # ${ops} - # ) -endif() - diff --git a/lite/core/mir/elimination/identity_scale_eliminate_pass.cc b/lite/core/mir/elimination/identity_scale_eliminate_pass.cc deleted file mode 100644 index 00290937b2..0000000000 --- a/lite/core/mir/elimination/identity_scale_eliminate_pass.cc +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/core/mir/pass.h" -#include "lite/core/mir/pass_registry.h" -#include "lite/core/mir/pattern_matcher_high_api.h" - -namespace paddle { -namespace lite { -namespace mir { - -namespace { - -class Eliminator : public FuseBase { - public: - void BuildPattern() override { - auto* pre_op = OpNode("preop"); // the previous op's output need update - // TODO(Superjomn) check has only one output - auto* x = VarNode("x")->assert_is_op_input("scale", "X"); - auto* scale_op = OpNode("scale", "scale") - ->assert_op_attr("scale", 1.) - ->assert_op_attr("bias", 0.); - auto* out = VarNode("out")->assert_is_op_output("scale", "Out"); - - *pre_op >> *x >> *scale_op >> *out; - - // The pre_op will be eliminated, and a new output-updated op will insert. - x->AsIntermediate(); // x is pre_op's output, need to update - } - - private: - void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { - auto& pre_op = matched.at("preop")->AsStmt(); - auto op_info = *pre_op.op_info(); - - op_info.UpdateAllOutputs(matched.at("x")->AsArg().name, - matched.at("out")->AsArg().name); - pre_op.ResetOp(op_info, graph->valid_places()); - - GraphSafeRemoveNodes(graph, {matched.at("scale")}); - - IR_NODE_LINK_TO(matched.at("preop"), matched.at("out")); - } -}; - -} // namespace - -class IdentityScaleEliminatePass : public ProgramPass { - public: - void Apply(const std::unique_ptr& graph) override { - Eliminator eliminator; - eliminator(graph.get()); - } -}; - -} // namespace mir -} // namespace lite -} // namespace paddle - -REGISTER_MIR_PASS(identity_scale_eliminate_pass, - paddle::lite::mir::IdentityScaleEliminatePass) - .SetTargets({TARGET(kAny)}); diff --git a/lite/core/mir/elimination/identity_scale_eliminate_pass_test.cc b/lite/core/mir/elimination/identity_scale_eliminate_pass_test.cc deleted file mode 100644 index 7130a13c47..0000000000 --- a/lite/core/mir/elimination/identity_scale_eliminate_pass_test.cc +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-
-#include <gtest/gtest.h>
-#include "lite/core/mir/graph_visualize_pass.h"
-#include "lite/core/mir/pass_registry.h"
-#include "lite/core/mir/ssa_graph.h"
-#include "paddle/fluid/framework/program_desc.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-
-std::unique_ptr<SSAGraph> BuildGraph(framework::ProgramDesc* program_desc,
-                                     const std::shared_ptr<Scope>& scope,
-                                     const std::vector<Place>& valid_places) {
-  // Op list:
-  //   (x)->feed -> (feed) -> scale -> (scale_out) -> fetch->(fetch)
-  // After the pass:
-  //   (x)->feed->(scale_out)->fetch->(fetch)
-  auto* main_block = program_desc->MutableBlock(0);
-  auto* feed_op = main_block->AppendOp();
-  auto* scale_op = main_block->AppendOp();
-  auto* fetch_op = main_block->AppendOp();
-  main_block->Var("x");
-  main_block->Var("feed");
-  main_block->Var("scale_out");
-  main_block->Var("fetch_out");
-
-  scope->Var("x")->GetMutable<lite::Tensor>();
-  scope->Var("feed")->GetMutable<lite::Tensor>();
-  scope->Var("scale_out")->GetMutable<lite::Tensor>();
-  scope->Var("fetch_out")->GetMutable<lite::Tensor>();
-
-  feed_op->SetType("feed");
-  feed_op->SetInput("X", {"x"});
-  feed_op->SetAttr("col", 1);
-  feed_op->SetOutput("Out", {"feed"});
-
-  scale_op->SetType("scale");
-  scale_op->SetInput("X", {"feed"});
-  scale_op->SetOutput("Out", {"scale_out"});
-  scale_op->SetAttr("scale", 1.f);
-  scale_op->SetAttr("bias", 0.f);
-  scale_op->SetAttr("bias_after_scale", true);
-
-  fetch_op->SetType("fetch");
-  fetch_op->SetInput("X", {"scale_out"});
-  fetch_op->SetOutput("Out", {"fetch"});
-  fetch_op->SetAttr("col", 1);
-
-  program_desc->Flush();
-
-  lite::Program program(*program_desc->Proto(), scope, valid_places);
-  auto graph = std::unique_ptr<SSAGraph>(new SSAGraph());
-  graph->Build(program, valid_places);
-
-  VLOG(5) << Visualize(graph.get());
-
-  return graph;
-}
-
-TEST(identity_test, test) {
-  framework::ProgramDesc program_desc;
-  std::vector<Place> places{{TARGET(kHost), PRECISION(kFloat)}};
-  auto scope = std::make_shared<Scope>();
-  auto graph = BuildGraph(&program_desc, scope, places);
-  const int num_nodes = graph->nodes().size();
-  auto pass = PassManager::Global().LookUp("identity_scale_eliminate_pass");
-  ASSERT_TRUE(pass);
-  pass->Apply(graph);
-  ASSERT_EQ(graph->nodes().size(), num_nodes - 2UL);
-}
-
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
-
-USE_LITE_OP(feed)
-USE_LITE_OP(fetch)
-USE_LITE_OP(scale)
-USE_MIR_PASS(identity_scale_eliminate_pass)
diff --git a/lite/core/mir/fusion/CMakeLists.txt b/lite/core/mir/fusion/CMakeLists.txt
deleted file mode 100644
index 5ac5283755..0000000000
--- a/lite/core/mir/fusion/CMakeLists.txt
+++ /dev/null
@@ -1,48 +0,0 @@
-lite_cc_library(fuse_fc
-    SRCS fc_fuser.cc
-    DEPS pattern_matcher_high_api)
-lite_cc_library(fuse_shuffle_channel
-    SRCS shuffle_channel_fuser.cc
-    DEPS pattern_matcher_high_api)
-lite_cc_library(fuse_conv_elementwise
-    SRCS conv_elementwise_fuser.cc
-    DEPS pattern_matcher_high_api)
-lite_cc_library(fuse_conv_activation
-    SRCS conv_activation_fuser.cc
-    DEPS pattern_matcher_high_api)
-lite_cc_library(fuse_conv_bn
-    SRCS conv_bn_fuser.cc
-    DEPS pattern_matcher_high_api)
-lite_cc_library(fuse_elementwise_add_activation
-    SRCS elementwise_add_activation_fuser.cc
-    DEPS pattern_matcher_high_api)
-lite_cc_library(fuse_quant_dequant
-    SRCS quant_dequant_op_fuser.cc
-    DEPS pattern_matcher_high_api)
-lite_cc_library(fuse_transpose_softmax_transpose
-    SRCS transpose_softmax_transpose_fuser.cc
-    DEPS pattern_matcher_high_api)
-lite_cc_library(fuse_interpolate
-    SRCS interpolate_fuser.cc
-    DEPS pattern_matcher_high_api)
-
-set(mir_fusers
-    fuse_fc
-    fuse_shuffle_channel
-    fuse_conv_elementwise
-    fuse_conv_activation
-    fuse_conv_bn
-    fuse_quant_dequant
-    fuse_elementwise_add_activation
-    fuse_transpose_softmax_transpose
-    fuse_interpolate
-    CACHE INTERNAL "fusers")
-
-if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
-  return()
-endif()
-
-# TODO(Superjomn) Enable it later.
-# NOTE disabled because the proto_desc is not valid yet.
-# lite_cc_test(test_lite_conv_bn_fuse SRCS conv_bn_fuse_pass_test.cc
-#   DEPS elementwise_ops batch_norm_op conv_op proto_desc compatible_pb program mir_pass mir_pass_manager pattern_matcher_high_api)
diff --git a/lite/core/mir/fusion/conv_activation_fuse_pass.cc b/lite/core/mir/fusion/conv_activation_fuse_pass.cc
deleted file mode 100644
index c6939e1983..0000000000
--- a/lite/core/mir/fusion/conv_activation_fuse_pass.cc
+++ /dev/null
@@ -1,42 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/core/mir/fusion/conv_activation_fuse_pass.h"
-#include <memory>
-#include <vector>
-#include "lite/core/mir/fusion/conv_activation_fuser.h"
-#include "lite/core/mir/pass_registry.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-
-void ConvActivationFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
-  for (auto conv_type : {"conv2d", "depthwise_conv2d"}) {
-    for (auto act_type : {"relu"}) {
-      for (auto has_bias : {true, false}) {
-        fusion::ConvActivationFuser fuser(conv_type, act_type, has_bias);
-        fuser(graph.get());
-      }
-    }
-  }
-}
-
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
-
-REGISTER_MIR_PASS(lite_conv_activation_fuse_pass,
-                  paddle::lite::mir::ConvActivationFusePass)
    .SetTargets({TARGET(kAny)});
diff --git a/lite/core/mir/fusion/conv_activation_fuse_pass.h b/lite/core/mir/fusion/conv_activation_fuse_pass.h
deleted file mode 100644
index e6f0f34be0..0000000000
--- a/lite/core/mir/fusion/conv_activation_fuse_pass.h
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include "lite/core/mir/pass.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-
-class ConvActivationFusePass : public ProgramPass {
- public:
-  void Apply(const std::unique_ptr<SSAGraph>& graph) override;
-};
-
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/mir/fusion/conv_activation_fuser.cc b/lite/core/mir/fusion/conv_activation_fuser.cc
deleted file mode 100644
index 8e18b368f4..0000000000
--- a/lite/core/mir/fusion/conv_activation_fuser.cc
+++ /dev/null
@@ -1,83 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/core/mir/fusion/conv_activation_fuser.h"
-#include <memory>
-#include <vector>
-
-namespace paddle {
-namespace lite {
-namespace mir {
-namespace fusion {
-
-void ConvActivationFuser::BuildPattern() {
-  // create nodes.
-  auto* input =
-      VarNode("input")->assert_is_op_input(conv_type_, "Input")->AsInput();
-  auto* filter =
-      VarNode("filter")->assert_is_op_input(conv_type_, "Filter")->AsInput();
-  PMNode* bias = nullptr;
-  if (has_bias_) {
-    bias = VarNode("bias")->assert_is_op_input(conv_type_, "Bias")->AsInput();
-  }
-  auto* conv2d = OpNode("conv2d", conv_type_)->AsIntermediate();
-
-  auto* act = OpNode("act", act_type_)->AsIntermediate();
-
-  auto* conv2d_out = VarNode("conv2d_out")
-                         ->assert_is_op_output(conv_type_, "Output")
-                         ->assert_is_op_input(act_type_, "X")
-                         ->AsIntermediate();
-
-  auto* out =
-      VarNode("output")->assert_is_op_output(act_type_, "Out")->AsOutput();
-
-  // create topology.
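  // Editorial note (not in the original file): the overloaded >> used below is
  // the pattern-matcher DSL; "inputs >> *op >> *output" declares directed
  // edges in the pattern graph. The chain built next therefore reads:
  // {filter, input} feed conv2d, conv2d produces conv2d_out, conv2d_out feeds
  // act, and act produces out. Nodes marked AsIntermediate() are removed from
  // the graph once the fused op is inserted.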
-  std::vector<PMNode*> conv2d_inputs{filter, input};
-  conv2d_inputs >> *conv2d >> *conv2d_out >> *act >> *out;
-  if (has_bias_) {
-    *bias >> *conv2d;
-  }
-}
-
-void ConvActivationFuser::InsertNewNode(SSAGraph* graph,
-                                        const key2nodes_t& matched) {
-  auto op_desc = GenOpDesc(matched);
-  auto conv_op = LiteOpRegistry::Global().Create(conv_type_);
-  auto conv_old = matched.at("conv2d")->stmt()->op();
-  auto* scope = conv_old->scope();
-  auto& valid_places = conv_old->valid_places();
-  conv_op->Attach(op_desc, scope);
-
-  auto* new_op_node = graph->GraphCreateInstructNode(conv_op, valid_places);
-
-  IR_NODE_LINK_TO(matched.at("input"), new_op_node);
-  IR_NODE_LINK_TO(matched.at("filter"), new_op_node);
-  if (has_bias_) {
-    IR_NODE_LINK_TO(matched.at("bias"), new_op_node);
-  }
-  IR_NODE_LINK_TO(new_op_node, matched.at("output"));
-}
-
-cpp::OpDesc ConvActivationFuser::GenOpDesc(const key2nodes_t& matched) {
-  cpp::OpDesc op_desc = *matched.at("conv2d")->stmt()->op_info();
-  op_desc.SetOutput("Output", {matched.at("output")->arg()->name});
-  op_desc.SetAttr("fuse_relu", true);
-  return op_desc;
-}
-
-}  // namespace fusion
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/mir/fusion/conv_activation_fuser.h b/lite/core/mir/fusion/conv_activation_fuser.h
deleted file mode 100644
index 0d09c9dce2..0000000000
--- a/lite/core/mir/fusion/conv_activation_fuser.h
+++ /dev/null
@@ -1,50 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include "lite/core/mir/pattern_matcher_high_api.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-namespace fusion {
-
-class ConvActivationFuser : public FuseBase {
- public:
-  explicit ConvActivationFuser(const std::string& conv_type,
-                               const std::string& act_type,
-                               bool has_bias) {
-    CHECK(act_type == "relu") << "Only the relu activation is supported now";
-    conv_type_ = conv_type;
-    act_type_ = act_type;
-    has_bias_ = has_bias;
-  }
-
-  void BuildPattern() override;
-  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
-
- private:
-  cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
-  std::string conv_type_;
-  std::string act_type_;
-  bool has_bias_;
-};
-
-}  // namespace fusion
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/mir/fusion/conv_bn_fuse_pass.cc b/lite/core/mir/fusion/conv_bn_fuse_pass.cc
deleted file mode 100644
index 2e962017bc..0000000000
--- a/lite/core/mir/fusion/conv_bn_fuse_pass.cc
+++ /dev/null
@@ -1,38 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/core/mir/fusion/conv_bn_fuse_pass.h"
-#include <memory>
-#include <vector>
-#include "lite/core/mir/fusion/conv_bn_fuser.h"
-#include "lite/core/mir/pass_registry.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-
-void ConvBNFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
-  fusion::ConvBNFuser fuser("conv2d");
-  fuser(graph.get());
-
-  fusion::ConvBNFuser fuser2("depthwise_conv2d");
-  fuser2(graph.get());
-}
-
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
-
-REGISTER_MIR_PASS(lite_conv_bn_fuse_pass, paddle::lite::mir::ConvBNFusePass)
    .SetTargets({TARGET(kAny)});
diff --git a/lite/core/mir/fusion/conv_bn_fuse_pass.h b/lite/core/mir/fusion/conv_bn_fuse_pass.h
deleted file mode 100644
index b2c56d1802..0000000000
--- a/lite/core/mir/fusion/conv_bn_fuse_pass.h
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include "lite/core/mir/pass.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-
-class ConvBNFusePass : public ProgramPass {
- public:
-  void Apply(const std::unique_ptr<SSAGraph>& graph) override;
-};
-
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/mir/fusion/conv_bn_fuse_pass_test.cc b/lite/core/mir/fusion/conv_bn_fuse_pass_test.cc
deleted file mode 100644
index 7e720bcc3d..0000000000
--- a/lite/core/mir/fusion/conv_bn_fuse_pass_test.cc
+++ /dev/null
@@ -1,140 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
- -#include "lite/core/mir/fusion/conv_bn_fuse_pass.h" -#include -#include -#include -#include "lite/core/mir/graph_visualize_pass.h" -#include "lite/core/program.h" -#include "lite/core/tensor.h" -#include "paddle/fluid/framework/program_desc.h" - -namespace paddle { -namespace lite { -namespace mir { -namespace fusion { - -std::unique_ptr BuildGraph(framework::ProgramDesc* program_desc, - const std::shared_ptr& scope, - const std::vector& valid_places) { - auto* main_block = program_desc->MutableBlock(0); - auto* conv_op = main_block->AppendOp(); - auto* bn_op = main_block->AppendOp(); - main_block->Var("conv_i"); - main_block->Var("conv_param"); - main_block->Var("conv_out"); - - main_block->Var("bn_scale"); - main_block->Var("bn_bias"); - main_block->Var("bn_mean"); - main_block->Var("bn_var"); - main_block->Var("bn_out"); - main_block->Var("bn_mean_out"); - main_block->Var("bn_var_out"); - main_block->Var("bn_saved_mean"); - main_block->Var("bn_saved_var"); - - scope->Var("conv_i")->GetMutable(); - auto conv_param_t = scope->Var("conv_param")->GetMutable(); - std::vector conv_param_shape = {3, 1, 2, 2}; - conv_param_t->Resize(lite::DDim(conv_param_shape)); - conv_param_t->mutable_data(); - scope->Var("conv_out")->GetMutable(); - auto bn_scale_t = scope->Var("bn_scale")->GetMutable(); - std::vector bn_scale_shape = {3}; - bn_scale_t->Resize(lite::DDim(bn_scale_shape)); - bn_scale_t->mutable_data(); - - auto bn_bias_t = scope->Var("bn_bias")->GetMutable(); - std::vector bn_bias_shape = {3}; - bn_bias_t->Resize(lite::DDim(bn_bias_shape)); - bn_bias_t->mutable_data(); - - auto bn_mean_t = scope->Var("bn_mean")->GetMutable(); - bn_mean_t->Resize(lite::DDim(bn_bias_shape)); - bn_mean_t->mutable_data(); - - auto bn_var_t = scope->Var("bn_var")->GetMutable(); - bn_var_t->Resize(lite::DDim(bn_bias_shape)); - bn_var_t->mutable_data(); - - scope->Var("bn_out")->GetMutable(); - scope->Var("bn_mean_out")->GetMutable(); - scope->Var("bn_var_out")->GetMutable(); - scope->Var("bn_saved_mean")->GetMutable(); - scope->Var("bn_saved_var")->GetMutable(); - - conv_op->SetType("conv2d"); - conv_op->SetInput("Input", {"conv_i"}); - conv_op->SetInput("Filter", {"conv_param"}); - conv_op->SetOutput("Output", {"conv_out"}); - const std::vector strides({1, 1}); - const std::vector paddings({1, 1}); - const std::vector dilations({1, 1}); - const int groups = 1; - conv_op->SetAttr("strides", strides); - conv_op->SetAttr("paddings", paddings); - conv_op->SetAttr("dilations", dilations); - conv_op->SetAttr("groups", groups); - conv_op->SetAttr("fuse_relu", false); - - bn_op->SetType("batch_norm"); - bn_op->SetInput("X", {"conv_out"}); - bn_op->SetInput("Bias", {"bn_bias"}); - bn_op->SetInput("Mean", {"bn_mean"}); - bn_op->SetInput("Scale", {"bn_scale"}); - bn_op->SetInput("Variance", {"bn_var"}); - - bn_op->SetOutput("Y", {"bn_out"}); - bn_op->SetOutput("MeanOut", {"bn_mean_out"}); - bn_op->SetOutput("VarianceOut", {"bn_var_out"}); - bn_op->SetOutput("SavedMean", {"bn_saved_mean"}); - bn_op->SetOutput("SavedVariance", {"bn_saved_var"}); - float eps = 1e-5; - bn_op->SetAttr("epsilon", eps); - bn_op->SetAttr("is_test", static_cast(1)); - bn_op->SetAttr("use_global_stats", false); - bn_op->SetAttr("momentum", 0.9f); - bn_op->SetAttr("data_layout", std::string("NCHW")); - - program_desc->Flush(); - - lite::Program program(*program_desc->Proto(), scope, valid_places); - auto graph = std::unique_ptr(new SSAGraph()); - graph->Build(program, valid_places); - - return graph; -} - -TEST(pattern_matcher2, test) { - 
framework::ProgramDesc program_desc;
-  std::vector<Place> places{{TARGET(kHost), PRECISION(kFloat)}};
-  auto scope = std::make_shared<Scope>();
-  auto graph = BuildGraph(&program_desc, scope, places);
-  const int num_nodes = graph->nodes().size();
-  auto* fuser = new ConvBNFusePass;
-  fuser->Apply(graph);
-  ASSERT_EQ(graph->nodes().size(),
-            num_nodes - 8UL /* nodes removed */ + 1UL /* eltwise_add node */);
-}
-
-}  // namespace fusion
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
-
-USE_LITE_OP(conv2d);
-USE_LITE_OP(batch_norm);
-USE_LITE_OP(elementwise_add);
diff --git a/lite/core/mir/fusion/conv_bn_fuser.cc b/lite/core/mir/fusion/conv_bn_fuser.cc
deleted file mode 100644
index 77ad8237fe..0000000000
--- a/lite/core/mir/fusion/conv_bn_fuser.cc
+++ /dev/null
@@ -1,163 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/core/mir/fusion/conv_bn_fuser.h"
-#include <memory>
-#include <vector>
-
-namespace paddle {
-namespace lite {
-namespace mir {
-namespace fusion {
-
-void ConvBNFuser::BuildPattern() {
-  auto* conv_input =
-      VarNode("conv_input")->assert_is_op_input(conv_type_, "Input")->AsInput();
-  auto* conv_weight = VarNode("conv_weight")
-                          ->assert_is_op_input(conv_type_, "Filter")
-                          ->AsInput();
-  auto* conv = OpNode("conv2d", conv_type_)->assert_is_op(conv_type_);
-  auto* conv_out = VarNode("conv_out")
-                       ->assert_is_op_output(conv_type_, "Output")
-                       ->assert_is_op_input("batch_norm", "X");
-
-  auto* bn_scale = VarNode("bn_scale")
-                       ->assert_is_op_input("batch_norm", "Scale")
-                       ->AsIntermediate();
-  auto* bn_bias =
-      VarNode("bn_bias")->assert_is_op_input("batch_norm", "Bias")->AsInput();
-  auto* bn_mean = VarNode("bn_mean")
-                      ->assert_is_op_input("batch_norm", "Mean")
-                      ->AsIntermediate();
-  auto* bn_var = VarNode("bn_variance")
-                     ->assert_is_op_input("batch_norm", "Variance")
-                     ->AsIntermediate();
-  auto* bn =
-      OpNode("bn", "batch_norm")->assert_is_op("batch_norm")->AsIntermediate();
-
-  auto* bn_out =
-      VarNode("bn_out")->assert_is_op_output("batch_norm", "Y")->AsOutput();
-  auto* bn_mean_out = VarNode("bn_mean_out")
-                          ->assert_is_op_output("batch_norm", "MeanOut")
-                          ->AsIntermediate();
-  auto* bn_var_out = VarNode("bn_var_out")
-                         ->assert_is_op_output("batch_norm", "VarianceOut")
-                         ->AsIntermediate();
-  auto* bn_saved_mean = VarNode("bn_saved_mean")
-                            ->assert_is_op_output("batch_norm", "SavedMean")
-                            ->AsIntermediate();
-  auto* bn_saved_var = VarNode("bn_saved_var")
-                           ->assert_is_op_output("batch_norm", "SavedVariance")
-                           ->AsIntermediate();
-
-  conv->LinksFrom({conv_input, conv_weight}).LinksTo({conv_out});
-
-  bn->LinksFrom({conv_out, bn_scale, bn_bias, bn_mean, bn_var})
-      .LinksTo({bn_out, bn_mean_out, bn_saved_mean, bn_saved_var, bn_var_out});
-}
-
-void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
-  auto op_desc = GenOpDesc(matched);
-  auto eltwise_op = LiteOpRegistry::Global().Create("elementwise_add");
-
-  auto conv_instruct = matched.at("conv2d")->stmt();
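  // Editorial note (not in the original file), summarizing the folding math
  // that ComputeAlphaAndBeta (declared in conv_bn_fuser.h) implements:
  //   BN(conv(x))_i = alpha_i * conv(x)_i + beta_i, where
  //   alpha_i = scale_i / sqrt(variance_i + epsilon) and
  //   beta_i  = -mean_i * alpha_i.
  // The code below scales row i of the conv filter (or its weight_scale in
  // the INT8 case) by alpha_i, folds beta_i into the BN bias, and replaces
  // the batch_norm op with a single elementwise_add of that bias.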
-  auto conv = conv_instruct->op();
-  auto* scope = conv->scope();
-  auto& valid_places = conv->valid_places();
-
-  auto conv_weight_t = scope->FindVar(matched.at("conv_weight")->arg()->name)
-                           ->GetMutable<lite::Tensor>();
-  auto conv_weight_dims = conv_weight_t->dims();
-  size_t weight_num = conv_weight_t->data_size();
-
-  auto bn_scale_t = scope->FindVar(matched.at("bn_scale")->arg()->name)
-                        ->GetMutable<lite::Tensor>();
-  size_t bias_size = bn_scale_t->data_size();
-  auto bn_scale_d = bn_scale_t->mutable_data<float>();
-  CHECK_EQ(bias_size, static_cast<size_t>(conv_weight_dims[0]))
-      << "The BN bias's size should be equal to the first dimension of the "
-      << "conv weights";
-
-  auto bn_mean_t = scope->FindVar(matched.at("bn_mean")->arg()->name)
-                       ->GetMutable<lite::Tensor>();
-  auto bn_mean_d = bn_mean_t->mutable_data<float>();
-
-  auto bn_var_t = scope->FindVar(matched.at("bn_variance")->arg()->name)
-                      ->GetMutable<lite::Tensor>();
-  auto bn_var_d = bn_var_t->mutable_data<float>();
-
-  auto bn_bias_t = scope->FindVar(matched.at("bn_bias")->arg()->name)
-                       ->GetMutable<lite::Tensor>();
-  auto bn_bias_d = bn_bias_t->mutable_data<float>();
-  auto eps = matched.at("bn")->stmt()->op_info()->GetAttr<float>("epsilon");
-
-  auto conv_op_desc = conv_instruct->mutable_op_info();
-
-  bool enable_int8 = conv_op_desc->HasAttr("enable_int8");
-  Tensor alpha_tensor, beta_tensor;
-  alpha_tensor.CopyDataFrom(*bn_bias_t);
-  beta_tensor.CopyDataFrom(*bn_bias_t);
-  auto alpha_data = alpha_tensor.mutable_data<float>();
-  auto beta_data = beta_tensor.mutable_data<float>();
-
-  int h = bias_size;
-  int w = weight_num / bias_size;
-  ComputeAlphaAndBeta(
-      bn_scale_d, bn_mean_d, bn_var_d, alpha_data, beta_data, eps, h, w);
-
-  if (enable_int8) {
-    PADDLE_ENFORCE(conv_op_desc->HasAttr("weight_scale"),
-                   "INT8 mode: the conv op should have a weight_scale attr");
-    auto weight_scale =
-        conv_op_desc->GetAttr<std::vector<float>>("weight_scale");
-    for (int i = 0; i < h; i++) {
-      weight_scale[i] *= alpha_data[i];
-    }
-    // An interface like this should be abandoned.
-    conv_op_desc->SetAttr("weight_scale", weight_scale);
-    auto update_conv_desc = *conv_instruct->mutable_op_info();
-    conv_instruct->ResetOp(update_conv_desc, graph->valid_places());
-  } else {
-    auto conv_weight_d = conv_weight_t->mutable_data<float>();
-    for (int i = 0; i < h; i++) {
-      for (int j = 0; j < w; j++) {
-        conv_weight_d[i * w + j] *= alpha_data[i];
-      }
-    }
-  }
-  for (int i = 0; i < bias_size; i++) {
-    bn_bias_d[i] += beta_data[i];
-  }
-  eltwise_op->Attach(op_desc, scope);
-  auto* new_op_node = graph->GraphCreateInstructNode(eltwise_op, valid_places);
-
-  IR_NODE_LINK_TO(matched.at("conv_out"), new_op_node);
-  IR_NODE_LINK_TO(matched.at("bn_bias"), new_op_node);
-  IR_NODE_LINK_TO(new_op_node, matched.at("bn_out"));
-}
-
-cpp::OpDesc ConvBNFuser::GenOpDesc(const key2nodes_t& matched) {
-  cpp::OpDesc op_desc;
-  op_desc.SetType("elementwise_add");
-  op_desc.SetInput("X", {matched.at("conv_out")->arg()->name});
-  op_desc.SetInput("Y", {matched.at("bn_bias")->arg()->name});
-  op_desc.SetOutput("Out", {matched.at("bn_out")->arg()->name});
-  op_desc.SetAttr("axis", 1);
-  return op_desc;
-}
-
-}  // namespace fusion
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/mir/fusion/conv_bn_fuser.h b/lite/core/mir/fusion/conv_bn_fuser.h
deleted file mode 100644
index 9acf65f9e2..0000000000
--- a/lite/core/mir/fusion/conv_bn_fuser.h
+++ /dev/null
@@ -1,58 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include "lite/core/mir/pattern_matcher_high_api.h"
-#include "lite/utils/paddle_enforce.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-namespace fusion {
-
-class ConvBNFuser : public FuseBase {
- public:
-  explicit ConvBNFuser(const std::string& conv_type) : conv_type_(conv_type) {}
-  void BuildPattern() override;
-  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
-
- private:
-  cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
-  void ComputeAlphaAndBeta(float* scale_d,
-                           float* mean_d,
-                           float* var_d,
-                           float* alpha,
-                           float* beta,
-                           float eps,
-                           int h,
-                           int w) {
-    for (int i = 0; i < h; i++) {
-      alpha[i] = scale_d[i] / std::sqrt(var_d[i] + eps);
-    }
-    for (int i = 0; i < h; i++) {
-      beta[i] = (-mean_d[i]) * alpha[i];
-    }
-  }
-
- private:
-  std::string conv_type_{"conv2d"};
-};
-
-}  // namespace fusion
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/mir/fusion/conv_elementwise_add_activation_fuse_pass_test.cc b/lite/core/mir/fusion/conv_elementwise_add_activation_fuse_pass_test.cc
deleted file mode 100644
index 59bf7035e7..0000000000
--- a/lite/core/mir/fusion/conv_elementwise_add_activation_fuse_pass_test.cc
+++ /dev/null
@@ -1,157 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
- -#include -#include -#include -#include "lite/api/cxx_api.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/core/mir/fusion/conv_activation_fuse_pass.h" -#include "lite/core/mir/fusion/conv_elementwise_fuse_pass.h" -#include "lite/core/mir/graph_visualize_pass.h" -#include "lite/core/op_registry.h" -#include "lite/core/program.h" -#include "lite/core/tensor.h" -#include "paddle/fluid/framework/program_desc.h" - -DEFINE_string(model_dir, "", ""); -DEFINE_string(optimized_model, "", ""); - -namespace paddle { -namespace lite { -namespace mir { -namespace fusion { - -std::unique_ptr BuildGraph(framework::ProgramDesc* program_desc, - const std::shared_ptr& scope, - const std::vector& valid_places) { - auto* main_block = program_desc->MutableBlock(0); - - auto* conv2d_1 = main_block->AppendOp(); - auto* conv2d_2 = main_block->AppendOp(); - auto* add_1 = main_block->AppendOp(); - auto* relu_1 = main_block->AppendOp(); - auto* add_2 = main_block->AppendOp(); - auto* relu_2 = main_block->AppendOp(); - - main_block->Var("input_1"); - main_block->Var("input_2"); - main_block->Var("filter_1"); - main_block->Var("filter_2"); - main_block->Var("conv2d_1_out"); - main_block->Var("conv2d_2_out"); - main_block->Var("bias_1"); - main_block->Var("add_1_out"); - main_block->Var("add_2_out"); - main_block->Var("relu_1_out"); - main_block->Var("out"); - - scope->Var("input_1")->GetMutable(); - scope->Var("input_2")->GetMutable(); - scope->Var("filter_1")->GetMutable(); - scope->Var("filter_2")->GetMutable(); - scope->Var("conv2d_1_out")->GetMutable(); - scope->Var("conv2d_2_out")->GetMutable(); - scope->Var("bias_1")->GetMutable(); - scope->Var("add_1_out")->GetMutable(); - scope->Var("add_2_out")->GetMutable(); - scope->Var("relu_1_out")->GetMutable(); - scope->Var("out")->GetMutable(); - - conv2d_1->SetType("conv2d"); - conv2d_1->SetInput("Input", {"input_1"}); - conv2d_1->SetInput("Filter", {"filter_1"}); - conv2d_1->SetOutput("Output", {"conv2d_1_out"}); - conv2d_1->SetAttr("strides", std::vector({1, 1})); - conv2d_1->SetAttr("paddings", std::vector({0, 0})); - conv2d_1->SetAttr("groups", 1); - conv2d_1->SetAttr("dilations", std::vector({1, 1})); - conv2d_1->SetAttr("fuse_relu", false); - - add_1->SetType("elementwise_add"); - add_1->SetInput("X", {"conv2d_1_out"}); - add_1->SetInput("Y", {"bias_1"}); - add_1->SetOutput("Out", {"add_1_out"}); - add_1->SetAttr("axis", 1); - - relu_1->SetType("relu"); - relu_1->SetInput("X", {"add_1_out"}); - relu_1->SetOutput("Out", {"relu_1_out"}); - - conv2d_2->SetType("conv2d"); - conv2d_2->SetInput("Input", {"input_2"}); - conv2d_2->SetInput("Filter", {"filter_2"}); - conv2d_2->SetOutput("Output", {"conv2d_2_out"}); - conv2d_2->SetAttr("strides", std::vector({1, 1})); - conv2d_2->SetAttr("paddings", std::vector({0, 0})); - conv2d_2->SetAttr("groups", 1); - conv2d_2->SetAttr("dilations", std::vector({1, 1})); - conv2d_2->SetAttr("fuse_relu", false); - - add_2->SetType("elementwise_add"); - add_2->SetInput("X", {"conv2d_2_out"}); - add_2->SetInput("Y", {"relu_1_out"}); - add_2->SetOutput("Out", {"add_2_out"}); - add_2->SetAttr("axis", 1); - - relu_2->SetType("relu"); - relu_2->SetInput("X", {"add_2_out"}); - relu_2->SetOutput("Out", {"out"}); - - program_desc->Flush(); - - lite::Program program(*program_desc->Proto(), scope, valid_places); - auto graph = std::unique_ptr(new SSAGraph()); - graph->Build(program, valid_places); - - return graph; -} - -TEST(conv_elementwise_add_relu_fuse_pass, graph_test) { - framework::ProgramDesc program_desc; - std::vector 
places{{TARGET(kHost), PRECISION(kFloat)}}; - auto scope = std::make_shared(); - auto graph = BuildGraph(&program_desc, scope, places); - - Visualize(graph.get()); - ASSERT_EQ(graph->nodes().size(), 11UL /*vars*/ + 6UL /*ops*/); - Visualize(graph.get()); -} - -TEST(conv_elementwise_add_relu_fuse_pass, fuse_test_op) { - framework::ProgramDesc program_desc; - std::vector places{{TARGET(kHost), PRECISION(kFloat)}}; - auto scope = std::make_shared(); - auto graph = BuildGraph(&program_desc, scope, places); - Visualize(graph.get()); - const int num_nodes = graph->nodes().size(); - auto* fuser_eltwise = new ConvElementwiseFusePass; - auto* fuser_act = new ConvActivationFusePass; - fuser_eltwise->Apply(graph); - fuser_act->Apply(graph); - - Visualize(graph.get()); - ASSERT_EQ(graph->nodes().size(), - num_nodes - 5UL * 2 /*nodes removed */ + 1UL * 2 /* fused nodes*/); -} - -} // namespace fusion -} // namespace mir -} // namespace lite -} // namespace paddle - -USE_LITE_OP(elementwise_add); -USE_LITE_OP(conv2d); -USE_LITE_OP(depthwise_conv2d); -USE_LITE_OP(relu); diff --git a/lite/core/mir/fusion/conv_elementwise_fuse_pass.cc b/lite/core/mir/fusion/conv_elementwise_fuse_pass.cc deleted file mode 100644 index 631c6b883e..0000000000 --- a/lite/core/mir/fusion/conv_elementwise_fuse_pass.cc +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/mir/fusion/conv_elementwise_fuse_pass.h" -#include -#include -#include "lite/core/mir/fusion/conv_elementwise_fuser.h" -#include "lite/core/mir/pass_registry.h" - -namespace paddle { -namespace lite { -namespace mir { - -void ConvElementwiseFusePass::Apply(const std::unique_ptr& graph) { - fusion::ConvElementwiseFuser fuser("conv2d"); - fuser(graph.get()); - - fusion::ConvElementwiseFuser depthwise_fuser("depthwise_conv2d"); - depthwise_fuser(graph.get()); - - fusion::ConvElementwiseFuser conv2d_transpose_fuser("conv2d_transpose"); - conv2d_transpose_fuser(graph.get()); -} - -} // namespace mir -} // namespace lite -} // namespace paddle - -REGISTER_MIR_PASS(lite_conv_elementwise_fuse_pass, - paddle::lite::mir::ConvElementwiseFusePass) - .SetTargets({TARGET(kAny)}); diff --git a/lite/core/mir/fusion/conv_elementwise_fuse_pass.h b/lite/core/mir/fusion/conv_elementwise_fuse_pass.h deleted file mode 100644 index 11953e9b10..0000000000 --- a/lite/core/mir/fusion/conv_elementwise_fuse_pass.h +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include "lite/core/mir/pass.h" - -namespace paddle { -namespace lite { -namespace mir { - -class ConvElementwiseFusePass : public ProgramPass { - public: - void Apply(const std::unique_ptr& graph) override; -}; - -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/fusion/conv_elementwise_fuser.cc b/lite/core/mir/fusion/conv_elementwise_fuser.cc deleted file mode 100644 index c3ab3e4c4c..0000000000 --- a/lite/core/mir/fusion/conv_elementwise_fuser.cc +++ /dev/null @@ -1,102 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/mir/fusion/conv_elementwise_fuser.h" -#include -#include - -namespace paddle { -namespace lite { -namespace mir { -namespace fusion { - -void ConvElementwiseFuser::BuildPattern() { - // create input nodes. - auto* input = - VarNode("input")->assert_is_op_input(conv_type_, "Input")->AsInput(); - auto* filter = - VarNode("filter")->assert_is_op_input(conv_type_, "Filter")->AsInput(); - auto* bias = - VarNode("bias")->assert_is_op_input("elementwise_add", "Y")->AsInput(); - - // create op nodes - auto* conv2d = - OpNode("conv2d", conv_type_)->assert_is_op(conv_type_)->AsIntermediate(); - auto* add = OpNode("add", "elementwise_add") - ->assert_is_op("elementwise_add") - ->AsIntermediate(); - - // create intermediate nodes - auto* conv2d_out = VarNode("conv2d_out") - ->assert_is_op_output(conv_type_, "Output") - ->assert_is_op_input("elementwise_add", "X") - ->AsIntermediate(); - // create output node - auto* add_out = VarNode("output") - ->assert_is_op_output("elementwise_add", "Out") - ->AsOutput(); - - // create topology. 
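// A note on the topology block that follows: the `>>` chaining is the
// pattern-matcher DSL from lite/core/mir/pattern_matcher_high_api.h.
// `*a >> *op >> *b` declares the edges a->op->b, and a std::vector<PMNode*>
// on the left-hand side links every element as an input of the op. A
// minimal sketch of the idiom (my_op/my_out are placeholder names, not
// part of this pass):
//
//   std::vector<PMNode*> ins{w, x};  // w and x both feed my_op
//   ins >> *my_op >> *my_out;        // edges: w->my_op, x->my_op, my_op->my_out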
- std::vector conv2d_inputs{filter, input}; - std::vector add_inputs{conv2d_out, bias}; - conv2d_inputs >> *conv2d >> *conv2d_out; - add_inputs >> *add >> *add_out; -} - -void ConvElementwiseFuser::InsertNewNode(SSAGraph* graph, - const key2nodes_t& matched) { - auto op_desc = GenOpDesc(matched); - auto conv_op = LiteOpRegistry::Global().Create(conv_type_); - auto conv_old = matched.at("conv2d")->stmt()->op(); - auto* scope = conv_old->scope(); - auto& valid_places = conv_old->valid_places(); - conv_op->Attach(op_desc, scope); - - auto* new_op_node = graph->GraphCreateInstructNode(conv_op, valid_places); - - IR_NODE_LINK_TO(matched.at("input"), new_op_node); - IR_NODE_LINK_TO(matched.at("filter"), new_op_node); - IR_NODE_LINK_TO(matched.at("bias"), new_op_node); - IR_NODE_LINK_TO(new_op_node, matched.at("output")); -} - -cpp::OpDesc ConvElementwiseFuser::GenOpDesc(const key2nodes_t& matched) { - auto* desc = matched.at("conv2d")->stmt()->op_info(); - - cpp::OpDesc op_desc = *desc; - op_desc.SetType(conv_type_); - op_desc.SetInput("Input", {matched.at("input")->arg()->name}); - op_desc.SetInput("Filter", {matched.at("filter")->arg()->name}); - op_desc.SetInput("Bias", {matched.at("bias")->arg()->name}); - op_desc.SetOutput("Output", {matched.at("output")->arg()->name}); - // Other inputs. See operators/conv_op.h - std::vector input_arg_names = desc->InputArgumentNames(); - - if (std::find(input_arg_names.begin(), - input_arg_names.end(), - "ResidualData") != input_arg_names.end()) { - op_desc.SetInput("ResidualData", desc->Input("ResidualData")); - } - // Only consider strides, padding, groups, dilations for now - op_desc.SetAttr("strides", desc->GetAttr>("strides")); - op_desc.SetAttr("paddings", desc->GetAttr>("paddings")); - op_desc.SetAttr("groups", desc->GetAttr("groups")); - op_desc.SetAttr("dilations", desc->GetAttr>("dilations")); - return op_desc; -} - -} // namespace fusion -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/fusion/conv_elementwise_fuser.h b/lite/core/mir/fusion/conv_elementwise_fuser.h deleted file mode 100644 index 4514fc5010..0000000000 --- a/lite/core/mir/fusion/conv_elementwise_fuser.h +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include "lite/core/mir/pattern_matcher_high_api.h" - -namespace paddle { -namespace lite { -namespace mir { -namespace fusion { - -class ConvElementwiseFuser : public FuseBase { - public: - explicit ConvElementwiseFuser(const std::string& conv_type) { - conv_type_ = conv_type; - } - - void BuildPattern() override; - void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override; - - private: - cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override; - std::string conv_type_; -}; - -} // namespace fusion -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc b/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc deleted file mode 100644 index 71dc31d49a..0000000000 --- a/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/mir/fusion/elementwise_add_activation_fuse_pass.h" -#include -#include -#include "lite/core/mir/fusion/elementwise_add_activation_fuser.h" -#include "lite/core/mir/pass_registry.h" - -namespace paddle { -namespace lite { -namespace mir { - -void ElementwiseAddActivationFusePass::Apply( - const std::unique_ptr& graph) { - fusion::ElementwiseAddActivationFuser fuser("relu"); - fuser(graph.get()); -} - -} // namespace mir -} // namespace lite -} // namespace paddle - -REGISTER_MIR_PASS(lite_elementwise_add_activation_fuse_pass, - paddle::lite::mir::ElementwiseAddActivationFusePass) - .SetTargets({TARGET(kAny)}); diff --git a/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.h b/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.h deleted file mode 100644 index 299b6b89a0..0000000000 --- a/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.h +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include "lite/core/mir/pass.h" - -namespace paddle { -namespace lite { -namespace mir { - -class ElementwiseAddActivationFusePass : public ProgramPass { - public: - void Apply(const std::unique_ptr& graph) override; -}; - -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/fusion/elementwise_add_activation_fuse_pass_test.cc b/lite/core/mir/fusion/elementwise_add_activation_fuse_pass_test.cc deleted file mode 100644 index ca5127db16..0000000000 --- a/lite/core/mir/fusion/elementwise_add_activation_fuse_pass_test.cc +++ /dev/null @@ -1,117 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/mir/fusion/elementwise_add_activation_fuse_pass.h" -#include -#include -#include -#include "lite/api/cxx_api.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/core/mir/graph_visualize_pass.h" -#include "lite/core/op_registry.h" -#include "lite/core/program.h" -#include "lite/core/tensor.h" -#include "paddle/fluid/framework/program_desc.h" - -namespace paddle { -namespace lite { -namespace mir { -namespace fusion { - -std::unique_ptr BuildGraph(framework::ProgramDesc* program_desc, - const std::shared_ptr& scope, - const std::vector& valid_places) { - auto* main_block = program_desc->MutableBlock(0); - - auto* add_1 = main_block->AppendOp(); - auto* add_2 = main_block->AppendOp(); - auto* relu_1 = main_block->AppendOp(); - auto* relu_2 = main_block->AppendOp(); - - main_block->Var("x_1"); - main_block->Var("y_1"); - main_block->Var("add_out_1"); - main_block->Var("relu_out_1"); - main_block->Var("y_2"); - main_block->Var("add_out_2"); - main_block->Var("out"); - - scope->Var("x_1")->GetMutable(); - scope->Var("y_1")->GetMutable(); - scope->Var("add_out_1")->GetMutable(); - scope->Var("relu_out_1")->GetMutable(); - scope->Var("y_2")->GetMutable(); - scope->Var("add_out_2")->GetMutable(); - scope->Var("out")->GetMutable(); - - add_1->SetType("elementwise_add"); - add_1->SetInput("X", {"x_1"}); - add_1->SetInput("Y", {"y_1"}); - add_1->SetOutput("Out", {"add_out_1"}); - add_1->SetAttr("axis", 1); - - relu_1->SetType("relu"); - relu_1->SetInput("X", {"add_out_1"}); - relu_1->SetOutput("Out", {"relu_out_1"}); - - add_2->SetType("elementwise_add"); - add_2->SetInput("X", {"relu_out_1"}); - add_2->SetInput("Y", {"y_2"}); - add_2->SetOutput("Out", {"add_out_2"}); - add_2->SetAttr("axis", 1); - - relu_2->SetType("relu"); - relu_2->SetInput("X", {"add_out_2"}); - relu_2->SetOutput("Out", {"out"}); - - program_desc->Flush(); - - lite::Program program(*program_desc->Proto(), scope, valid_places); - auto graph = std::unique_ptr(new SSAGraph()); - graph->Build(program, valid_places); - - return graph; -} - -TEST(elementwise_add_activation_fuse_pass, graph_test) { - framework::ProgramDesc program_desc; - std::vector places{{TARGET(kHost), PRECISION(kFloat)}}; - auto scope = std::make_shared(); - auto graph = BuildGraph(&program_desc, 
scope, places); - ASSERT_EQ(graph->nodes().size(), - 7UL /*vars*/ + 4UL /*ops*/ + 1UL /* SSAGraph tmp node*/); -} - -TEST(elementwise_add_activation_fuse_pass, fuse_test_op) { - framework::ProgramDesc program_desc; - std::vector places{{TARGET(kHost), PRECISION(kFloat)}}; - auto scope = std::make_shared(); - auto graph = BuildGraph(&program_desc, scope, places); - Visualize(graph.get()); - const int num_nodes = graph->nodes().size(); - auto* fuser = new ElementwiseAddActivationFusePass; - fuser->Apply(graph); - Visualize(graph.get()); - ASSERT_EQ(graph->nodes().size(), - num_nodes - 3UL * 2 /*nodes removed */ + 1UL * 2 /* fused nodes*/); -} - -} // namespace fusion -} // namespace mir -} // namespace lite -} // namespace paddle - -USE_LITE_OP(elementwise_add); -USE_LITE_OP(fusion_elementwise_add_activation); -USE_LITE_OP(relu); diff --git a/lite/core/mir/fusion/elementwise_add_activation_fuser.cc b/lite/core/mir/fusion/elementwise_add_activation_fuser.cc deleted file mode 100644 index 3c6bf4768b..0000000000 --- a/lite/core/mir/fusion/elementwise_add_activation_fuser.cc +++ /dev/null @@ -1,87 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/mir/fusion/elementwise_add_activation_fuser.h" -#include -#include - -namespace paddle { -namespace lite { -namespace mir { -namespace fusion { - -void ElementwiseAddActivationFuser::BuildPattern() { - // create input nodes. - auto* x = VarNode("x")->assert_is_op_input("elementwise_add", "X")->AsInput(); - auto* y = VarNode("y")->assert_is_op_input("elementwise_add", "Y")->AsInput(); - - // create op nodes - auto* add = OpNode("add", "elementwise_add") - ->assert_is_op("elementwise_add") - ->AsIntermediate(); - auto* act = - OpNode("act", act_type_)->assert_is_op(act_type_)->AsIntermediate(); - - // create intermediate nodes - auto* add_out = VarNode("add_out") - ->assert_is_op_output("elementwise_add", "Out") - ->assert_is_op_input(act_type_, "X") - ->AsIntermediate(); - - // create output node - auto* out = - VarNode("output")->assert_is_op_output(act_type_, "Out")->AsOutput(); - - // create topology. 
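// The nodes marked AsIntermediate() above are consumed by the fusion:
// once InsertNewNode() has wired up the fused op, the matcher erases them
// from the graph. That is where the arithmetic in the fuse test above
// comes from: each match removes 3 nodes (the add op, its Out var, and
// the act op) and adds 1 fused op node, so with two matches the count the
// ASSERT_EQ checks is, schematically:
//
//   new_size = old_size - 3 * 2 + 1 * 2;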
- std::vector add_inputs{x, y}; - add_inputs >> *add >> *add_out; - *add_out >> *act >> *out; -} - -void ElementwiseAddActivationFuser::InsertNewNode(SSAGraph* graph, - const key2nodes_t& matched) { - auto op_desc = GenOpDesc(matched); - auto op = - LiteOpRegistry::Global().Create("fusion_elementwise_add_activation"); - auto old_op = matched.at("add")->stmt()->op(); - auto* scope = old_op->scope(); - auto& valid_places = old_op->valid_places(); - op->Attach(op_desc, scope); - - auto* new_op_node = graph->GraphCreateInstructNode(op, valid_places); - - IR_NODE_LINK_TO(matched.at("x"), new_op_node); - IR_NODE_LINK_TO(matched.at("y"), new_op_node); - IR_NODE_LINK_TO(new_op_node, matched.at("output")); -} - -cpp::OpDesc ElementwiseAddActivationFuser::GenOpDesc( - const key2nodes_t& matched) { - auto* desc = matched.at("add")->stmt()->op_info(); - - cpp::OpDesc op_desc; - op_desc.SetType("fusion_elementwise_add_activation"); - op_desc.SetInput("X", {matched.at("x")->arg()->name}); - op_desc.SetInput("Y", {matched.at("y")->arg()->name}); - op_desc.SetOutput("Out", {matched.at("output")->arg()->name}); - - op_desc.SetAttr("axis", desc->GetAttr("axis")); - op_desc.SetAttr("act_type", act_type_); - return op_desc; -} - -} // namespace fusion -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/fusion/elementwise_add_activation_fuser.h b/lite/core/mir/fusion/elementwise_add_activation_fuser.h deleted file mode 100644 index 47bb2fcf82..0000000000 --- a/lite/core/mir/fusion/elementwise_add_activation_fuser.h +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include "lite/core/mir/pattern_matcher_high_api.h" - -namespace paddle { -namespace lite { -namespace mir { -namespace fusion { - -class ElementwiseAddActivationFuser : public FuseBase { - public: - explicit ElementwiseAddActivationFuser(const std::string& act_type) - : act_type_(act_type) {} - void BuildPattern() override; - void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override; - - private: - cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override; - std::string act_type_; -}; - -} // namespace fusion -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/fusion/fc_fuse_pass.cc b/lite/core/mir/fusion/fc_fuse_pass.cc deleted file mode 100644 index 3a68fd19bf..0000000000 --- a/lite/core/mir/fusion/fc_fuse_pass.cc +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/mir/fusion/fc_fuse_pass.h" -#include -#include -#include "lite/core/mir/fusion/fc_fuser.h" -#include "lite/core/mir/pass_registry.h" - -namespace paddle { -namespace lite { -namespace mir { - -void FcFusePass::Apply(const std::unique_ptr& graph) { - fusion::FcFuser fuser; - fuser(graph.get()); -} - -} // namespace mir -} // namespace lite -} // namespace paddle - -REGISTER_MIR_PASS(lite_fc_fuse_pass, paddle::lite::mir::FcFusePass) - .SetTargets({TARGET(kAny)}); diff --git a/lite/core/mir/fusion/fc_fuse_pass.h b/lite/core/mir/fusion/fc_fuse_pass.h deleted file mode 100644 index 44771345a7..0000000000 --- a/lite/core/mir/fusion/fc_fuse_pass.h +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include "lite/core/mir/pass.h" - -namespace paddle { -namespace lite { -namespace mir { - -class FcFusePass : public ProgramPass { - public: - void Apply(const std::unique_ptr& graph) override; -}; - -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/fusion/fc_fuse_pass_test.cc b/lite/core/mir/fusion/fc_fuse_pass_test.cc deleted file mode 100644 index cbf77084dd..0000000000 --- a/lite/core/mir/fusion/fc_fuse_pass_test.cc +++ /dev/null @@ -1,117 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/core/mir/fusion/fc_fuse_pass.h" -#include -#include -#include -#include "lite/api/cxx_api.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/core/op_registry.h" - -DEFINE_string(model_dir, "", ""); -DEFINE_string(optimized_model, "", ""); - -namespace paddle { -namespace lite { -namespace mir { - -TEST(fc_fuse_pass, fuse_test) { - lite::Predictor predictor; -#ifndef LITE_WITH_CUDA - std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kX86), PRECISION(kFloat)}}); -#else - std::vector valid_places({ - Place{TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)}, - Place{TARGET(kCUDA), PRECISION(kFloat), DATALAYOUT(kNCHW)}, - Place{TARGET(kCUDA), PRECISION(kAny), DATALAYOUT(kNCHW)}, - Place{TARGET(kHost), PRECISION(kAny), DATALAYOUT(kNCHW)}, - Place{TARGET(kCUDA), PRECISION(kAny), DATALAYOUT(kAny)}, - Place{TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny)}, - }); -#endif - - predictor.Build(FLAGS_model_dir, - "", - "", - Place{TARGET(kX86), PRECISION(kFloat)}, // origin cuda - valid_places); - - auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim(std::vector({100, 100}))); - auto* data = input_tensor->mutable_data(); - for (int i = 0; i < 100 * 100; i++) { - data[i] = i; - } - - predictor.Run(); - - auto* out = predictor.GetOutput(0); - LOG(INFO) << out << " memory size " << out->data_size(); - LOG(INFO) << "out " << out->data()[0]; - LOG(INFO) << "out " << out->data()[1]; - LOG(INFO) << "dims " << out->dims(); - EXPECT_NEAR(out->data()[0], 38.120617f, 1e-5); - EXPECT_NEAR(out->data()[1], 10.109812f, 1e-5); - CHECK_EQ(out->dims()[0], 100); - CHECK_EQ(out->dims()[1], 500); -} - -#ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK -TEST(fc_fuse_pass, save_model_test) { - lite::Predictor predictor; - std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kX86), PRECISION(kFloat)}}); - predictor.Build(FLAGS_model_dir, - "", - "", - Place{TARGET(kX86), PRECISION(kFloat)}, - valid_places); - - LOG(INFO) << "Save optimized model to " << FLAGS_optimized_model; - predictor.SaveModel(FLAGS_optimized_model); -} -#endif // LITE_WITH_LIGHT_WEIGHT_FRAMEWORK - -} // namespace mir -} // namespace lite -} // namespace paddle - -USE_LITE_OP(mul); -USE_LITE_OP(elementwise_add); -USE_LITE_OP(elementwise_sub); -USE_LITE_OP(fc); -USE_LITE_OP(feed); -USE_LITE_OP(fetch); -USE_LITE_OP(io_copy); -USE_LITE_OP(softmax); -USE_LITE_OP(scale); -USE_LITE_KERNEL(feed, kHost, kAny, kAny, def); -USE_LITE_KERNEL(fetch, kHost, kAny, kAny, def); - -// #ifdef LITE_WITH_X86 -// USE_LITE_KERNEL(mul, kX86, kFloat, kNCHW, def); -// USE_LITE_KERNEL(fc, kX86, kFloat, kNCHW, def); -// USE_LITE_KERNEL(elementwise_sub, kX86, kFloat, kNCHW, def); -// USE_LITE_KERNEL(elementwise_add, kX86, kFloat, kNCHW, def); -// USE_LITE_KERNEL(softmax, kX86, kFloat, kNCHW, def); -// USE_LITE_KERNEL(scale, kX86, kFloat, kNCHW, def); -// #endif - -#ifdef LITE_WITH_CUDA -USE_LITE_KERNEL(mul, kCUDA, kFloat, kNCHW, def); -USE_LITE_KERNEL(io_copy, kCUDA, kAny, kAny, host_to_device); -USE_LITE_KERNEL(io_copy, kCUDA, kAny, kAny, device_to_host); -#endif diff --git a/lite/core/mir/fusion/fc_fuser.cc b/lite/core/mir/fusion/fc_fuser.cc deleted file mode 100644 index 72e1a4684d..0000000000 --- a/lite/core/mir/fusion/fc_fuser.cc +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/mir/fusion/fc_fuser.h" -#include -#include - -namespace paddle { -namespace lite { -namespace mir { -namespace fusion { - -void FcFuser::BuildPattern() { - // create nodes. - auto* x = VarNode("x")->assert_is_op_input("mul", "X"); - auto* W = VarNode("W")->assert_is_op_input("mul", "Y"); - auto* b = VarNode("b"); - auto* mul = OpNode("mul", "mul"); - auto* mul_out = VarNode("mul_out"); - auto* add = OpNode("add", "elementwise_add"); - auto* Out = VarNode("Out"); - - // create topology. - std::vector mul_inputs{W, x}; - std::vector add_inputs{mul_out, b}; - mul_inputs >> *mul >> *mul_out; - add_inputs >> *add >> *Out; - - // Some op specialities. - mul_out->AsIntermediate(); - mul->AsIntermediate(); - add->AsIntermediate(); -} - -void FcFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { - auto op_desc = GenOpDesc(matched); - auto fc_op = LiteOpRegistry::Global().Create("fc"); - auto mul = matched.at("mul")->stmt()->op(); - auto* scope = mul->scope(); - auto& valid_places = mul->valid_places(); - fc_op->Attach(op_desc, scope); - - auto* new_op_node = graph->GraphCreateInstructNode(fc_op, valid_places); - - IR_NODE_LINK_TO(matched.at("W"), new_op_node); - IR_NODE_LINK_TO(matched.at("x"), new_op_node); - IR_NODE_LINK_TO(matched.at("b"), new_op_node); - IR_NODE_LINK_TO(new_op_node, matched.at("Out")); -} - -cpp::OpDesc FcFuser::GenOpDesc(const key2nodes_t& matched) { - cpp::OpDesc op_desc = *matched.at("mul")->stmt()->op_info(); - op_desc.SetType("fc"); - op_desc.SetInput("Input", {matched.at("x")->arg()->name}); - op_desc.SetInput("W", {matched.at("W")->arg()->name}); - op_desc.SetInput("Bias", {matched.at("b")->arg()->name}); - op_desc.SetOutput("Out", {matched.at("Out")->arg()->name}); - op_desc.SetAttr( - "in_num_col_dims", - matched.at("mul")->stmt()->op_info()->GetAttr("x_num_col_dims")); - return op_desc; -} - -} // namespace fusion -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/fusion/fc_fuser.h b/lite/core/mir/fusion/fc_fuser.h deleted file mode 100644 index 7ba0752789..0000000000 --- a/lite/core/mir/fusion/fc_fuser.h +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
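// FcFuser above rewrites mul(X, W) followed by elementwise_add(., b) into
// a single fc op computing roughly Out = X * W + b, mapping mul's
// "x_num_col_dims" attribute onto fc's "in_num_col_dims". A sketch of the
// rewrite on a matched subgraph (variable names are illustrative):
//
//   mul(X=x, Y=W) -> mul_out;  elementwise_add(X=mul_out, Y=b) -> Out
//     ==>  fc(Input=x, W=W, Bias=b) -> Out   // in_num_col_dims = x_num_col_dims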
- -#pragma once - -#include -#include -#include "lite/core/mir/pattern_matcher_high_api.h" - -namespace paddle { -namespace lite { -namespace mir { -namespace fusion { - -class FcFuser : public FuseBase { - public: - void BuildPattern() override; - void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override; - - private: - cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override; -}; - -} // namespace fusion -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/fusion/interpolate_fuse_pass.cc b/lite/core/mir/fusion/interpolate_fuse_pass.cc deleted file mode 100644 index 5a0e1384a7..0000000000 --- a/lite/core/mir/fusion/interpolate_fuse_pass.cc +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/mir/fusion/interpolate_fuse_pass.h" -#include -#include -#include "lite/core/mir/fusion/interpolate_fuser.h" -#include "lite/core/mir/pass_registry.h" - -namespace paddle { -namespace lite { -namespace mir { - -void InterpolateFusePass::Apply(const std::unique_ptr& graph) { - fusion::InterpolateFuser bilinear_interp_fuser("bilinear_interp"); - bilinear_interp_fuser(graph.get()); - - fusion::InterpolateFuser nearest_interp_fuser("nearest_interp"); - nearest_interp_fuser(graph.get()); -} - -} // namespace mir -} // namespace lite -} // namespace paddle - -REGISTER_MIR_PASS(lite_interpolate_fuse_pass, - paddle::lite::mir::InterpolateFusePass) - .SetTargets({TARGET(kAny)}); diff --git a/lite/core/mir/fusion/interpolate_fuse_pass.h b/lite/core/mir/fusion/interpolate_fuse_pass.h deleted file mode 100644 index 2beb4bb5b0..0000000000 --- a/lite/core/mir/fusion/interpolate_fuse_pass.h +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include "lite/core/mir/pass.h" - -namespace paddle { -namespace lite { -namespace mir { - -class InterpolateFusePass : public ProgramPass { - public: - void Apply(const std::unique_ptr& graph) override; -}; - -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/fusion/interpolate_fuser.cc b/lite/core/mir/fusion/interpolate_fuser.cc deleted file mode 100644 index 458ef76cb4..0000000000 --- a/lite/core/mir/fusion/interpolate_fuser.cc +++ /dev/null @@ -1,95 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/mir/fusion/interpolate_fuser.h" -#include -#include - -namespace paddle { -namespace lite { -namespace mir { -namespace fusion { - -void InterpolateFuser::BuildPattern() { - auto* x = VarNode("x"); - auto* shape = OpNode("shape", "shape")->AsIntermediate(); - auto* shape_out = VarNode("shape_out")->AsIntermediate(); - auto* slice = OpNode("slice", "slice") - ->assert_op_attr_satisfied>( - "axes", - [](const std::vector& attr) { - return attr.size() == 1 && attr[0] == 0; - }) - ->assert_op_attr_satisfied>( - "starts", - [](const std::vector& attr) { - return attr.size() == 1 && attr[0] == 2; - }) - ->assert_op_attr_satisfied>( - "ends", - [](const std::vector& attr) { - return attr.size() == 1 && attr[0] == 4; - }) - ->AsIntermediate(); - auto* slice_out = VarNode("slice_out")->AsIntermediate(); - auto* cast = OpNode("cast", "cast")->AsIntermediate(); - auto* cast_out = VarNode("cast_out")->AsIntermediate(); - auto* fill_constant = - OpNode("fill_constant", "fill_constant")->AsIntermediate(); - auto* fill_constant_out = VarNode("fill_constant_out")->AsIntermediate(); - auto* elementwise_mul = - OpNode("elementwise_mul", "elementwise_mul") - ->assert_op_attr_satisfied( - "axis", [](int attr) { return attr == -1 || attr == 0; }) - ->AsIntermediate(); - auto* elementwise_mul_out = VarNode("elementwise_mul_out")->AsIntermediate(); - auto* interpolate = OpNode("interpolate", interp_type_)->AsIntermediate(); - auto* interpolate_out = VarNode("interpolate_out"); - - // create topology. 
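// The chain matched below computes the interpolation output size at run
// time: shape(x) -> slice(axes=[0], starts=[2], ends=[4]) extracts H and W,
// cast converts them, and elementwise_mul scales them by a fill_constant
// value. GenOpDesc() further down folds that constant into a static
// "scale" attribute and clears OutSize, e.g. (the value 2.0 is
// illustrative, not from the source):
//
//   fill_constant(value=2.0)  ==>  op_desc.SetAttr("scale", 2.0f);
//                                  op_desc.SetInput("OutSize", {});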
- *x >> *shape >> *shape_out >> *slice >> *slice_out >> *cast >> *cast_out >> - *elementwise_mul >> *elementwise_mul_out >> *interpolate >> - *interpolate_out; - *fill_constant >> *fill_constant_out >> *elementwise_mul; - *x >> *interpolate; -} - -void InterpolateFuser::InsertNewNode(SSAGraph* graph, - const key2nodes_t& matched) { - auto op_desc = GenOpDesc(matched); - auto interp_op = LiteOpRegistry::Global().Create(interp_type_); - auto interp_old = matched.at("interpolate")->stmt()->op(); - auto* scope = interp_old->scope(); - auto& valid_places = interp_old->valid_places(); - interp_op->Attach(op_desc, scope); - - auto* new_op_node = graph->GraphCreateInstructNode(interp_op, valid_places); - - IR_NODE_LINK_TO(matched.at("x"), new_op_node); - IR_NODE_LINK_TO(new_op_node, matched.at("interpolate_out")); -} - -cpp::OpDesc InterpolateFuser::GenOpDesc(const key2nodes_t& matched) { - auto op_desc = *matched.at("interpolate")->stmt()->op_info(); - op_desc.SetInput("OutSize", {}); - op_desc.SetAttr( - "scale", - matched.at("fill_constant")->stmt()->op_info()->GetAttr("value")); - return op_desc; -} - -} // namespace fusion -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/fusion/interpolate_fuser.h b/lite/core/mir/fusion/interpolate_fuser.h deleted file mode 100644 index 51f5655e76..0000000000 --- a/lite/core/mir/fusion/interpolate_fuser.h +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include "lite/core/mir/pattern_matcher_high_api.h" - -namespace paddle { -namespace lite { -namespace mir { -namespace fusion { - -class InterpolateFuser : public FuseBase { - public: - explicit InterpolateFuser(const std::string& interp_type) - : interp_type_(interp_type) {} - - void BuildPattern() override; - void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override; - - private: - cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override; - std::string interp_type_; -}; - -} // namespace fusion -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/fusion/quant_dequant_fuse_pass.cc b/lite/core/mir/fusion/quant_dequant_fuse_pass.cc deleted file mode 100644 index 9773caa3c1..0000000000 --- a/lite/core/mir/fusion/quant_dequant_fuse_pass.cc +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/mir/fusion/quant_dequant_fuse_pass.h" -#include -#include -#include "lite/api/paddle_place.h" -#include "lite/core/mir/fusion/quant_dequant_op_fuser.h" -#include "lite/core/mir/pass_registry.h" - -namespace paddle { -namespace lite { -namespace mir { - -void QuantDequantFusePass::Apply(const std::unique_ptr& graph) { - std::unordered_set quant_types = { - "fake_quantize_range_abs_max", "fake_quantize_moving_average_abs_max"}; - std::unordered_set quantized_op_types = { - "conv2d", "mul", "depthwise_conv2d"}; - for (auto& quant_type : quant_types) { - for (auto& op_type : quantized_op_types) { - for (int i = 6; i >= 1; i--) { - fusion::QuantDequantOpFuser fuser(op_type, quant_type, i); - fuser(graph.get()); - } - } - } -} - -} // namespace mir -} // namespace lite -} // namespace paddle - -REGISTER_MIR_PASS(lite_quant_dequant_fuse_pass, - paddle::lite::mir::QuantDequantFusePass) - .SetTargets({TARGET(kAny)}); diff --git a/lite/core/mir/fusion/quant_dequant_fuse_pass.h b/lite/core/mir/fusion/quant_dequant_fuse_pass.h deleted file mode 100644 index 243241bfb7..0000000000 --- a/lite/core/mir/fusion/quant_dequant_fuse_pass.h +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include "lite/core/mir/pass.h" - -namespace paddle { -namespace lite { -namespace mir { - -class QuantDequantFusePass : public ProgramPass { - public: - void Apply(const std::unique_ptr& graph) override; -}; - -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/fusion/quant_dequant_op_fuser.cc b/lite/core/mir/fusion/quant_dequant_op_fuser.cc deleted file mode 100644 index 1c7cf866b9..0000000000 --- a/lite/core/mir/fusion/quant_dequant_op_fuser.cc +++ /dev/null @@ -1,200 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
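// Context for the `times_` parameter used throughout this file:
// QuantDequantFusePass::Apply() above runs this fuser with `times`
// counting down from 6 to 1, and BuildPattern() below builds `times_`
// parallel quantized_op + fake_dequant branches hanging off a single
// fake_quant output. Presumably the descending order lets a quant op that
// feeds, say, three convs be matched once as a three-branch pattern
// rather than piecemeal (that rationale is inferred, not stated in the
// source).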
- -#include "lite/core/mir/fusion/quant_dequant_op_fuser.h" -#include -#include -#include "lite/utils/string.h" - -namespace paddle { -namespace lite { -namespace mir { -namespace fusion { - -void QuantDequantOpFuser::BuildPattern() { - const int kNumFields = 5; - const int kQuantizedWeightOffset = 0; - const int kQuantizedOpOffset = 1; - const int kQuantizedOpOutOffset = 2; - const int kDequantOpOffset = 3; - const int kDequantOpOutOffset = 4; - - std::string weight_name = ""; - if (op_type_ == "conv2d" || op_type_ == "depthwise_conv2d") { - weight_name = "Filter"; - } else { - weight_name = "Y"; - } - auto* quant_op_input = VarNode("quant_op_input") - ->assert_is_op_input(quant_type_, "X") - ->AsInput(); - auto* quant_op_in_scale = VarNode("quant_op_in_scale") - ->assert_is_op_input(quant_type_, "InScale") - ->AsIntermediate(); - auto* quant_op = OpNode("quant_op", quant_type_) - ->assert_is_op(quant_type_) - ->AsIntermediate(); - - auto* quant_op_out_scale = - VarNode("quant_op_out_scale") - ->assert_is_op_output(quant_type_, "OutScale") - ->assert_is_op_input("fake_dequantize_max_abs", "Scale") - ->AsIntermediate(); - - auto* quant_op_out = VarNode("quant_op_out") - ->assert_is_op_output(quant_type_, "Out") - ->assert_is_op_input(op_type_) - ->AsIntermediate(); - std::vector nodes; - for (int i = 0; i < times_; i++) { - nodes.push_back(VarNode(string_format("quantized_op_weight%d", i)) - ->assert_is_op_input(op_type_, weight_name) - ->AsInput()); - - nodes.push_back(OpNode(string_format("quantized_op%d", i), op_type_) - ->assert_is_op(op_type_) - ->AsIntermediate()); - - nodes.push_back(VarNode(string_format("quantized_op_out%d", i)) - ->assert_is_op_output(op_type_) - ->assert_is_op_input("fake_dequantize_max_abs", "X") - ->AsIntermediate()); - - nodes.push_back( - OpNode(string_format("dequant_op%d", i), "fake_dequantize_max_abs") - ->assert_is_op("fake_dequantize_max_abs") - ->AsIntermediate()); - nodes.push_back(VarNode(string_format("dequant_op_out%d", i)) - ->assert_is_op_output("fake_dequantize_max_abs", "Out") - ->AsOutput()); - } - - quant_op->LinksFrom({quant_op_input, quant_op_in_scale}); - quant_op_out->LinksFrom({quant_op}); - quant_op_out_scale->LinksFrom({quant_op}); - for (int i = 0; i < times_; i++) { - nodes[i * kNumFields + kQuantizedOpOffset]->LinksFrom( - {quant_op_out, nodes[i * kNumFields + kQuantizedWeightOffset]}); - nodes[i * kNumFields + kQuantizedOpOutOffset]->LinksFrom( - {nodes[i * kNumFields + kQuantizedOpOffset]}); - nodes[i * kNumFields + kDequantOpOffset]->LinksFrom( - {nodes[i * kNumFields + kQuantizedOpOutOffset], quant_op_out_scale}); - nodes[i * kNumFields + kDequantOpOutOffset]->LinksFrom( - {nodes[i * kNumFields + kDequantOpOffset]}); - } -} - -void QuantDequantOpFuser::InsertNewNode(SSAGraph* graph, - const key2nodes_t& matched) { - const int kNumFields = 5; - const int kQuantizedWeightOffset = 0; - const int kQuantizedOpOffset = 1; - const int kDequantOpOffset = 3; - const int kDequantOpOutOffset = 4; - - auto* quant_op_input = matched.at("quant_op_input"); - auto* quant_op_in_scale = matched.at("quant_op_in_scale"); - auto* quant_op = matched.at("quant_op"); - - std::vector nodes; - for (int i = 0; i < times_; i++) { - nodes.push_back(matched.at(string_format("quantized_op_weight%d", i))); - nodes.push_back(matched.at(string_format("quantized_op%d", i))); - nodes.push_back(matched.at(string_format("quantized_op_out%d", i))); - nodes.push_back(matched.at(string_format("dequant_op%d", i))); - 
nodes.push_back(matched.at(string_format("dequant_op_out%d", i)));
-  }
-  int bit_length = quant_op->stmt()->op_info()->GetAttr<int>("bit_length");
-  auto* scope = quant_op->stmt()->op()->scope();
-  auto& valid_places = quant_op->stmt()->op()->valid_places();
-  int range = ((1 << (bit_length - 1)) - 1);
-  auto input_scale_t = scope->FindVar(quant_op_in_scale->arg()->name)
-                           ->GetMutable<lite::Tensor>();
-  float input_scale = input_scale_t->data<float>()[0] / range;
-
-  VLOG(4) << "range: " << range << " input_scale: " << input_scale;
-  for (int i = 0; i < times_; i++) {
-    float max_range = nodes[i * kNumFields + kDequantOpOffset]
-                          ->stmt()
-                          ->op_info()
-                          ->GetAttr<float>("max_range");
-    // weight_scale = max(abs(weight))
-    float whole_weight_scale =
-        static_cast<float>(range * range) / max_range / range;
-
-    cpp::OpDesc op_desc =
-        *nodes[i * kNumFields + kQuantizedOpOffset]->stmt()->op_info();
-
-    auto quantized_weight_var_name =
-        nodes[i * kNumFields + kQuantizedWeightOffset]->arg()->name;
-    auto quantized_weight_t =
-        scope->FindVar(quantized_weight_var_name)->GetMutable<lite::Tensor>();
-    std::vector<float> weight_scale;
-    int weight_scale_size;
-
-    if (op_type_ == "conv2d" || op_type_ == "depthwise_conv2d") {
-      op_desc.SetInput("Input", {matched.at("quant_op_input")->arg()->name});
-      op_desc.SetOutput(
-          "Output", {nodes[i * kNumFields + kDequantOpOutOffset]->arg()->name});
-      // Conv weight shape: Cout * Cin * kh * hw, the weight_scale_size should
-      // be Cout.
-      weight_scale_size = quantized_weight_t->dims()[0];
-    } else if (op_type_ == "mul") {
-      op_desc.SetInput("X", {matched.at("quant_op_input")->arg()->name});
-      op_desc.SetOutput(
-          "Out", {nodes[i * kNumFields + kDequantOpOutOffset]->arg()->name});
-      // Fc weight: Cin * Cout, the weight_scale_size should be Cout.
-      weight_scale_size = quantized_weight_t->dims()[1];
-    }
-    for (int i = 0; i < weight_scale_size; i++) {
-      weight_scale.push_back(whole_weight_scale);
-    }
-    op_desc.SetAttr("enable_int8", true);
-    op_desc.SetAttr("input_scale", input_scale);
-    op_desc.SetAttr("weight_scale", weight_scale);
-
-    Tensor temp_tensor;
-    temp_tensor.CopyDataFrom(*quantized_weight_t);
-    float* temp_data = temp_tensor.mutable_data<float>();
-
-    size_t weight_num = quantized_weight_t->data_size();
-    int8_t* quantized_weight_data = quantized_weight_t->mutable_data<int8_t>();
-
-    // change the weight from the float type to int8 type.
-    for (size_t i = 0; i < weight_num; i++) {
-      quantized_weight_data[i] = static_cast<int8_t>(temp_data[i]);
-    }
-    quantized_weight_t->set_persistable(true);
-    quantized_weight_t->set_precision(PRECISION(kInt8));
-    auto quantized_op = LiteOpRegistry::Global().Create(op_type_);
-
-    quantized_op->Attach(op_desc, scope);
-    auto* new_op_node =
-        graph->GraphCreateInstructNode(quantized_op, valid_places);
-    IR_NODE_LINK_TO(quant_op_input, new_op_node);
-    IR_NODE_LINK_TO(nodes[i * kNumFields + kQuantizedWeightOffset],
-                    new_op_node);
-    IR_NODE_LINK_TO(new_op_node, nodes[i * kNumFields + kDequantOpOutOffset]);
-  }
-}
-
-cpp::OpDesc QuantDequantOpFuser::GenOpDesc(const key2nodes_t& matched) {
-  cpp::OpDesc op_desc;
-  return op_desc;
-}
-
-}  // namespace fusion
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/mir/fusion/quant_dequant_op_fuser.h b/lite/core/mir/fusion/quant_dequant_op_fuser.h
deleted file mode 100644
index 15833ad258..0000000000
--- a/lite/core/mir/fusion/quant_dequant_op_fuser.h
+++ /dev/null
@@ -1,59 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include "lite/core/mir/pattern_matcher_high_api.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-namespace fusion {
-
-/* A model trained with fluid quantization simulates real int8 inference:
- * each quantized op (conv2d, mul, depthwise_conv2d, etc.) is preceded by a
- * fake_quant op and followed by a fake_dequant op.
- *
- * In int8 mode, this fuser detects the "fake_quant + quantized_op +
- * fake_dequant" pattern, extracts the input_scale and weight_scale from the
- * fake_quant and fake_dequant ops, and fuses them into the quantized op.
- * Finally, it deletes the fake_quant and fake_dequant ops from the graph.
- */
-class QuantDequantOpFuser : public FuseBase {
- public:
-  explicit QuantDequantOpFuser(const std::string& op_type,
-                               const std::string& quant_type,
-                               int times)
-      : op_type_(op_type), quant_type_(quant_type), times_(times) {}
-  void BuildPattern() override;
-  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
-
- private:
-  cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
-
- private:
-  std::string op_type_{"conv2d"};
-  std::string quant_type_;
-  int times_;
-};
-
-}  // namespace fusion
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/mir/fusion/shuffle_channel_fuse_pass.cc b/lite/core/mir/fusion/shuffle_channel_fuse_pass.cc
deleted file mode 100644
index 049be721e9..0000000000
--- a/lite/core/mir/fusion/shuffle_channel_fuse_pass.cc
+++ /dev/null
@@ -1,39 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
- -#include "lite/core/mir/fusion/shuffle_channel_fuse_pass.h" -#include -#include -#include "lite/core/mir/fusion/shuffle_channel_fuser.h" -#include "lite/core/mir/pass_registry.h" - -namespace paddle { -namespace lite { -namespace mir { - -void ShuffleChannelFusePass::Apply(const std::unique_ptr& graph) { - fusion::ShuffleChannelFuser fuser("reshape", "transpose"); - fuser(graph.get()); - - fusion::ShuffleChannelFuser fuser2("reshape2", "transpose2"); - fuser2(graph.get()); -} - -} // namespace mir -} // namespace lite -} // namespace paddle - -REGISTER_MIR_PASS(lite_shuffle_channel_fuse_pass, - paddle::lite::mir::ShuffleChannelFusePass) - .SetTargets({TARGET(kAny)}); diff --git a/lite/core/mir/fusion/shuffle_channel_fuse_pass.h b/lite/core/mir/fusion/shuffle_channel_fuse_pass.h deleted file mode 100644 index 0524aff395..0000000000 --- a/lite/core/mir/fusion/shuffle_channel_fuse_pass.h +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include "lite/core/mir/pass.h" - -namespace paddle { -namespace lite { -namespace mir { - -class ShuffleChannelFusePass : public ProgramPass { - public: - void Apply(const std::unique_ptr& graph) override; -}; - -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/fusion/shuffle_channel_fuser.cc b/lite/core/mir/fusion/shuffle_channel_fuser.cc deleted file mode 100644 index f0087f8991..0000000000 --- a/lite/core/mir/fusion/shuffle_channel_fuser.cc +++ /dev/null @@ -1,109 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/mir/fusion/shuffle_channel_fuser.h" -#include -#include - -namespace paddle { -namespace lite { -namespace mir { -namespace fusion { - -void ShuffleChannelFuser::BuildPattern() { - // create nodes. 
- auto* x1 = VarNode("x1")->assert_is_op_input(reshape_type_, "X"); - auto* y1 = VarNode("y1")->assert_is_op_output(reshape_type_, "Out"); - auto* y2 = VarNode("y2")->assert_is_op_output(transpose_type_, "Out"); - auto* out = VarNode("out")->assert_is_op_output(reshape_type_, "Out"); - - PMNode* xshape1 = nullptr; - PMNode* xshape2 = nullptr; - PMNode* xshape3 = nullptr; - if (reshape_type_ == "reshape2") { - xshape1 = VarNode("xshape1")->assert_is_op_output(reshape_type_, "XShape"); - xshape3 = VarNode("xshape3")->assert_is_op_output(reshape_type_, "XShape"); - } - if (transpose_type_ == "transpose2") { - xshape2 = - VarNode("xshape2")->assert_is_op_output(transpose_type_, "XShape"); - } - - auto* reshape1 = OpNode("reshape1", reshape_type_) - ->assert_op_attr_satisfied>( - "shape", [](const std::vector& attr) { - return attr.size() >= 5 && attr[1] > 0; - }); - auto* transpose = - OpNode("transpose_op", transpose_type_) - ->assert_op_attr_satisfied>( - "axis", [](const std::vector& attr) { - return attr.size() >= 5 && attr[1] == 2 && attr[2] == 1; - }); - auto* reshape2 = OpNode("reshape2", reshape_type_) - ->assert_op_attr_satisfied>( - "shape", [](const std::vector& attr) { - return attr.size() >= 4; - }); - - // create topology. - *x1 >> *reshape1 >> *y1 >> *transpose >> *y2 >> *reshape2 >> *out; - if (xshape1) *reshape1 >> *xshape1; - if (xshape2) *transpose >> *xshape2; - if (xshape3) *reshape2 >> *xshape3; - - // Some op specialities. - y1->AsIntermediate(); - y2->AsIntermediate(); - if (xshape1) xshape1->AsIntermediate(); - if (xshape2) xshape2->AsIntermediate(); - if (xshape3) xshape3->AsIntermediate(); - reshape1->AsIntermediate(); - transpose->AsIntermediate(); - reshape2->AsIntermediate(); -} - -void ShuffleChannelFuser::InsertNewNode(SSAGraph* graph, - const key2nodes_t& matched) { - auto op_desc = GenOpDesc(matched); - auto shuffle_channel_op = LiteOpRegistry::Global().Create("shuffle_channel"); - auto transpose = matched.at("transpose_op")->stmt()->op(); - auto* scope = transpose->scope(); - auto& valid_places = transpose->valid_places(); - shuffle_channel_op->Attach(op_desc, scope); - - auto* new_op_node = - graph->GraphCreateInstructNode(shuffle_channel_op, valid_places); - - IR_NODE_LINK_TO(matched.at("x1"), new_op_node); - IR_NODE_LINK_TO(new_op_node, matched.at("out")); -} - -cpp::OpDesc ShuffleChannelFuser::GenOpDesc(const key2nodes_t& matched) { - cpp::OpDesc op_desc; - op_desc.SetType("shuffle_channel"); - op_desc.SetInput("X", {matched.at("x1")->arg()->name}); - op_desc.SetOutput("Out", {matched.at("out")->arg()->name}); - op_desc.SetAttr("group", - matched.at("reshape1") - ->stmt() - ->op_info() - ->GetAttr>("shape")[1]); - return op_desc; -} - -} // namespace fusion -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/fusion/shuffle_channel_fuser.h b/lite/core/mir/fusion/shuffle_channel_fuser.h deleted file mode 100644 index 4fb99ab5c8..0000000000 --- a/lite/core/mir/fusion/shuffle_channel_fuser.h +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include "lite/core/mir/pattern_matcher_high_api.h" - -namespace paddle { -namespace lite { -namespace mir { -namespace fusion { - -class ShuffleChannelFuser : public FuseBase { - public: - explicit ShuffleChannelFuser(const std::string& reshape_type, - const std::string& transpose_type) - : reshape_type_(reshape_type), transpose_type_(transpose_type) {} - - void BuildPattern() override; - void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override; - - private: - cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override; - std::string reshape_type_; - std::string transpose_type_; -}; - -} // namespace fusion -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/fusion/transpose_softmax_transpose_fuse_pass.cc b/lite/core/mir/fusion/transpose_softmax_transpose_fuse_pass.cc deleted file mode 100644 index 47c866d87a..0000000000 --- a/lite/core/mir/fusion/transpose_softmax_transpose_fuse_pass.cc +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/mir/fusion/transpose_softmax_transpose_fuse_pass.h" -#include -#include -#include "lite/core/mir/fusion/transpose_softmax_transpose_fuser.h" -#include "lite/core/mir/pass_registry.h" - -namespace paddle { -namespace lite { -namespace mir { - -void TransposeSoftmaxTransposeFusePass::Apply( - const std::unique_ptr& graph) { - fusion::TransposeSoftmaxTransposeFuser fuser("transpose", "softmax"); - fuser(graph.get()); - - fusion::TransposeSoftmaxTransposeFuser fuser2("transpose2", "softmax"); - fuser2(graph.get()); -} - -} // namespace mir -} // namespace lite -} // namespace paddle - -REGISTER_MIR_PASS(lite_transpose_softmax_transpose_fuse_pass, - paddle::lite::mir::TransposeSoftmaxTransposeFusePass) - .SetTargets({TARGET(kAny)}); diff --git a/lite/core/mir/fusion/transpose_softmax_transpose_fuse_pass.h b/lite/core/mir/fusion/transpose_softmax_transpose_fuse_pass.h deleted file mode 100644 index 4ae6ce83c4..0000000000 --- a/lite/core/mir/fusion/transpose_softmax_transpose_fuse_pass.h +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
diff --git a/lite/core/mir/fusion/transpose_softmax_transpose_fuse_pass.cc b/lite/core/mir/fusion/transpose_softmax_transpose_fuse_pass.cc
deleted file mode 100644
index 47c866d87a..0000000000
--- a/lite/core/mir/fusion/transpose_softmax_transpose_fuse_pass.cc
+++ /dev/null
@@ -1,40 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/core/mir/fusion/transpose_softmax_transpose_fuse_pass.h"
-#include <memory>
-#include <vector>
-#include "lite/core/mir/fusion/transpose_softmax_transpose_fuser.h"
-#include "lite/core/mir/pass_registry.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-
-void TransposeSoftmaxTransposeFusePass::Apply(
-    const std::unique_ptr<SSAGraph>& graph) {
-  fusion::TransposeSoftmaxTransposeFuser fuser("transpose", "softmax");
-  fuser(graph.get());
-
-  fusion::TransposeSoftmaxTransposeFuser fuser2("transpose2", "softmax");
-  fuser2(graph.get());
-}
-
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
-
-REGISTER_MIR_PASS(lite_transpose_softmax_transpose_fuse_pass,
-                  paddle::lite::mir::TransposeSoftmaxTransposeFusePass)
-    .SetTargets({TARGET(kAny)});
diff --git a/lite/core/mir/fusion/transpose_softmax_transpose_fuse_pass.h b/lite/core/mir/fusion/transpose_softmax_transpose_fuse_pass.h
deleted file mode 100644
index 4ae6ce83c4..0000000000
--- a/lite/core/mir/fusion/transpose_softmax_transpose_fuse_pass.h
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include "lite/core/mir/pass.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-
-class TransposeSoftmaxTransposeFusePass : public ProgramPass {
- public:
-  void Apply(const std::unique_ptr<SSAGraph>& graph) override;
-};
-
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/mir/fusion/transpose_softmax_transpose_fuser.cc b/lite/core/mir/fusion/transpose_softmax_transpose_fuser.cc
deleted file mode 100644
index d578b725ec..0000000000
--- a/lite/core/mir/fusion/transpose_softmax_transpose_fuser.cc
+++ /dev/null
@@ -1,99 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/core/mir/fusion/transpose_softmax_transpose_fuser.h"
-#include <memory>
-#include <vector>
-
-namespace paddle {
-namespace lite {
-namespace mir {
-namespace fusion {
-
-void TransposeSoftmaxTransposeFuser::BuildPattern() {
-  // create nodes.
-  auto* x1 = VarNode("x1")->assert_is_op_input(transpose_type_, "X");
-  auto* y1 = VarNode("y1")->assert_is_op_output(transpose_type_, "Out");
-  auto* y2 = VarNode("y2")->assert_is_op_output(softmax_type_, "Out");
-  auto* out = VarNode("out")->assert_is_op_output(transpose_type_, "Out");
-
-  PMNode* xshape1 = nullptr;
-  PMNode* xshape2 = nullptr;
-  if (transpose_type_ == "transpose2") {
-    xshape1 =
-        VarNode("xshape1")->assert_is_op_output(transpose_type_, "XShape");
-    xshape2 =
-        VarNode("xshape2")->assert_is_op_output(transpose_type_, "XShape");
-  }
-
-  auto* transpose1 =
-      OpNode("transpose1", transpose_type_)->assert_is_op(transpose_type_);
-
-  auto* softmax = OpNode("softmax", softmax_type_)
-                      ->assert_op_attr_satisfied<int>(
-                          "axis", [](int attr) { return attr == -1; });
-
-  auto* transpose2 =
-      OpNode("transpose2", transpose_type_)->assert_is_op(transpose_type_);
-
-  // create topology.
-  *x1 >> *transpose1 >> *y1 >> *softmax >> *y2 >> *transpose2 >> *out;
-  if (xshape1) *transpose1 >> *xshape1;
-  if (xshape2) *transpose2 >> *xshape2;
-
-  // nodes to remove
-  y1->AsIntermediate();
-  y2->AsIntermediate();
-  if (xshape1) xshape1->AsIntermediate();
-  if (xshape2) xshape2->AsIntermediate();
-  transpose1->AsIntermediate();
-  softmax->AsIntermediate();
-  transpose2->AsIntermediate();
-}
-
-void TransposeSoftmaxTransposeFuser::InsertNewNode(SSAGraph* graph,
-                                                   const key2nodes_t& matched) {
-  auto op_desc = GenOpDesc(matched);
-  auto softmax_op = LiteOpRegistry::Global().Create(softmax_type_);
-  auto softmax_old = matched.at("softmax")->stmt()->op();
-  auto* scope = softmax_old->scope();
-  auto& valid_places = softmax_old->valid_places();
-  softmax_op->Attach(op_desc, scope);
-
-  auto* new_op_node = graph->GraphCreateInstructNode(softmax_op, valid_places);
-
-  IR_NODE_LINK_TO(matched.at("x1"), new_op_node);
-  IR_NODE_LINK_TO(new_op_node, matched.at("out"));
-}
-
-cpp::OpDesc TransposeSoftmaxTransposeFuser::GenOpDesc(
-    const key2nodes_t& matched) {
-  cpp::OpDesc op_desc;
-  op_desc.SetType("softmax");
-  op_desc.SetInput("X", {matched.at("x1")->arg()->name});
-  op_desc.SetOutput("Out", {matched.at("out")->arg()->name});
-  op_desc.SetAttr("axis",
-                  matched.at("transpose1")
-                      ->stmt()
-                      ->op_info()
-                      ->GetAttr<std::vector<int>>("axis")
-                      .back());
-
-  return op_desc;
-}
-
-}  // namespace fusion
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/mir/fusion/transpose_softmax_transpose_fuser.h b/lite/core/mir/fusion/transpose_softmax_transpose_fuser.h
deleted file mode 100644
index fbccfd2c6a..0000000000
--- a/lite/core/mir/fusion/transpose_softmax_transpose_fuser.h
+++ /dev/null
@@ -1,44 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include "lite/core/mir/pattern_matcher_high_api.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-namespace fusion {
-
-class TransposeSoftmaxTransposeFuser : public FuseBase {
- public:
-  explicit TransposeSoftmaxTransposeFuser(const std::string& transpose_type,
-                                          const std::string& softmax_type)
-      : transpose_type_(transpose_type), softmax_type_(softmax_type) {}
-
-  void BuildPattern() override;
-  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
-
- private:
-  cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
-  std::string transpose_type_;
-  std::string softmax_type_;
-};
-
-}  // namespace fusion
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/mir/generate_program_pass.cc b/lite/core/mir/generate_program_pass.cc
deleted file mode 100644
index 23f2de564e..0000000000
--- a/lite/core/mir/generate_program_pass.cc
+++ /dev/null
@@ -1,42 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/core/mir/generate_program_pass.h"
-#include <memory>
-#include <string>
-#include <vector>
-#include "lite/core/mir/graph_visualize_pass.h"
-#include "lite/core/mir/pass_registry.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-
-void GenerateProgramPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
-  VLOG(4) << "final program \n" << Visualize(graph.get());
-  for (auto& item : graph->StmtTopologicalOrder()) {
-    if (item->IsStmt()) {
-      auto& stmt = item->AsStmt();
-      VLOG(4) << stmt;
-      insts_.emplace_back(stmt.op(), std::move(stmt.kernels().front()));
-    }
-  }
-}
-
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
-
-REGISTER_MIR_PASS(generate_program_pass, paddle::lite::mir::GenerateProgramPass)
-    .SetTargets({TARGET(kAny)});
diff --git a/lite/core/mir/generate_program_pass.h b/lite/core/mir/generate_program_pass.h
deleted file mode 100644
index b126b4aba4..0000000000
--- a/lite/core/mir/generate_program_pass.h
+++ /dev/null
@@ -1,50 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <utility>
-#include <vector>
-#include "lite/core/kernel.h"
-#include "lite/core/mir/pass.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-
-/*
- * GenerateProgramPass builds the execution program for the executor from a
- * MIR graph.
- */
-class GenerateProgramPass : public ProgramPass {
- public:
-  void Apply(const std::unique_ptr<SSAGraph>& graph) override;
-
-  std::unique_ptr<RuntimeProgram> GenProgram() {
-    LOG(INFO) << "insts.size " << insts_.size();
-    std::unique_ptr<RuntimeProgram> program(
-        new RuntimeProgram(std::move(insts_)));
-
-    return program;
-  }
-
- private:
-  std::vector<Instruction> insts_;
-};
-
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
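GenerateProgramPass is the tail of the optimization pipeline: Apply collects one (op, kernel) Instruction per statement, and GenProgram then hands the result to the executor. A minimal sketch of that hand-off, assuming the caller owns a std::unique_ptr<SSAGraph> named graph and that the pass was registered under "generate_program_pass" as above:

// Sketch: turning an optimized SSAGraph into a runnable RuntimeProgram.
auto* pass = paddle::lite::mir::PassManager::Global()
                 .LookUp<paddle::lite::mir::GenerateProgramPass>(
                     "generate_program_pass");
CHECK(pass);
pass->Apply(graph);                 // fills insts_, one entry per statement
auto program = pass->GenProgram();  // std::unique_ptr<RuntimeProgram>
// program->Run();  // presumably executed later by the predictor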
diff --git a/lite/core/mir/graph_visualize_pass.cc b/lite/core/mir/graph_visualize_pass.cc
deleted file mode 100644
index f97dbfc7cd..0000000000
--- a/lite/core/mir/graph_visualize_pass.cc
+++ /dev/null
@@ -1,102 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/core/mir/graph_visualize_pass.h"
-#include <map>
-#include <memory>
-#include <set>
-#include <string>
-#include <utility>
-#include "lite/core/mir/pass_registry.h"
-#include "lite/utils/string.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-
-using inference::analysis::Dot;
-
-void GraphVisualizePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
-  Visualize(graph.get());
-}
-
-std::string Visualize(mir::SSAGraph* graph) {
-  inference::analysis::Dot dot;
-
-  int id = 0;
-  std::set<std::string> exists_args;
-  std::map<int, std::string> graph_col;  // Different colors of subgraphs
-  graph_col.insert({{1, "red"},
-                    {2, "green"},
-                    {3, "cyan"},
-                    {4, "bisque3"},
-                    {5, "coral"},
-                    {6, "darkseagreen1"},
-                    {7, "goldenrod1"},
-                    {8, "darkorchid"}});
-  for (auto& node : graph->mutable_nodes()) {
-    std::string key;
-    if (node.IsArg()) {
-      key = node.AsArg().name;
-    } else {
-      key = string_format("%s%d", node.AsStmt().op_type().c_str(), id++);
-    }
-
-    if (node.IsStmt()) {
-      auto& stmt = node.AsStmt();
-      auto sub_id = stmt.subgraph_id();
-      auto it = graph_col.find(sub_id);
-      if (sub_id > 0 && it != graph_col.end()) {
-        dot.AddNode(key,
-                    {Dot::Attr("shape", "box"),
-                     Dot::Attr("style", "filled"),
-                     Dot::Attr("color", "black"),
-                     Dot::Attr("fillcolor", it->second)});
-      } else {
-        dot.AddNode(key,
-                    {Dot::Attr("shape", "box"),
-                     Dot::Attr("style", "filled"),
-                     Dot::Attr("color", "black"),
-                     Dot::Attr("fillcolor", "yellow")});
-      }
-      for (auto& x : node.inlinks) {
-        auto name = x->AsArg().name;
-        if (!exists_args.count(name)) {
-          dot.AddNode(name, {});
-        }
-        dot.AddEdge(name, key, {});
-        exists_args.insert(name);
-      }
-      for (auto& x : node.outlinks) {
-        auto name = x->AsArg().name;
-        if (!exists_args.count(name)) {
-          dot.AddNode(name, {});
-        }
-        dot.AddEdge(key, name, {});
-        exists_args.insert(name);
-      }
-    }
-  }
-
-  auto res = dot.Build();
-  LOG(INFO) << "dot:\n" << res;
-  return res;
-}
-
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
-
-REGISTER_MIR_PASS(graph_visualze, paddle::lite::mir::GraphVisualizePass)
    .SetTargets({TARGET(kAny)});
diff --git a/lite/core/mir/graph_visualize_pass.h b/lite/core/mir/graph_visualize_pass.h
deleted file mode 100644
index bde58a63b3..0000000000
--- a/lite/core/mir/graph_visualize_pass.h
+++ /dev/null
@@ -1,39 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include "lite/core/mir/dot.h"
-#include "lite/core/mir/pass.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-
-/*
- * GraphVisualizePass helps to visualize a MIR graph by exporting a DOT
- * language file.
- */
-class GraphVisualizePass : public DebugPass {
- public:
-  void Apply(const std::unique_ptr<SSAGraph>& graph) override;
-};
-
-std::string Visualize(mir::SSAGraph* graph);
-
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/mir/io_copy_kernel_pick_pass.cc b/lite/core/mir/io_copy_kernel_pick_pass.cc
deleted file mode 100644
index b2ea823e0b..0000000000
--- a/lite/core/mir/io_copy_kernel_pick_pass.cc
+++ /dev/null
@@ -1,75 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/core/mir/pass.h"
-#include "lite/core/mir/pass_registry.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-
-class IoCopyKernelPickPass : public StmtPass {
- public:
-  void Apply(const std::unique_ptr<SSAGraph>& graph) override {
-    for (auto& node : graph->mutable_nodes()) {
-      if (!node.IsStmt()) continue;
-      auto& inst = node.AsStmt();
-      if (inst.op_type() != "io_copy") continue;
-
-      LOG(INFO) << "....> picking an IO copy kernel";
-
-      auto& kernels = node.AsStmt().kernels();
-      CHECK(!kernels.empty()) << "No valid kernels found for IoCopy Op";
-      const auto* inty = node.inlinks.front()->AsArg().type;
-      const auto* outy = node.outlinks.front()->AsArg().type;
-      LOG(INFO) << "input type " << *inty;
-      LOG(INFO) << "output type " << *outy;
-
-      bool is_found = false;
-      LOG(INFO) << "kernels size " << kernels.size();
-      for (auto& kernel : kernels) {
-        CHECK_EQ(node.inlinks.size(), 1UL);
-        CHECK_EQ(node.outlinks.size(), 1UL);
-
-        const Type* in_arg_ty = kernel->GetInputDeclType("Input");
-        const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
-        LOG(INFO) << "checking kernel candidate " << *in_arg_ty << "->"
-                  << *out_arg_ty;
-        if (TargetCompatibleTo(*inty, *in_arg_ty)) {
-          // Both the input and output types match; remove the other kernels
-          // directly.
-          if (TargetCompatibleTo(*outy, *out_arg_ty)) {
-            LOG(INFO) << "got an IoCopy kernel";
-            auto x = std::move(kernel);
-            kernels.clear();
-            kernels.emplace_back(std::move(x));
-            is_found = true;
-            break;
-          }
-        }
-      }
-
-      CHECK(is_found) << "Can't find an IoCopy kernel for IO: " << *inty
-                      << "->" << *outy;
-    }
-  }
-};
-
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
-
-REGISTER_MIR_PASS(io_copy_kernel_pick_pass,
-                  paddle::lite::mir::IoCopyKernelPickPass)
-    .SetTargets({TARGET(kAny)});
diff --git a/lite/core/mir/node.cc b/lite/core/mir/node.cc
deleted file mode 100644
index 61d3d317e7..0000000000
--- a/lite/core/mir/node.cc
+++ /dev/null
@@ -1,74 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/core/mir/node.h"
-#include "lite/core/op_registry.h"
-
-namespace paddle {
-namespace lite {
-
-const OpInfo *mir::Node::Stmt::op_info() const {
-  CHECK(op_);
-  return op_->op_info();
-}
-
-Place mir::Node::Stmt::place() const {
-  CHECK(!valid_kernels_.empty());
-  return valid_kernels_.front()->place();
-}
-
-KernelBase &mir::Node::Stmt::picked_kernel() {
-  CHECK(!valid_kernels_.empty()) << "no kernel for " << op_type();
-  return *valid_kernels_.front();
-}
-
-OpInfo *mir::Node::Stmt::mutable_op_info() {
-  CHECK(op_);
-  return op_->mutable_op_info();
-}
-
-void mir::Node::Stmt::ResetOp(const cpp::OpDesc &op_desc,
-                              const std::vector<Place> &valid_places,
-                              lite::Scope *scope) {
-  CHECK((op_ && op_->scope()) || scope) << "Either scope should be set";
-  lite::Scope *the_scope = scope ? scope : op_->scope();
-  op_->Attach(op_desc, the_scope);
-  // Recreate the kernels with the latest OpInfo.
-  valid_kernels_.clear();
-
-  if (!op_ || op_->op_info()->Type() != op_desc.Type()) {
-    op_ = LiteOpRegistry::Global().Create(op_desc.Type());
-    CHECK(op_) << "No op found for " << op_desc.Type();
-  }
-  valid_kernels_ = op_->CreateKernels(valid_places);
-}
-
-std::ostream &mir::operator<<(std::ostream &os, const mir::Node::Stmt &other) {
-  os << "Statement " << other.op_type() << " " << other.place().DebugString();
-  return os;
-}
-
-mir::Node::Arg &mir::Node::AsArg(const std::string &name, int id) {
-  auto &x = AsArg();
-  x.name = name;
-  x.id = id;
-  return x;
-}
-mir::Node::Arg &mir::Node::AsArg(const std::string &name) {
-  auto &x = AsArg();
-  x.name = name;
-  return x;
-}
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/mir/node.h b/lite/core/mir/node.h
deleted file mode 100644
index 9c7d441ca3..0000000000
--- a/lite/core/mir/node.h
+++ /dev/null
@@ -1,173 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <list>
-#include <memory>
-#include <string>
-#include <utility>
-#include <vector>
-#include "lite/core/kernel.h"
-#include "lite/core/op_lite.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-
-// Node in a MIR graph.
-class Node {
- public:
-  std::list<Node*> inlinks;
-  std::list<Node*> outlinks;
-
-  Node() = default;
-
-  enum class Role {
-    kArg = 0,
-    kStmt,
-    kNumRoles, /*should be last*/
-    kUnk,
-  };
-
-  class Stmt {
-    // The kernel instances this Statement contains.
-    std::vector<std::unique_ptr<KernelBase>> valid_kernels_;
-    // TODO(Superjomn) make this a shared_ptr for resource safety.
-    std::shared_ptr<OpLite> op_;  // we hold op to run InferShape
-
-   public:
-    // Refresh the operator and kernels with the latest OpInfo.
-    void ResetOp(const cpp::OpDesc& op_desc,
-                 const std::vector<Place>& valid_places,
-                 lite::Scope* scope = nullptr);
-
-    std::string op_type() const { return op_info()->Type(); }
-    const OpInfo* op_info() const;
-    OpInfo* mutable_op_info();
-
-    void SetKernels(std::vector<std::unique_ptr<KernelBase>>&& kernels) {
-      valid_kernels_ = std::move(kernels);
-    }
-    std::vector<std::unique_ptr<KernelBase>>& kernels() {
-      return valid_kernels_;
-    }
-
-    void ClearSubgraphID() { subgraph_id_ = -1 /* note: not 0 */; }
-    void SetSubgraphID(int id) { subgraph_id_ = id; }
-    int subgraph_id() const { return subgraph_id_; }
-    void SetOp(const std::shared_ptr<OpLite>& op) { op_ = op; }
-    const std::shared_ptr<OpLite> op() const { return op_; }
-
-    Place place() const;
-
-    KernelBase& picked_kernel();
-
-    friend std::ostream& operator<<(std::ostream& os, const Stmt& other);
-
-    // Description.
-    std::string desc;
-
-   protected:
-    // -1 means not in a subgraph, 0 means supported but no id assigned yet;
-    // real ids start from 1.
-    int subgraph_id_{-1};
-  };
-
-  struct Arg {
-    std::string name;
-    int id{0};
-    const Type* type{};
-    // Weight is a special kind of argument; it is marked as a weight
-    // explicitly so that weight-related optimizations can take place.
-    bool is_weight{false};
-    // is_persist indicates whether the argument is translated from a weight.
-    // If more than one utility operator is needed (e.g. io_copy, layout,
-    // calib), the argument between them should be persisted to make sure the
-    // translation only runs once.
-    bool is_persist{false};
-  };
-
-  Arg& AsArg(const std::string& name, int id);
-
-  Arg& AsArg(const std::string& name);
-
-  Stmt& AsStmt(const std::string& op_type,
-               std::vector<std::unique_ptr<KernelBase>>&& kernels,
-               const std::shared_ptr<OpLite>& op) {
-    auto& x = AsStmt();
-    x.SetOp(op);
-    x.SetKernels(std::move(kernels));
-    return x;
-  }
-
-  Stmt* stmt() const {
-    CHECK(IsStmt());
-    return stmt_.get();
-  }
-
-  Arg* arg() const {
-    CHECK(IsArg());
-    return arg_.get();
-  }
-
-  // Set roles.
-  Arg& AsArg() {
-    if (role_ != Role::kUnk) {
-      CHECK(role_ == Role::kArg);
-      return *arg_;
-    }
-    role_ = Role::kArg;
-    arg_.reset(new Arg);
-    return *arg_;
-  }
-  Stmt& AsStmt() {
-    if (role_ != Role::kUnk) {
-      CHECK(role_ == Role::kStmt);
-      return *stmt_;
-    }
-    role_ = Role::kStmt;
-    stmt_.reset(new Stmt);
-    return *stmt_;
-  }
-
-  friend std::ostream& operator<<(std::ostream& os, Node& other) {
-    os << static_cast<int>(other.role_) << " ";
-    if (!other.IsRoleSet()) {
-      os << "Unk role node";
-    }
-    if (other.IsArg()) {
-      auto& arg = other.AsArg();
-      os << "Argument " << arg.name;
-    }
-    if (other.IsStmt()) {
-      auto& arg = other.AsStmt();
-      os << "Statement " << arg.op_type();
-    }
-    return os;
-  }
-
-  // Check roles.
-  bool IsRoleSet() const { return role_ != Role::kUnk; }
-  bool IsStmt() const { return role_ == Role::kStmt; }
-  bool IsArg() const { return role_ == Role::kArg; }
-
- private:
-  // Either stmt_ or argument_ is used.
-  std::unique_ptr<Stmt> stmt_;
-  std::unique_ptr<Arg> arg_;
-  Role role_{Role::kUnk};
-};
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
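A Node starts in Role::kUnk and is specialized exactly once, so graph builders call AsArg or AsStmt immediately after creating it. A small illustrative sketch of that usage follows; the op, kernels, and valid_places are assumptions standing in for values an SSAGraph under construction would supply.

// Sketch: specializing freshly created nodes (names are illustrative).
paddle::lite::mir::Node weight_node;
auto& arg = weight_node.AsArg("conv_weight", /*id=*/0);
arg.is_weight = true;  // lets weight-specific optimizations find it

paddle::lite::mir::Node op_node;
auto conv_op = paddle::lite::LiteOpRegistry::Global().Create("conv2d");
auto kernels = conv_op->CreateKernels(valid_places);  // valid_places assumed
op_node.AsStmt("conv2d", std::move(kernels), conv_op);

// Wire the argument into the statement; the macro pushes onto both link
// lists, as its definition later in this diff shows.
IR_NODE_LINK_TO(&weight_node, &op_node);
CHECK(weight_node.IsArg() && op_node.IsStmt());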
diff --git a/lite/core/mir/pass.cc b/lite/core/mir/pass.cc
deleted file mode 100644
index 2aaa5a4a17..0000000000
--- a/lite/core/mir/pass.cc
+++ /dev/null
@@ -1,15 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/core/mir/pass.h"
diff --git a/lite/core/mir/pass.h b/lite/core/mir/pass.h
deleted file mode 100644
index cd7684ae32..0000000000
--- a/lite/core/mir/pass.h
+++ /dev/null
@@ -1,88 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <memory>
-#include <set>
-#include <string>
-
-#include "lite/core/mir/node.h"
-#include "lite/core/mir/ssa_graph.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-
-class Pass {
- public:
-  // Note: one pass should be exactly one of the following kinds.
-  enum class Kind {
-    // Will modify the program/graph topology.
-    kProgramWise = 0,
-    // Will modify the statement, with the graph topology fixed.
-    kStmtWise,
-    // Will not modify the IR, just collect information or visualization.
-    kDebug,
-  };
-
-  explicit Pass(Kind kind) : kind_(kind) {}
-
-  virtual void Apply(const std::unique_ptr<SSAGraph>& graph) = 0;
-
-  void set_name(const std::string& name) { name_ = name; }
-  const std::string& name() const { return name_; }
-
-  void set_doc(const std::string& doc) { doc_ = doc; }
-  const std::string& doc() const { return doc_; }
-
-  void set_targets(const std::set<TargetType>& targets) { targets_ = targets; }
-  const std::set<TargetType>& targets() const { return targets_; }
-  bool is_supported_target(TargetType target) const {
-    if (targets_.find(TARGET(kAny)) != targets_.end()) return true;
-    return (targets_.find(target) != targets_.end());
-  }
-
-  Kind kind() const { return kind_; }
-  bool is_debug_pass() const { return kind_ == Kind::kDebug; }
-  bool is_program_pass() const { return kind_ == Kind::kProgramWise; }
-  bool is_stmt_pass() const { return kind_ == Kind::kStmtWise; }
-
-  virtual ~Pass() = default;
-
- private:
-  const Kind kind_;
-  std::string name_;
-  std::string doc_;
-  std::set<TargetType> targets_;
-};
-
-// Different kinds.
-class ProgramPass : public Pass {
- public:
-  ProgramPass() : Pass(Kind::kProgramWise) {}
-};
-
-class StmtPass : public Pass {
- public:
-  StmtPass() : Pass(Kind::kStmtWise) {}
-};
-
-class DebugPass : public Pass {
- public:
-  DebugPass() : Pass(Kind::kDebug) {}
-};
-
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
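Concrete passes derive from one of the three kind-specific bases rather than from Pass directly, so the kind is fixed by construction. A minimal sketch of a statement-level pass follows; the class is hypothetical, and the shape mirrors the IoCopyKernelPickPass deleted earlier in this diff.

// Sketch: a StmtPass that only inspects statements, never the topology.
class CountConvPass : public paddle::lite::mir::StmtPass {
 public:
  void Apply(
      const std::unique_ptr<paddle::lite::mir::SSAGraph>& graph) override {
    int n = 0;
    for (auto& node : graph->mutable_nodes()) {
      if (node.IsStmt() && node.AsStmt().op_type() == "conv2d") ++n;
    }
    LOG(INFO) << "conv2d statements: " << n;
  }
};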
diff --git a/lite/core/mir/pass_manager.cc b/lite/core/mir/pass_manager.cc
deleted file mode 100644
index 17f81b3bdd..0000000000
--- a/lite/core/mir/pass_manager.cc
+++ /dev/null
@@ -1,21 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/core/mir/pass_manager.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/mir/pass_manager.h b/lite/core/mir/pass_manager.h
deleted file mode 100644
index ca40f2deca..0000000000
--- a/lite/core/mir/pass_manager.h
+++ /dev/null
@@ -1,87 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <list>
-#include <map>
-#include <memory>
-#include <string>
-#include "lite/core/mir/pass.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-
-class PassManager {
- public:
-  static PassManager& Global() {
-    static PassManager x;
-    return x;
-  }
-
-  PassManager() {}
-
-  void Run(const std::unique_ptr<SSAGraph>& graph) {
-    for (auto& pass : passes_) {
-      LOG(INFO) << "Running MIR pass " << pass->name();
-      pass->Apply(graph);
-    }
-  }
-
-  bool AddNewPass(const std::string& name, Pass* pass) {
-    passes_.emplace_back(pass);
-    pass_map_.emplace(name, passes_.back().get());
-    passes_.back()->set_name(name);
-    return true;
-  }
-
-  // Clear all the passes.
-  void Clear() { passes_.clear(); }
-
-  std::list<std::unique_ptr<mir::Pass>>::iterator passes_begin() {
-    return passes_.begin();
-  }
-  std::list<std::unique_ptr<mir::Pass>>::iterator passes_end() {
-    return passes_.end();
-  }
-  std::list<std::unique_ptr<mir::Pass>>::const_iterator passes_const_begin()
-      const {
-    return passes_.begin();
-  }
-  std::list<std::unique_ptr<mir::Pass>>::const_iterator passes_const_end()
-      const {
-    return passes_.end();
-  }
-
-  Pass* LookUp(const std::string& key) {
-    auto it = pass_map_.find(key);
-    if (it != pass_map_.end()) return it->second;
-    return nullptr;
-  }
-
-  template <typename PassTy>
-  PassTy* LookUp(const std::string& key) {
-    auto it = pass_map_.find(key);
-    if (it != pass_map_.end()) return dynamic_cast<PassTy*>(it->second);
-    return nullptr;
-  }
-
- private:
-  std::list<std::unique_ptr<mir::Pass>> passes_;
-  std::map<std::string, mir::Pass*> pass_map_;
-};
-
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
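The singleton is typically consulted in two ways: running every registered pass over a graph, or fetching one pass by its registration name, optionally downcast to its concrete type. Both are sketched below against the API above; the pass name used is only an example, and graph is assumed to be a std::unique_ptr<SSAGraph>.

// Run the whole registered pipeline over a graph.
paddle::lite::mir::PassManager::Global().Run(graph);

// Or fetch one pass by name, with an optional checked downcast.
auto* base =
    paddle::lite::mir::PassManager::Global().LookUp("io_copy_kernel_pick_pass");
auto* typed = paddle::lite::mir::PassManager::Global()
                  .LookUp<paddle::lite::mir::StmtPass>("io_copy_kernel_pick_pass");
if (typed != nullptr) typed->Apply(graph);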
diff --git a/lite/core/mir/pass_manager_test.cc b/lite/core/mir/pass_manager_test.cc
deleted file mode 100644
index 05e11ed5d1..0000000000
--- a/lite/core/mir/pass_manager_test.cc
+++ /dev/null
@@ -1,33 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/core/mir/pass_manager.h"
-#include <gtest/gtest.h>
-#include "lite/core/mir/pass_registry.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-
-TEST(PassManager, test) {
-  auto* pass = PassManager::Global().LookUp("demo");
-  LOG(INFO) << "pass: " << pass;
-  ASSERT_TRUE(pass != nullptr);
-}
-
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
-
-USE_MIR_PASS(demo);
diff --git a/lite/core/mir/pass_registry.cc b/lite/core/mir/pass_registry.cc
deleted file mode 100644
index e80db5d4ca..0000000000
--- a/lite/core/mir/pass_registry.cc
+++ /dev/null
@@ -1,21 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/core/mir/pass_registry.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/mir/pass_registry.h b/lite/core/mir/pass_registry.h
deleted file mode 100644
index cc5c119ecb..0000000000
--- a/lite/core/mir/pass_registry.h
+++ /dev/null
@@ -1,55 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <set>
-#include <string>
-#include "lite/api/paddle_lite_factory_helper.h"
-#include "lite/api/paddle_place.h"
-#include "lite/core/mir/pass_manager.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-
-class PassRegistry {
- public:
-  PassRegistry(const std::string& name, mir::Pass* pass)
-      : name_(name), pass_(pass) {
-    PassManager::Global().AddNewPass(name_, pass_);
-  }
-  PassRegistry& SetTargets(const std::set<TargetType>& targets) {
-    pass_->set_targets(targets);
-    return *this;
-  }
-  bool Touch() const { return true; }
-
- private:
-  std::string name_;
-  mir::Pass* pass_;
-};
-
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
-
-#define REGISTER_MIR_PASS(name__, class__)                                \
-  paddle::lite::mir::PassRegistry mir_pass_registry##name__(#name__,      \
-                                                            new class__); \
-  bool mir_pass_registry##name__##_fake() {                               \
-    return mir_pass_registry##name__.Touch();                             \
-  }                                                                       \
-  static paddle::lite::mir::PassRegistry mir_pass_registry_func_##name__  \
-      __attribute__((unused)) = mir_pass_registry##name__
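The macro expands to a file-local PassRegistry whose constructor registers the pass with the global PassManager; the Touch/fake-function pair exists so a binary can force the otherwise unreferenced registration object to be linked in. Usage on both sides of the link boundary looks roughly like this (MyDemoPass is hypothetical; USE_MIR_PASS is the companion macro from paddle_lite_factory_helper.h, used as USE_MIR_PASS(demo) in the test above):

// In the pass's .cc file: define and register it.
REGISTER_MIR_PASS(my_demo_pass, paddle::lite::mir::MyDemoPass)
    .SetTargets({TARGET(kAny)});

// In a binary that links the pass statically: pull the symbol in,
// otherwise the linker may drop the unreferenced registry object.
USE_MIR_PASS(my_demo_pass);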
diff --git a/lite/core/mir/pattern_matcher.cc b/lite/core/mir/pattern_matcher.cc
deleted file mode 100644
index 8ec85a4ef1..0000000000
--- a/lite/core/mir/pattern_matcher.cc
+++ /dev/null
@@ -1,528 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <algorithm>
-#include <array>
-#include <string>
-#include <vector>
-
-#include "lite/core/mir/dot.h"
-#include "lite/core/mir/pattern_matcher.h"
-#include "lite/core/op_lite.h"
-#include "lite/utils/string.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-
-size_t PMPattern::id_ = 0UL;
-
-PMNode &PMNode::operator>>(PMNode &right) {
-  pattern_->AddEdge(this, &right);
-  // automatically add out op link relation.
-  if (right.IsOp()) {
-    CHECK(!right.op_type_.empty());
-    this->assert_is_op_input(right.op_type_);
-  }
-
-  return right;
-}
-
-PMNode &PMNode::operator>>(std::vector<PMNode *> &nodes) {
-  for (auto *node : nodes) {
-    *this >> *node;
-  }
-  return *this;
-}
-
-PMNode &operator>>(std::vector<PMNode *> &others, PMNode &me) {
-  for (auto *o : others) {
-    *o >> me;
-  }
-  return me;
-}
-
-PMNode *PMPattern::NewNode(const std::string &name) {
-  if (!name.empty()) {
-    CHECK_EQ(node_map_.count(name), 0UL)
-        << "PMNode's name should be unique, got duplicate " << name;
-  }
-
-  nodes_.emplace_back(new PMNode(this, name));
-  auto *cur = nodes_.back().get();
-  node_map_[name] = cur;
-  return cur;
-}
-
-PMNode *PMPattern::NewNode(PMNode::teller_t &&teller, const std::string &name) {
-  if (!name.empty()) {
-    CHECK_EQ(node_map_.count(name), 0UL)
-        << "PMNode's name should be unique, got duplicate " << name;
-  }
-
-  nodes_.emplace_back(new PMNode(std::move(teller), this, name));
-  auto *cur = nodes_.back().get();
-  node_map_[name] = cur;
-  return cur;
-}
-
-PMNode *PMPattern::RetrieveNode(const std::string &id) const {
-  auto it = node_map_.find(id);
-  if (it == node_map_.end()) {
-    return nullptr;
-  }
-
-  return it->second;
-}
-
-void PMPattern::AddEdge(PMNode *a, PMNode *b) {
-  CHECK(a);
-  CHECK(b);
-  CHECK_NE(a, b) << "Can't connect to the same nodes.";
-  edges_.emplace_back(a, b);
-}
-
-void PatternMatcher::operator()(SSAGraph *graph,
-                                PatternMatcher::handle_t handler) {
-  if (!MarkPMNodesInGraph(graph)) {
-    return;
-  }
-
-  auto subgraphs = DetectPatterns();
-  UniquePatterns(&subgraphs);
-  RemoveOverlappedMatch(&subgraphs);
-  ValidateByNodeRole(&subgraphs);
-
-  if (subgraphs.empty()) return;
-  LOG(INFO) << "detected " << subgraphs.size() << " subgraphs";
-  int id = 0;
-  for (auto &g : subgraphs) {
-    VLOG(3) << "optimizing #" << id++ << " subgraph";
-    handler(g, graph);
-  }
-}
-
-bool PatternMatcher::MarkPMNodesInGraph(SSAGraph *graph) {
-  VLOG(3) << "mark pmnodes in graph";
-  if (graph->nodes().empty()) return false;
-  for (auto &node : graph->mutable_nodes()) {
-    for (const auto &pmnode : pattern_.nodes()) {
-      if (pmnode->Tell(&node)) {
-        pmnodes2nodes_[pmnode.get()].insert(&node);
-      }
-    }
-  }
-  // Check to early stop if some PMNode can't find a matched Node.
-  for (auto &pmnode : pattern_.nodes()) {
-    if (!pmnodes2nodes_.count(pmnode.get())) {
-      VLOG(4) << pmnode->name() << " can't find matched Node, early stop";
-      // return false;
-    }
-  }
-  VLOG(3) << pmnodes2nodes_.size() << " nodes marked";
-
-  return !pmnodes2nodes_.empty();
-}
-
-// The intermediate Nodes can only link to the nodes inside the pattern, or
-// this subgraph will be dropped.
-void PatternMatcher::ValidateByNodeRole(
-    std::vector<PatternMatcher::subgraph_t> *subgraphs) {
-  std::vector<PatternMatcher::subgraph_t> result;
-
-  subgraphs->erase(
-      std::remove_if(subgraphs->begin(),
-                     subgraphs->end(),
-                     [](const PatternMatcher::subgraph_t &subgraph) -> bool {
-                       // Collect the inlinks and outlinks.
-                       std::unordered_set<Node *> ios;
-                       for (auto &item : subgraph) {
-                         ios.insert(item.second);
-                       }
-                       for (auto &item : subgraph) {
-                         if (item.first->IsIntermediate()) {
-                           for (auto *x : item.second->inlinks) {
-                             if (!ios.count(x)) {
-                               return true;
-                             }
-                           }
-                           for (auto *x : item.second->outlinks) {
-                             if (!ios.count(x)) {
-                               return true;
-                             }
-                           }
-                         }
-                       }
-                       return false;
-                     }),
-      subgraphs->end());
-}
-
-struct HitGroup {
-  std::unordered_map<PMNode *, Node *> roles;
-
-  bool Match(Node *node, PMNode *pat) {
-    if (nodes_.count(node)) {
-      if (roles.count(pat) && roles[pat] == node) return true;
-      return false;
-    } else {
-      if (roles.count(pat) && roles[pat] != node) return false;
-      return true;
-    }
-  }
-
-  void Register(Node *node, PMNode *pat) {
-    roles[pat] = node;
-    nodes_.insert(node);
-  }
-
- private:
-  std::unordered_set<Node *> nodes_;
-};
-
-// Tell whether Node a links to b.
-bool IsNodesLink(Node *a, Node *b) {
-  for (auto *node : a->outlinks) {
-    if (b == node) {
-      return true;
-    }
-  }
-  return false;
-}
-
-std::vector<PatternMatcher::subgraph_t> PatternMatcher::DetectPatterns() {
-  // Init empty subgraphs.
-  std::vector<PatternMatcher::subgraph_t> result;
-  std::vector<HitGroup> init_groups;
-  std::array<std::vector<HitGroup>, 2> bi_records;
-  auto *first_pnode = pattern_.edges().empty() ? pattern().nodes().front().get()
-                                               : pattern_.edges().front().first;
-  if (!pmnodes2nodes_.count(first_pnode)) return result;
-  for (auto *node : pmnodes2nodes_[first_pnode]) {
-    HitGroup group;
-    group.roles[first_pnode] = node;
-    init_groups.emplace_back(group);
-  }
-
-  int step = 0;
-  bi_records[0] = std::move(init_groups);
-
-  // Extend a PMNode to subgraphs by deducing the connection relations defined
-  // in edges of PMNodes.
-  for (const auto &edge : pattern_.edges()) {
-    VLOG(4) << "check " << edge.first->name() << " -> " << edge.second->name();
-    // TODO(Superjomn) Fix bug here, the groups might be duplicate here.
-    // Each role has two PMNodes, which indicates two roles.
-    // Detect two Nodes that can match these two roles and they are connected.
-    auto &pre_groups = bi_records[step % 2];
-    auto &cur_groups = bi_records[1 - (step++ % 2)];
-    cur_groups.clear();
-    if (pre_groups.empty()) break;
-    // source -> target
-    for (Node *source : pmnodes2nodes_[edge.first]) {
-      for (Node *target : pmnodes2nodes_[edge.second]) {
-        // TODO(Superjomn) add some prune strategies.
-        for (const auto &group : pre_groups) {
-          if (IsNodesLink(source, target)) {
-            HitGroup new_group = group;
-            bool flag = new_group.Match(source, edge.first) &&
-                        new_group.Match(target, edge.second);
-            if (flag) {
-              new_group.Register(source, edge.first);
-              new_group.Register(target, edge.second);
-              cur_groups.push_back(new_group);
-              // TODO(Superjomn) need to unique
-            }
-          }
-        }
-      }
-    }
-    VLOG(3) << "step " << step << " get records: " << cur_groups.size();
-  }
-
-  for (auto &group : bi_records[step % 2]) {
-    PatternMatcher::subgraph_t subgraph;
-    for (auto &role : group.roles) {
-      subgraph.emplace(role.first, role.second);
-    }
-    result.emplace_back(subgraph);
-  }
-  return result;
-}
-
-struct GraphItemLessThan {
-  bool operator()(const std::pair<PMNode *, Node *> &a,
-                  const std::pair<PMNode *, Node *> &b) {
-    if (a.first != b.first) {
-      return a.first < b.first;
-    } else {
-      return a.second < b.second;
-    }
-  }
-};
-
-// TODO(Superjomn) enhance the function as it marks unique subgraphs as
-// duplicates, see https://github.com/PaddlePaddle/Paddle/issues/13550
-void PatternMatcher::UniquePatterns(
-    std::vector<PatternMatcher::subgraph_t> *subgraphs) {
-  if (subgraphs->empty()) return;
-  std::vector<PatternMatcher::subgraph_t> result;
-
-  std::unordered_set<size_t> set;
-  std::hash<std::string> hasher;
-  for (auto &g : *subgraphs) {
-    // Sort the items in the sub-graph, and transform to a string key.
-    std::vector<std::pair<PMNode *, Node *>> sorted_keys(g.begin(), g.end());
-    std::sort(sorted_keys.begin(), sorted_keys.end(), GraphItemLessThan());
-    STL::stringstream ss;
-    for (auto &item : sorted_keys) {
-      ss << reinterpret_cast<size_t>(item.first) << ":"
-         << reinterpret_cast<size_t>(item.second);
-    }
-    auto key = hasher(ss.str());
-    if (!set.count(key)) {
-      result.emplace_back(g);
-      set.insert(key);
-    }
-  }
-  *subgraphs = result;
-}
-
-void PatternMatcher::RemoveOverlappedMatch(
-    std::vector<subgraph_t> *subgraphs) {
-  std::vector<subgraph_t> result;
-  std::unordered_set<Node *> node_set;
-
-  for (const auto &subgraph : *subgraphs) {
-    bool valid = true;
-    for (auto &item : subgraph) {
-      if (item.first->IsIntermediate() && node_set.count(item.second)) {
-        valid = false;
-        break;
-      }
-    }
-    if (valid) {
-      for (auto &item : subgraph) {
-        node_set.insert(item.second);
-      }
-      result.push_back(subgraph);
-    }
-  }
-  *subgraphs = result;
-}
-
-std::string PMPattern::DotString() const {
-  using inference::analysis::Dot;
-  Dot dot;
-  int id = 0;
-  // Create Nodes
-  std::unordered_map<PMNode *, std::string> node2dot;
-  for (const auto &node : nodes()) {
-    std::string node_id = string_format("Node%d", id++);
-    dot.AddNode(node_id, {}, node->name());
-    node2dot[node.get()] = node_id;
-  }
-  // Create Edges
-  for (const auto &edge : edges()) {
-    if (!node2dot.count(edge.first) || !node2dot.count(edge.second)) {
-      continue;
-    }
-    auto &src = node2dot.at(edge.first);
-    auto &trg = node2dot.at(edge.second);
-    dot.AddEdge(src, trg, {});
-  }
-  return dot.Build();
-}
-
-PMNode &PMNode::LinksTo(const std::vector<PMNode *> &others) {
-  // extend outlinks.
-  for (PMNode *x : others) {
-    pattern_->AddEdge(this, x);
-  }
-  return *this;
-}
-
-PMNode &PMNode::LinksFrom(const std::vector<PMNode *> &others) {
-  // extend inlinks.
-  for (PMNode *x : others) {
-    pattern_->AddEdge(x, this);
-  }
-  return *this;
-}
-
-PMNode *PMNode::assert_is_op() {
-  asserts_.emplace_back([](const Node *x) { return x && x->IsStmt(); });
-  return this;
-}
-
-PMNode *PMNode::assert_is_op(const std::string &op_type) {
-  asserts_.emplace_back([op_type](const Node *x) {
-    if (x && x->IsStmt()) {
-      auto *op_info = x->stmt()->op_info();
-      return op_info->Type() == op_type;
-    } else {
-      return false;
-    }
-  });
-  return this;
-}
-
-PMNode *PMNode::assert_is_var() {
-  asserts_.emplace_back([](const Node *x) { return x && x->IsArg(); });
-  return this;
-}
-
-PMNode *PMNode::assert_var_not_persistable() {
-  assert_is_var();
-  asserts_.emplace_back([](const Node *x) { return !x->arg()->is_weight; });
-  return this;
-}
-
-PMNode *PMNode::assert_is_persistable_var() {
-  assert_is_var();
-  asserts_.emplace_back([=](const Node *x) { return x->arg()->is_weight; });
-  return this;
-}
-
-PMNode *PMNode::assert_is_op_output(const std::string &op_type) {
-  assert_is_var();
-  asserts_.emplace_back([=](const Node *x) {
-    for (auto *op : x->inlinks) {
-      if (op && op->IsStmt()) {
-        auto *op_info = op->stmt()->op_info();
-        if (op_info->Type() == op_type) return true;
-      }
-    }
-    return false;
-  });
-  return this;
-}
-
-bool IsNthOutput(const Node *var,
-                 const Node *op,
-                 const std::string &argument,
-                 size_t nth) {
-  CHECK(var->IsArg());
-  CHECK(op->IsStmt());
-  auto op_info = op->stmt()->op_info();
-  if (op_info->Output(argument).size() <= nth) return false;
-  return var->arg()->name == op_info->Output(argument)[nth];
-}
-
-bool IsNthInput(const Node *var,
-                const Node *op,
-                const std::string &argument,
-                size_t nth) {
-  CHECK(var->IsArg());
-  CHECK(op->IsStmt());
-  auto op_info = op->stmt()->op_info();
-  if (op_info->Input(argument).size() <= nth) return false;
-  return var->arg()->name == op_info->Input(argument)[nth];
-}
-
-PMNode *PMNode::assert_is_op_input(const std::string &op_type,
-                                   const std::string &argument) {
-  assert_is_var();
-  assert_is_op_nth_input(op_type, argument, 0);
-  return this;
-}
-
-PMNode *PMNode::assert_is_op_nth_input(const std::string &op_type,
-                                       const std::string &argument,
-                                       int nth) {
-  assert_is_var();
-  assert_is_op_input(op_type);
-  asserts_.emplace_back([=](const Node *x) {
-    for (auto *op : x->outlinks) {
-      if (op && op->IsStmt() && op->stmt()->op_info()->Type() == op_type &&
-          IsNthInput(x, op, argument, nth))
-        return true;
-    }
-    return false;
-  });
-  return this;
-}
-
-PMNode *PMNode::assert_is_op_output(const std::string &op_type,
-                                    const std::string &argument) {
-  assert_is_var();
-  assert_is_op_nth_output(op_type, argument, 0);
-  return this;
-}
-
-PMNode *PMNode::assert_is_op_nth_output(const std::string &op_type,
-                                        const std::string &argument,
-                                        int nth) {
-  assert_is_var();
-  asserts_.emplace_back([=](const Node *x) {
-    for (auto *op : x->inlinks) {
-      if (op && op->IsStmt() && op->stmt()->op_info()->Type() == op_type &&
-          IsNthOutput(x, op, argument, nth))
-        return true;
-    }
-    return false;
-  });
-  return this;
-}
-
-PMNode *PMNode::assert_is_op_input(const std::string &op_type) {
-  assert_is_var();
-  asserts_.emplace_back([=](const Node *x) {
-    for (auto *op : x->outlinks) {
-      if (op && op->IsStmt()) {
-        auto *op_info = op->stmt()->op_info();
-        if (op_info->Type() == op_type) {
-          return true;
-        }
-      }
-    }
-    return false;
-  });
-  return this;
-}
-
-bool HasInput(const Node &op, const std::string &argument) {
-  CHECK(op.IsStmt());
-  auto const &names = op.stmt()->op_info()->input_argnames();
-  if (std::find(names.begin(), names.end(), argument) == names.end())
-    return false;
-  return true;
-}
-
-void GraphSafeRemoveNodes(SSAGraph *graph,
-                          const std::unordered_set<const Node *> &nodes) {
-  for (auto *node : nodes) {
-    graph->RemoveNode(node);
-  }
-
-  for (auto &node : graph->mutable_nodes()) {
-    for (auto it = node.inlinks.begin(); it != node.inlinks.end();) {
-      if (nodes.count(*it)) {
-        it = node.inlinks.erase(it);
-      } else {
-        it++;
-      }
-    }
-    for (auto it = node.outlinks.begin(); it != node.outlinks.end();) {
-      if (nodes.count(*it)) {
-        it = node.outlinks.erase(it);
-      } else {
-        it++;
-      }
-    }
-  }
-}
-
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
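The header that follows documents the PMNode/PMPattern vocabulary. As a concrete illustration of the matcher implemented above, the mul + elementwise_add example from that header's own comment block could be spelled out against the public API roughly as below; this is a sketch, not code from this patch, and graph is assumed to be a std::unique_ptr<SSAGraph>.

// Sketch: detecting mul -> elementwise_add with the matcher API.
paddle::lite::mir::PatternMatcher matcher;
auto* pattern = matcher.mutable_pattern();

auto* mul = pattern->NewNode("mul")->assert_is_op("mul");
auto* mul_out = pattern->NewNode("mul_out")
                    ->assert_is_op_output("mul")
                    ->assert_is_op_input("elementwise_add")
                    ->AsIntermediate();
auto* add = pattern->NewNode("add")->assert_is_op("elementwise_add");

mul->LinksTo({mul_out});
mul_out->LinksTo({add});

matcher(graph.get(),
        [](const paddle::lite::mir::PatternMatcher::subgraph_t& subgraph,
           paddle::lite::mir::SSAGraph* g) {
          // Called once per matched subgraph; rewrite or inspect it here.
        });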
- PMNode* AsInput() { - role_ = Role::kInput; - return this; - } - // Mark this node is an Output of a subgraph and will be retained. - PMNode* AsOutput() { - role_ = Role::kOutput; - return this; - } - // Mark this node will be removed, so all the links should be inside a matched - // sub-graph. - PMNode* AsIntermediate() { - role_ = Role::kIntermediate; - return this; - } - - PMNode* AsVar() { - type_ = Type::kVar; - assert_is_var(); - return this; - } - - PMNode* AsOp(const std::string& op_type) { - type_ = Type::kOp; - assert_is_op(op_type); - return this; - } - - void set_op_type(const std::string& op_type) { op_type_ = op_type; } - - bool IsIntermediate() const { return role_ == Role::kIntermediate; } - bool IsInput() const { return role_ == Role::kInput; } - bool IsOutput() const { return role_ == Role::kOutput; } - - // Assertions, helper functions to simplify the pattern definition. - PMNode* assert_is_op(); - PMNode* assert_is_op(const std::string& op_type); - PMNode* assert_is_var(); - PMNode* assert_var_not_persistable(); - PMNode* assert_is_persistable_var(); - PMNode* assert_is_op_output(const std::string& op_type); - PMNode* assert_is_op_input(const std::string& op_type); - PMNode* assert_is_op_input(const std::string& op_type, - const std::string& argument); - PMNode* assert_is_op_output(const std::string& op_type, - const std::string& argument); - - PMNode* assert_is_op_nth_input(const std::string& op_type, - const std::string& argument, - int nth); - PMNode* assert_is_op_nth_output(const std::string& op_type, - const std::string& argument, - int nth); - - template - PMNode* assert_op_attr_satisfied( - const std::string& attr_name, - const std::function& condition) { - asserts_.push_back([=](const Node* x) { - if (x && x->IsStmt()) { - auto* op_info = x->stmt()->op_info(); - return op_info->HasAttr(attr_name) && - condition(op_info->GetAttr(attr_name)); - } - return false; - }); - return this; - } - - template - PMNode* assert_op_attr(const std::string& attr_name, const T& attr) { - return assert_op_attr_satisfied( - attr_name, [=](const T& src) { return src == attr; }); - } - - private: - PMNode(PMPattern* pattern, - const std::string& name = "", - Type type = Type::kVar) - : pattern_(pattern), name_(name), type_(type) {} - PMNode(teller_t&& teller, - PMPattern* pattern, - const std::string& name = "", - Type type = Type::kVar) - : teller_(std::move(teller)), - pattern_(pattern), - name_(name), - type_(type) { - CHECK(teller_ != nullptr) << "invalid teller functer is set."; - } - - PMNode(PMNode&& other) = default; - - friend class PMPattern; - - // Will removed latter. - teller_t teller_; - std::vector asserts_; - PMPattern* pattern_; - std::string name_; - std::string op_type_; - Type type_; - Role role_{Role::kUnknown}; -}; - -/* - * A pattern in a graph, which defined with PMNode and edges. Most graph - * patterns can be divided into PMNodes and link relations between them. - * - * For example, the FC fusion need to filter the MUL and ELEMENTWISE_ADD - * operators from the computation graph, the MUL's output should have only one - * consumer which is the ELEMENTWISE_ADD. - * This pattern can be defined as with the following pseudo codes - * - * // Create two operator PMNodes. - * MUL = PMPattern.NewNode().assert_is_op("mul"); - * ELE = PMPattern.NewNode().assert_is_op("elementwise_add"); - * // Create the variable PMNodes. 
- * MUL_out = PMPattern.NewNode().assert_is_op_output("mul") \ - * .assert_is_op_input("elementwise_add") \ - * .AsIntermediate(); - * // Add relations. - * MUL->LinksTo({MUL_out}); - * MUL_out->LinksTo({ELE}); - * - * One can add more specific asserts for PMNodes or edges, both the Operator - * and Variable Nodes can be ruled in PMNode.assert_more(...). - * - * PMPattern can record the general patterns, such as the pattern represents - * - Op in CPU -> Op in GPU -> Op in CPU, to findout the IO abnormal place. - * - Ops whose inputs and outputs share the same variables - */ -class PMPattern { - public: - using edge_t = std::pair; - - void AddEdge(PMNode* a, PMNode* b); - - PMNode* NewNode(PMNode::teller_t&& teller, const std::string& name = NewID()); - PMNode* NewNode(const std::string& name = NewID()); - PMNode* NewNode(const std::string& prefix, const std::string& name) { - return NewNode(prefix + "/" + name); - } - PMNode* RetrieveNode(const std::string& id) const; - - const std::vector>& nodes() const { return nodes_; } - const std::vector& edges() const { return edges_; } - - std::string DotString() const; - - private: -#ifdef PADDLE_WITH_TESTING - FRIEND_TEST(PMPattern, AddEdge); - FRIEND_TEST(PMPattern, NewNode); -#endif - - static std::string NewID() { return string_format("pmnode-%d", id_++); } - - std::vector> nodes_; - std::vector edges_; - std::unordered_map node_map_; - static size_t id_; -}; - -/* - * PatternMatcher helps to detect the specific patterns in the graph. - * Input a pattern, output a list of the matched subgraphs/nodes. - * This helper can be used to support fuse(conv+batchnorm => batchnorm e.g.). - * - * The algorithm has three phases: - * 1. Mark the nodes that match the defined PMNodes in a PMPattern, - * 2. Extend a PMNode to subgraphs by deducing the connection relation defined - * in PAPattern(the edges), - * 3. Get the filtered subgraphs and treat them with a pre-defined handler. - * - * Usage: - * // Create a matcher - * PatternMatcher matcher; - * // Define the matcher's pattern, by adding PMNode and define the edges. - * auto* node0 = matcher.mutable_pattern().AddNode(...) - * auto* node1 = matcher.mutable_pattern().AddNode(...) - * node0->teller = some lambda. - * node1->teller = some lambda. - * matcher.mutable_pattern().AddEdge(node0, node1); - * // Create an handler, to define the behavior of treating the filtered - * // subgraphs that comply with the patterns. - * PatternMatcher::handle_t handler = some labmda - * // Execute the matcher. - * matcher(&graph, handler); - */ -class PatternMatcher { - public: - using subgraph_t = std::unordered_map; - - // Operate on the detected pattern. - using handle_t = - std::function; - - void operator()(SSAGraph* graph, handle_t handler); - - const PMPattern& pattern() const { return pattern_; } - PMPattern* mutable_pattern() { return &pattern_; } - - private: - // Mark the nodes that fits the pattern. - bool MarkPMNodesInGraph(SSAGraph* graph); - - // Detect all the pattern and output the hit records. - std::vector DetectPatterns(); - - // Remove duplicate patterns. - void UniquePatterns(std::vector* subgraphs); - - // Remove overlapped match subgraphs, when overlapped, keep the previous one. - // The intermediate PMNodes will be removed, so can't shared by multiple - // patterns. - void RemoveOverlappedMatch(std::vector* subgraphs); - - // Validate whether the intermediate nodes are linked by external nodes. 
-
-// Check whether a var node is an op node's nth input.
-bool IsNthInput(const Node& var,
-                const Node& op,
-                const std::string& argument,
-                int nth);
-
-// Check whether the op node has an input with the given name.
-bool HasInput(const Node& op, const std::string& argument);
-
-// Safely remove some nodes from the graph; the edges are cleaned up
-// automatically.
-void GraphSafeRemoveNodes(SSAGraph* graph,
-                          const std::unordered_set<const Node*>& nodes);
-
-// Some pre-defined patterns that can be reused in multiple passes.
-// The related Fluid Layer or Op should be one pattern here for better reuse
-// across different fusions.
-namespace patterns {
-
-struct KeyCounter {
-  static KeyCounter& Instance() {
-    static KeyCounter x;
-    return x;
-  }
-
-  int IncCounter(const std::string& key) { return dic_[key]++; }
-
- private:
-  std::unordered_map<std::string, int> dic_;
-};
-
-// Generate a unique PMNode name with name_scope and id.
-// The format is {name_scope}/{repr}/{id}/{name}.
-static std::string PMNodeName(const std::string& name_scope,
-                              const std::string& repr,
-                              size_t id,
-                              const std::string& name) {
-  STL::stringstream ss;
-  ss << name_scope << "/" << repr << "/" << id << "/" << name;
-  return ss.str();
-}
-// Generate a unique PMNode name.
-// The format is {name_scope}/{repr}/{id}.
-static std::string PMNodeName(const std::string& name_scope,
-                              const std::string& repr) {
-  STL::stringstream ss;
-  ss << name_scope << "/" << repr << "/"
-     << KeyCounter::Instance().IncCounter(repr);
-  return ss.str();
-}
-// Generate a unique key. It can be used as a universally unique temporary
-// name.
-// The format is {repr}/{id}.
-static std::string UniqueKey(const std::string& repr) {
-  STL::stringstream ss;
-  ss << repr << "/" << KeyCounter::Instance().IncCounter(repr);
-  return ss.str();
-}
-
-// Declare a PMNode in a pattern; this creates two methods:
-//   std::string xxx_repr();  returns this PMNode's string id.
-//   PMNode* xxx_n();         returns the corresponding PMNode.
-#define PATTERN_DECL_NODE(name__)                                             \
-  std::string name__##_repr() const {                                        \
-    return PMNodeName(name_scope_, repr_, id_, #name__);                     \
-  }                                                                          \
-  PMNode* name__##_n() const { return pattern->RetrieveNode(name__##_repr()); }
-
-// Get a mir::Node* from the matched subgraph.
-// var: variable.
-// arg: the argument declared by PATTERN_DECL_NODE in a pattern definition.
-// pat: the pattern object.
-#define GET_IR_NODE_FROM_SUBGRAPH(var, arg, pat)                              \
-  CHECK(subgraph.count(pat.arg##_n()))                                       \
-      << "Node not found for PMNode " << pat.arg##_repr();                   \
-  Node* var = subgraph.at(pat.arg##_n());                                    \
-  CHECK(var) << "node " << #arg << " does not exist in the sub-graph"
-
-// The base class of all the patterns.
-struct PatternBase {
-  PatternBase(PMPattern* pattern,
-              const std::string& name_scope,
-              const std::string& repr)
-      : pattern(pattern),
-        name_scope_(name_scope),
-        repr_(repr),
-        id_(KeyCounter::Instance().IncCounter(repr)) {}
-
-  PMPattern* pattern;
-
- protected:
-  std::string name_scope_;
-  std::string repr_;
-  size_t id_;
-};
-
-}  // namespace patterns
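[Editor's note] To make the macro pair concrete, a hedged sketch of a reusable pattern struct; FcPattern and its members are hypothetical, only PatternBase and the macros above are assumed:

    struct FcPattern : public PatternBase {
      FcPattern(PMPattern* pattern, const std::string& name_scope)
          : PatternBase(pattern, name_scope, "fc") {}

      // Each line expands to mul_repr()/mul_n() etc., so the PMNode can be
      // retrieved later by its generated unique name.
      PATTERN_DECL_NODE(mul);
      PATTERN_DECL_NODE(mul_out);
    };

    // Inside a matcher handler with a `subgraph` in scope,
    //   GET_IR_NODE_FROM_SUBGRAPH(mul_node, mul, fc_pattern);
    // binds mul_node to the mir::Node* matched for fc_pattern.mul_n().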
-
-// Link two mir::Nodes to each other.
-#define IR_NODE_LINK_TO(a, b) \
-  a->outlinks.push_back(b);   \
-  b->inlinks.push_back(a);
-
-// Set the out_var as the output of the op.
-#define IR_OP_VAR_LINK(op, out_var) \
-  op->outlinks.push_back(out_var);  \
-  out_var->inlinks.clear();         \
-  out_var->inlinks.push_back(op);
-
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/mir/pattern_matcher_high_api.cc b/lite/core/mir/pattern_matcher_high_api.cc
deleted file mode 100644
index 620f4ebbea..0000000000
--- a/lite/core/mir/pattern_matcher_high_api.cc
+++ /dev/null
@@ -1,80 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/core/mir/pattern_matcher_high_api.h"
-#include "lite/utils/cp_logging.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-
-void FuseBase::PerformPatternMatcher(SSAGraph *graph) {
-  VLOG(4) << "\n" << matcher_.pattern().DotString();
-  // Get the subgraphs and record the mir::Node pointers for each PMNode.
-  auto handler = [&](const PatternMatcher::subgraph_t &subgraph, SSAGraph *g) {
-    // Get all the registered nodes.
-    key2nodes_.emplace_back();
-    for (auto &item : nodes_) {
-      key2nodes_.back()[item.first] = subgraph.at(item.second);
-    }
-  };
-
-  matcher_(graph, handler);
-}
-
-void FuseBase::DeleteInterNodes(SSAGraph *graph) {
-  std::set<std::string> keys;
-  for (auto &node : nodes_) {
-    if (node.second->IsIntermediate()) {
-      keys.insert(node.first);
-    }
-  }
-
-  VLOG(4) << "keys: " << key2nodes_.size();
-  std::unordered_set<const Node *> nodes2rm;
-  for (auto &matched : key2nodes_) {
-    for (const auto &key : keys) {
-      nodes2rm.insert(matched.at(key));
-    }
-  }
-
-  VLOG(3) << "clean nodes " << nodes2rm.size();
-  GraphSafeRemoveNodes(graph, nodes2rm);
-}
-
-PMNode *FuseBase::GetOrCreateNode(const std::string &key) {
-  auto it = nodes_.find(key);
-  if (it != nodes_.end()) {
-    return it->second;
-  }
-  nodes_.emplace(key,
-                 matcher_.mutable_pattern()->NewNode(patterns::UniqueKey(key)));
-  it = nodes_.find(key);
-  return it->second;
-}
-
-PMNode *FuseBase::OpNode(const std::string &key, const std::string &op_type) {
-  GetOrCreateNode(key)->set_op_type(op_type);
-  GetOrCreateNode(key)->AsOp(op_type);
-  return GetOrCreateNode(key);
-}
-
-PMNode *FuseBase::VarNode(const std::string &key) {
-  GetOrCreateNode(key)->AsVar();
-  return GetOrCreateNode(key);
-}
-
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
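[Editor's note] The implementation above fixes the FuseBase workflow: BuildPattern declares the nodes, PerformPatternMatcher records one key2nodes_t per match, InsertNewNode rewrites the graph, and DeleteInterNodes drops the intermediates. A minimal hedged skeleton of a subclass; the class and op names are placeholders, and LinksFrom/LinksTo are the PMNode link helpers used in the tests further below:

    class ReluFuserSketch : public FuseBase {
     public:
      void BuildPattern() override {
        auto* x = VarNode("x")->assert_is_op_input("relu", "X");
        auto* relu = OpNode("relu", "relu")->AsIntermediate();
        auto* out = VarNode("out")->assert_is_op_output("relu", "Out");
        relu->LinksFrom({x}).LinksTo({out});
      }

     protected:
      void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
        // Create the replacement op from `matched` and relink it, as the
        // FcFuser demo below does with IR_NODE_LINK_TO.
      }
    };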
diff --git a/lite/core/mir/pattern_matcher_high_api.h b/lite/core/mir/pattern_matcher_high_api.h
deleted file mode 100644
index e62a4fc749..0000000000
--- a/lite/core/mir/pattern_matcher_high_api.h
+++ /dev/null
@@ -1,83 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <map>
-#include <memory>
-#include <set>
-#include <string>
-#include <utility>
-#include <vector>
-#include "lite/core/mir/node.h"
-#include "lite/core/mir/pattern_matcher.h"
-#include "lite/core/mir/ssa_graph.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-
-class FuseBase {
- public:
-  using key2nodes_t = std::map<std::string, Node*>;
-
-  virtual ~FuseBase() = default;
-
-  void operator()(SSAGraph* graph) {
-    BuildPattern();
-    PerformPatternMatcher(graph);
-
-    for (const auto& matched : key2nodes_) {
-      InsertNewNode(graph, matched);
-    }
-
-    DeleteInterNodes(graph);
-  }
-
-  // Build a PMPattern using PMNodes.
-  virtual void BuildPattern() = 0;
-
-  // Generate an operator desc from a matched subgraph.
-  virtual cpp::OpDesc GenOpDesc(const key2nodes_t& matched) {
-    return cpp::OpDesc();
-  }
-
-  PMNode* OpNode(const std::string& key) {
-    return GetOrCreateNode(key)->assert_is_op();
-  }
-
-  PMNode* OpNode(const std::string& key, const std::string& op_type);
-
-  PMNode* VarNode(const std::string& key);
-
- protected:
-  virtual void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) = 0;
-
- private:
-  void PerformPatternMatcher(SSAGraph* graph);
-
-  // Delete the nodes that are marked as Intermediate.
-  void DeleteInterNodes(SSAGraph* graph);
-
-  PMNode* GetOrCreateNode(const std::string& key);
-
- protected:
-  PatternMatcher matcher_;
-  std::map<std::string, PMNode*> nodes_;
-  std::vector<key2nodes_t> key2nodes_;
-};
-
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/mir/pattern_matcher_high_api_test.cc b/lite/core/mir/pattern_matcher_high_api_test.cc
deleted file mode 100644
index 61914c5a0b..0000000000
--- a/lite/core/mir/pattern_matcher_high_api_test.cc
+++ /dev/null
@@ -1,150 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/core/mir/pattern_matcher_high_api.h"
-#include <gtest/gtest.h>
-#include <memory>
-#include "lite/core/mir/graph_visualize_pass.h"
-#include "lite/core/program.h"
-#include "lite/core/tensor.h"
-#include "paddle/fluid/framework/program_desc.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-
-// A demo.
-class FcFuser : public FuseBase {
- public:
-  void BuildPattern() override {
-    // create nodes.
-    auto* x = VarNode("x")->assert_is_op_input("mul", "X");
-    auto* W = VarNode("W")->assert_is_op_input("mul", "Y");
-    auto* b = VarNode("b");
-    auto* mul = OpNode("mul", "mul");
-    auto* mul_out = VarNode("mul_out");
-    auto* add = OpNode("add", "elementwise_add");
-    auto* Out = VarNode("Out");
-
-    // create topology.
- std::vector mul_inputs{W, x}; - std::vector add_inputs{mul_out, b}; - mul_inputs >> *mul >> *mul_out; - add_inputs >> *add >> *Out; - - // Some op specialities. - mul_out->AsIntermediate(); - mul->AsIntermediate(); - add->AsIntermediate(); - } - - void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { - auto op_desc = GenOpDesc(matched); - auto fc_op = LiteOpRegistry::Global().Create("fc"); - auto mul = matched.at("mul")->stmt()->op(); - auto* scope = mul->scope(); - auto& valid_places = mul->valid_places(); - fc_op->Attach(op_desc, scope); - - auto* new_op_node = graph->GraphCreateInstructNode(fc_op, valid_places); - - IR_NODE_LINK_TO(matched.at("W"), new_op_node); - IR_NODE_LINK_TO(matched.at("x"), new_op_node); - IR_NODE_LINK_TO(matched.at("b"), new_op_node); - IR_NODE_LINK_TO(new_op_node, matched.at("Out")); - } - - private: - cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override { - cpp::OpDesc op_desc; - op_desc.SetType("fc"); - op_desc.SetInput("Input", {matched.at("x")->arg()->name}); - op_desc.SetInput("W", {matched.at("W")->arg()->name}); - op_desc.SetInput("Bias", {matched.at("b")->arg()->name}); - op_desc.SetOutput("Out", {matched.at("Out")->arg()->name}); - op_desc.SetAttr("in_num_col_dims", 1); - return op_desc; - } -}; - -std::unique_ptr BuildGraph(framework::ProgramDesc* program_desc, - const std::shared_ptr& scope, - const std::vector& valid_places) { - auto* main_block = program_desc->MutableBlock(0); - auto* mul = main_block->AppendOp(); - auto* add = main_block->AppendOp(); - main_block->Var("x"); - main_block->Var("b"); - main_block->Var("mul_out"); - main_block->Var("w"); - main_block->Var("out"); - - scope->Var("x")->GetMutable(); - scope->Var("b")->GetMutable(); - scope->Var("mul_out")->GetMutable(); - scope->Var("w")->GetMutable(); - scope->Var("out")->GetMutable(); - - mul->SetInput("X", {"x"}); - mul->SetInput("Y", {"w"}); - mul->SetOutput("Out", {"mul_out"}); - mul->SetType("mul"); - mul->SetAttr("x_num_col_dims", 1); - mul->SetAttr("y_num_col_dims", 1); - - add->SetInput("X", {"mul_out"}); - add->SetInput("Y", {"b"}); - add->SetOutput("Out", {"out"}); - add->SetType("elementwise_add"); - add->SetAttr("axis", 1); - - program_desc->Flush(); - - lite::Program program(*program_desc->Proto(), scope, valid_places); - auto graph = std::unique_ptr(new SSAGraph()); - graph->Build(program, valid_places); - - return graph; -} - -TEST(pattern_matcher_high_api, graph_test) { - framework::ProgramDesc program_desc; - std::vector places{{TARGET(kHost), PRECISION(kFloat)}}; - auto scope = std::make_shared(); - auto graph = BuildGraph(&program_desc, scope, places); - - ASSERT_EQ(graph->nodes().size(), 7UL /*real nodes*/); - Visualize(graph.get()); -} - -TEST(pattern_matcher_high_api, fuse_test) { - framework::ProgramDesc program_desc; - std::vector places{{TARGET(kHost), PRECISION(kFloat)}}; - auto scope = std::make_shared(); - auto graph = BuildGraph(&program_desc, scope, places); - const int num_nodes = graph->nodes().size(); - FcFuser fuser; - fuser(graph.get()); - ASSERT_EQ(graph->nodes().size(), - num_nodes - 3UL /*nodes removed */ + 1UL /* fused fc node*/); - Visualize(graph.get()); -} - -} // namespace mir -} // namespace lite -} // namespace paddle - -USE_LITE_OP(fc); -USE_LITE_OP(mul); -USE_LITE_OP(elementwise_add); diff --git a/lite/core/mir/pattern_matcher_test.cc b/lite/core/mir/pattern_matcher_test.cc deleted file mode 100644 index 728681a459..0000000000 --- a/lite/core/mir/pattern_matcher_test.cc +++ /dev/null @@ -1,233 +0,0 @@ -// 
Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/mir/pattern_matcher.h" - -#include - -namespace paddle { -namespace lite { -namespace mir { - -void BuildGraph(SSAGraph* g) { - g->mutable_nodes().emplace_back(); - Node& o1 = g->mutable_nodes().back(); - o1.AsStmt().desc = "op1"; - g->mutable_nodes().emplace_back(); - Node& o2 = g->mutable_nodes().back(); - o2.AsStmt().desc = "op2"; - g->mutable_nodes().emplace_back(); - Node& o3 = g->mutable_nodes().back(); - o3.AsStmt().desc = "op3"; - g->mutable_nodes().emplace_back(); - Node& o4 = g->mutable_nodes().back(); - o4.AsStmt().desc = "op4"; - g->mutable_nodes().emplace_back(); - Node& o5 = g->mutable_nodes().back(); - o5.AsStmt().desc = "op5"; - g->mutable_nodes().emplace_back(); - Node& v1 = g->mutable_nodes().back(); - v1.AsArg("var1"); - g->mutable_nodes().emplace_back(); - Node& v2 = g->mutable_nodes().back(); - v2.AsArg("var2"); - g->mutable_nodes().emplace_back(); - Node& v3 = g->mutable_nodes().back(); - v3.AsArg("var3"); - g->mutable_nodes().emplace_back(); - Node& v4 = g->mutable_nodes().back(); - v4.AsArg("var4"); - - // o1->v1->o2 - o1.outlinks.push_back(&v1); - o2.inlinks.push_back(&v1); - v1.inlinks.push_back(&o1); - v1.outlinks.push_back(&o2); - // o2->v2->o3 - // o2->v2->o4 - o2.outlinks.push_back(&v2); - o3.inlinks.push_back(&v2); - o4.inlinks.push_back(&v2); - v2.inlinks.push_back(&o2); - v2.outlinks.push_back(&o3); - v2.outlinks.push_back(&o4); - // o2->v3->o5 - o2.outlinks.push_back(&v3); - o5.inlinks.push_back(&v3); - v3.inlinks.push_back(&o2); - v3.outlinks.push_back(&o5); - // o3-v4->o5 - o3.outlinks.push_back(&v4); - o5.inlinks.push_back(&v4); - v4.inlinks.push_back(&o3); - v4.outlinks.push_back(&o5); -} - -TEST(PMPattern, NewNode) { - PMPattern x; - auto* n = x.NewNode([](const Node* x) { return true; }); - ASSERT_TRUE(n); - ASSERT_EQ(x.nodes_.size(), 1UL); -} - -TEST(PMPattern, AddEdge) { - PMPattern x; - auto* a = x.NewNode([](const Node* x) { return true; }); - auto* b = x.NewNode([](const Node* x) { return true; }); - ASSERT_TRUE(a); - ASSERT_TRUE(b); - x.AddEdge(a, b); - ASSERT_EQ(x.nodes_.size(), 2UL); - ASSERT_EQ(x.edges_.size(), 1UL); - ASSERT_EQ(x.edges_.front().first, a); - ASSERT_EQ(x.edges_.front().second, b); - - ASSERT_EQ(x.nodes().size(), 2UL); - ASSERT_EQ(x.edges().size(), 1UL); - ASSERT_EQ(x.edges().front().first, a); - ASSERT_EQ(x.edges().front().second, b); -} - -TEST(PatternMatcher, MarkPMNodesInGraph) { - PatternMatcher x; - // mark o2, o3, v2 - - // The pattern is a graph: - // o2(a node named o2) -> v2(a node named v2) - // v2 -> o3(a node named o3) - auto* o2 = x.pattern_.NewNode([](const Node* node) { - // The teller can be any condition, such as op type, or variable's shape. - return node && node->IsStmt() && node->stmt()->desc == "op2"; - }); - auto* o3 = x.pattern_.NewNode([](const Node* node) { - // The teller can be any condition, such as op type, or variable's shape. 
- return node && node->IsStmt() && node->stmt()->desc == "op3"; - }); - auto* v2 = x.pattern_.NewNode([](const Node* node) { - // The teller can be any condition, such as op type, or variable's shape. - return node && node->IsArg() && node->arg()->name == "var2"; - }); - - ASSERT_FALSE(o2->Tell(nullptr)); - ASSERT_FALSE(o3->Tell(nullptr)); - ASSERT_FALSE(v2->Tell(nullptr)); - - x.pattern_.AddEdge(o2, v2); - x.pattern_.AddEdge(v2, o3); - - ASSERT_EQ(x.pattern_.edges().size(), 2UL); - ASSERT_EQ(x.pattern_.edges()[0].first, o2); - ASSERT_EQ(x.pattern_.edges()[0].second, v2); - ASSERT_EQ(x.pattern_.edges()[1].first, v2); - ASSERT_EQ(x.pattern_.edges()[1].second, o3); - - SSAGraph graph; - BuildGraph(&graph); - - x.MarkPMNodesInGraph(&graph); - - ASSERT_EQ(x.pmnodes2nodes_.size(), 3UL); - - auto subgraphs = x.DetectPatterns(); - ASSERT_EQ(subgraphs.size(), 1UL); -} - -TEST(PatternMatcher, MultiSubgraph) { - SSAGraph graph; - BuildGraph(&graph); - - PatternMatcher x; - - // The pattern is a graph: - // op -> var - auto* any_op = x.mutable_pattern()->NewNode( - [](const Node* node) { - return node->IsStmt() && - (node->stmt()->desc == "op2" || node->stmt()->desc == "op3"); - }, - "OP0"); - auto* any_var = - x.mutable_pattern() - ->NewNode([](const Node* node) { return node->IsArg(); }, "VAR") - ->AsIntermediate(); - auto* any_op1 = x.mutable_pattern()->NewNode( - [](const Node* node) { return node->IsStmt(); }, "OP1"); - - x.mutable_pattern()->AddEdge(any_op, any_var); - x.mutable_pattern()->AddEdge(any_var, any_op1); - - int count = 0; - PatternMatcher::handle_t handle = [&](const PatternMatcher::subgraph_t& s, - SSAGraph* g) { - LOG(INFO) << "Detect " << s.at(any_op)->stmt()->desc << " -> " - << s.at(any_var)->arg()->name << " -> " - << s.at(any_op1)->stmt()->desc; - count++; - }; - - x(&graph, handle); - - // 1. Detect op3 -> var4 -> op5 - // 2. Detect op2 -> var2 -> op3 - // 3. Detect op2 -> var2 -> op4 - // 4. Detect op2 -> var3 -> op5 - // But 2 and 3 and 4 overlapped, so keep 2, so the final choices are 1 and 2 - ASSERT_GE(count, 1); - ASSERT_LE(count, 2); -} - -TEST(PatternMatcher, IntermediateCheck) { - SSAGraph graph; - BuildGraph(&graph); - - // o2->v2->o3 - // o2->v2->o4 - // check o2+o3 fuse, should fail because v2 also link to o4. - PatternMatcher matcher; - auto* op2 = matcher.mutable_pattern()->NewNode( - [](const Node* x) { - return x && x->IsStmt() && x->stmt()->desc == "op2"; - }, - "op2"); - auto* op3 = matcher.mutable_pattern()->NewNode( - [](const Node* x) { - return x && x->IsStmt() && x->stmt()->desc == "op3"; - }, - "op3"); - auto* v2 = matcher.mutable_pattern() - ->NewNode( - [](const Node* x) { - return x && x->IsArg() && x->arg()->name == "var2"; - }, - "var2") - ->AsIntermediate(); - v2->LinksFrom({op2}).LinksTo({op3}); - - int count = 0; - matcher(&graph, [&](const PatternMatcher::subgraph_t& g, SSAGraph* graph) { - ++count; - }); - EXPECT_EQ(count, 0); - - count = 0; - v2->AsInput(); - matcher(&graph, [&](const PatternMatcher::subgraph_t& g, SSAGraph* graph) { - ++count; - }); - ASSERT_EQ(count, 1); -} - -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/pattern_matcher_tester.cc b/lite/core/mir/pattern_matcher_tester.cc deleted file mode 100644 index a62c3af62f..0000000000 --- a/lite/core/mir/pattern_matcher_tester.cc +++ /dev/null @@ -1,233 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/mir/pattern_matcher.h" - -#include - -namespace paddle { -namespace lite { -namespace mir { - -void BuildGraph(SSAGraph* g) { - g->mutable_nodes().emplace_back(); - Node& o1 = g->mutable_nodes().back(); - o1.AsStmt().op_type = "op1"; - g->mutable_nodes().emplace_back(); - Node& o2 = g->mutable_nodes().back(); - o2.AsStmt().op_type = "op2"; - g->mutable_nodes().emplace_back(); - Node& o3 = g->mutable_nodes().back(); - o3.AsStmt().op_type = "op3"; - g->mutable_nodes().emplace_back(); - Node& o4 = g->mutable_nodes().back(); - o4.AsStmt().op_type = "op4"; - g->mutable_nodes().emplace_back(); - Node& o5 = g->mutable_nodes().back(); - o5.AsStmt().op_type = "op5"; - g->mutable_nodes().emplace_back(); - Node& v1 = g->mutable_nodes().back(); - v1.AsArg("var1"); - g->mutable_nodes().emplace_back(); - Node& v2 = g->mutable_nodes().back(); - v2.AsArg("var2"); - g->mutable_nodes().emplace_back(); - Node& v3 = g->mutable_nodes().back(); - v3.AsArg("var3"); - g->mutable_nodes().emplace_back(); - Node& v4 = g->mutable_nodes().back(); - v4.AsArg("var4"); - - // o1->v1->o2 - o1.outlinks.push_back(&v1); - o2.inlinks.push_back(&v1); - v1.inlinks.push_back(&o1); - v1.outlinks.push_back(&o2); - // o2->v2->o3 - // o2->v2->o4 - o2.outlinks.push_back(&v2); - o3.inlinks.push_back(&v2); - o4.inlinks.push_back(&v2); - v2.inlinks.push_back(&o2); - v2.outlinks.push_back(&o3); - v2.outlinks.push_back(&o4); - // o2->v3->o5 - o2.outlinks.push_back(&v3); - o5.inlinks.push_back(&v3); - v3.inlinks.push_back(&o2); - v3.outlinks.push_back(&o5); - // o3-v4->o5 - o3.outlinks.push_back(&v4); - o5.inlinks.push_back(&v4); - v4.inlinks.push_back(&o3); - v4.outlinks.push_back(&o5); -} - -TEST(PMPattern, NewNode) { - PMPattern x; - auto* n = x.NewNode([](const Node* x) { return true; }); - ASSERT_TRUE(n); - ASSERT_EQ(x.nodes_.size(), 1UL); -} - -TEST(PMPattern, AddEdge) { - PMPattern x; - auto* a = x.NewNode([](const Node* x) { return true; }); - auto* b = x.NewNode([](const Node* x) { return true; }); - ASSERT_TRUE(a); - ASSERT_TRUE(b); - x.AddEdge(a, b); - ASSERT_EQ(x.nodes_.size(), 2UL); - ASSERT_EQ(x.edges_.size(), 1UL); - ASSERT_EQ(x.edges_.front().first, a); - ASSERT_EQ(x.edges_.front().second, b); - - ASSERT_EQ(x.nodes().size(), 2UL); - ASSERT_EQ(x.edges().size(), 1UL); - ASSERT_EQ(x.edges().front().first, a); - ASSERT_EQ(x.edges().front().second, b); -} - -TEST(PatternMatcher, MarkPMNodesInGraph) { - PatternMatcher x; - // mark o2, o3, v2 - - // The pattern is a graph: - // o2(a node named o2) -> v2(a node named v2) - // v2 -> o3(a node named o3) - auto* o2 = x.pattern_.NewNode([](const Node* node) { - // The teller can be any condition, such as op type, or variable's shape. - return node && node->IsStmt() && node->stmt()->op_type == "op2"; - }); - auto* o3 = x.pattern_.NewNode([](const Node* node) { - // The teller can be any condition, such as op type, or variable's shape. 
- return node && node->IsStmt() && node->stmt()->op_type == "op3"; - }); - auto* v2 = x.pattern_.NewNode([](const Node* node) { - // The teller can be any condition, such as op type, or variable's shape. - return node && node->IsArg() && node->arg()->name == "var2"; - }); - - ASSERT_FALSE(o2->Tell(nullptr)); - ASSERT_FALSE(o3->Tell(nullptr)); - ASSERT_FALSE(v2->Tell(nullptr)); - - x.pattern_.AddEdge(o2, v2); - x.pattern_.AddEdge(v2, o3); - - ASSERT_EQ(x.pattern_.edges().size(), 2UL); - ASSERT_EQ(x.pattern_.edges()[0].first, o2); - ASSERT_EQ(x.pattern_.edges()[0].second, v2); - ASSERT_EQ(x.pattern_.edges()[1].first, v2); - ASSERT_EQ(x.pattern_.edges()[1].second, o3); - - SSAGraph graph; - BuildGraph(&graph); - - x.MarkPMNodesInGraph(&graph); - - ASSERT_EQ(x.pmnodes2nodes_.size(), 3UL); - - auto subgraphs = x.DetectPatterns(); - ASSERT_EQ(subgraphs.size(), 1UL); -} - -TEST(PatternMatcher, MultiSubgraph) { - SSAGraph graph; - BuildGraph(&graph); - - PatternMatcher x; - - // The pattern is a graph: - // op -> var - auto* any_op = x.mutable_pattern()->NewNode( - [](const Node* node) { - return node->IsStmt() && (node->stmt()->op_type == "op2" || - node->stmt()->op_type == "op3"); - }, - "OP0"); - auto* any_var = - x.mutable_pattern() - ->NewNode([](const Node* node) { return node->IsArg(); }, "VAR") - ->AsIntermediate(); - auto* any_op1 = x.mutable_pattern()->NewNode( - [](const Node* node) { return node->IsStmt(); }, "OP1"); - - x.mutable_pattern()->AddEdge(any_op, any_var); - x.mutable_pattern()->AddEdge(any_var, any_op1); - - int count = 0; - PatternMatcher::handle_t handle = [&](const PatternMatcher::subgraph_t& s, - SSAGraph* g) { - LOG(INFO) << "Detect " << s.at(any_op)->stmt()->op_type << " -> " - << s.at(any_var)->arg()->name << " -> " - << s.at(any_op1)->stmt()->op_type; - count++; - }; - - x(&graph, handle); - - // 1. Detect op3 -> var4 -> op5 - // 2. Detect op2 -> var2 -> op3 - // 3. Detect op2 -> var2 -> op4 - // 4. Detect op2 -> var3 -> op5 - // But 2 and 3 and 4 overlapped, so keep 2, so the final choices are 1 and 2 - ASSERT_GE(count, 1); - ASSERT_LE(count, 2); -} - -TEST(PatternMatcher, IntermediateCheck) { - SSAGraph graph; - BuildGraph(&graph); - - // o2->v2->o3 - // o2->v2->o4 - // check o2+o3 fuse, should fail because v2 also link to o4. - PatternMatcher matcher; - auto* op2 = matcher.mutable_pattern()->NewNode( - [](const Node* x) { - return x && x->IsStmt() && x->stmt()->op_type == "op2"; - }, - "op2"); - auto* op3 = matcher.mutable_pattern()->NewNode( - [](const Node* x) { - return x && x->IsStmt() && x->stmt()->op_type == "op3"; - }, - "op3"); - auto* v2 = matcher.mutable_pattern() - ->NewNode( - [](const Node* x) { - return x && x->IsArg() && x->arg()->name == "var2"; - }, - "var2") - ->AsIntermediate(); - v2->LinksFrom({op2}).LinksTo({op3}); - - int count = 0; - matcher(&graph, [&](const PatternMatcher::subgraph_t& g, SSAGraph* graph) { - ++count; - }); - EXPECT_EQ(count, 0); - - count = 0; - v2->AsInput(); - matcher(&graph, [&](const PatternMatcher::subgraph_t& g, SSAGraph* graph) { - ++count; - }); - ASSERT_EQ(count, 1); -} - -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/runtime_context_assign_pass.cc b/lite/core/mir/runtime_context_assign_pass.cc deleted file mode 100644 index 652932c149..0000000000 --- a/lite/core/mir/runtime_context_assign_pass.cc +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/mir/pass.h" -#include "lite/core/mir/pass_registry.h" - -namespace paddle { -namespace lite { -namespace mir { - -class RuntimeContextAssignPass : public StmtPass { - public: - RuntimeContextAssignPass() {} - - void Apply(const std::unique_ptr& graph) override { - for (auto& node : graph->mutable_nodes()) { - if (!node.IsStmt()) continue; - auto& inst = node.AsStmt(); - inst.picked_kernel().SetContext( - ContextScheduler::Global().NewContext(inst.picked_kernel().target())); - } - } -}; - -} // namespace mir -} // namespace lite -} // namespace paddle - -REGISTER_MIR_PASS(runtime_context_assign_pass, - paddle::lite::mir::RuntimeContextAssignPass) - .SetTargets({TARGET(kAny)}); diff --git a/lite/core/mir/ssa_graph.cc b/lite/core/mir/ssa_graph.cc deleted file mode 100644 index 5193d9c899..0000000000 --- a/lite/core/mir/ssa_graph.cc +++ /dev/null @@ -1,240 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
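[Editor's note] RuntimeContextAssignPass above is the smallest useful example of the pass interface: iterate the mutable nodes, act on the statement nodes, then register the class. A hedged sketch of another pass in the same shape; the name and behavior are hypothetical:

    class StmtCountPassSketch : public StmtPass {
     public:
      void Apply(const std::unique_ptr<SSAGraph>& graph) override {
        int stmt_count = 0;
        for (auto& node : graph->mutable_nodes()) {
          if (node.IsStmt()) ++stmt_count;
        }
        VLOG(4) << "graph contains " << stmt_count << " statement nodes";
      }
    };

    REGISTER_MIR_PASS(stmt_count_pass_sketch,
                      paddle::lite::mir::StmtCountPassSketch)
        .SetTargets({TARGET(kAny)});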
- -#include "lite/core/mir/ssa_graph.h" -#include -#include -#include -#include -#include - -namespace paddle { -namespace lite { -namespace mir { - -bool SSAGraph::CheckBidirectionalConnection() { - VLOG(4) << "node count " << node_storage_.size(); - for (auto &node : node_storage_) { - if (node.IsStmt()) VLOG(4) << node.AsStmt().op_info()->Type(); - if (node.IsArg()) VLOG(4) << node.AsArg().name << " " << node.AsArg().id; - for (auto *in : node.inlinks) { - CHECK(in->outlinks.end() != - std::find(in->outlinks.begin(), in->outlinks.end(), &node)); - } - for (auto *out : node.outlinks) { - CHECK(out->inlinks.end() != - std::find(out->inlinks.begin(), out->inlinks.end(), &node)); - } - } - return true; -} - -std::map> SSAGraph::BuildOperationAdjList() { - std::map> adj_list; - - for (auto &n : mutable_nodes()) { - if (!n.IsStmt()) continue; - if (adj_list.find(&n) == adj_list.end()) { - adj_list[&n] = std::set(); - } - std::vector nodes; - for (auto &var : n.inlinks) { - for (auto &adj_n : var->inlinks) { - CHECK(adj_n->IsStmt()); - nodes.push_back(adj_n); - } - } - std::sort(nodes.begin(), - nodes.end(), - [](mir::Node *node1, mir::Node *node2) { return node1 > node2; }); - adj_list[&n].insert(std::make_move_iterator(nodes.begin()), - std::make_move_iterator(nodes.end())); - } - return adj_list; -} - -void SSAGraph::SortHelper( - const std::map> &adj_list, - mir::Node *node, - std::set *visited, - std::vector *ret) { - visited->insert(node); - - for (auto adj : adj_list.at(node)) { - if (visited->find(adj) == visited->end()) { - SortHelper(adj_list, adj, visited, ret); - } - } - - ret->push_back(node); -} - -std::vector SSAGraph::StmtTopologicalOrder() { - CheckBidirectionalConnection(); - - std::stack stack; - std::set visited; - std::vector res; - - auto adj_list = BuildOperationAdjList(); - - for (auto adj : adj_list) { - if (visited.find(adj.first) == visited.end()) { - SortHelper(adj_list, adj.first, &visited, &res); - } - } - - return res; -} - -Node *SSAGraph::GraphCreateInstructNode( - const std::shared_ptr &op, const std::vector &valid_places) { - node_storage_.emplace_back(); - // TODO(Superjomn) remove one valid_places here. 
- op->SetValidPlaces(valid_places); - auto &new_node = node_storage_.back(); - auto kernels = op->CreateKernels(valid_places); - node_storage_.back().AsStmt(op->op_type_, std::move(kernels), op); - - CHECK(new_node.inlinks.empty()) << "duplicate Build found"; - CHECK(new_node.outlinks.empty()) << "duplicate Build found"; - return &node_storage_.back(); -} - -void SSAGraph::Build(const Program &program, - const std::vector &valid_places) { - CHECK(node_storage_.empty()); - - auto weights_name = program.weights(); - auto is_weights = [&](const std::string &name) -> bool { - auto it = std::find(weights_name.begin(), weights_name.end(), name); - if (it == weights_name.end()) return false; - return true; - }; - - std::unordered_map arg_update_node_map_; - for (auto &op : program.ops()) { - VLOG(3) << op->op_info()->Type(); - auto *op_node = GraphCreateInstructNode(op, valid_places); - for (const std::string &name : op->op_info()->input_names()) { - mir::Node *arg_node = nullptr; - if (arg_update_node_map_.count(name)) { - arg_node = arg_update_node_map_.at(name); - } else { - node_storage_.emplace_back(); - arg_node = &node_storage_.back(); - arg_node->AsArg(name, node_storage_.size() - 1); - arg_update_node_map_[name] = arg_node; - } - if (is_weights(name)) arg_node->AsArg().is_weight = true; - CHECK(arg_node->IsRoleSet()); - DirectedLink(arg_node, op_node); - } - for (const std::string &name : op->op_info()->output_names()) { - node_storage_.emplace_back(); - auto *arg_node = &node_storage_.back(); - arg_node->AsArg(name, node_storage_.size() - 1); - arg_update_node_map_[name] = arg_node; - - if (is_weights(name)) arg_node->AsArg().is_weight = true; - CHECK(arg_node->IsRoleSet()); - DirectedLink(op_node, arg_node); - } - CHECK(CheckLinksRoleSet()); - } - - CHECK(CheckNodesRoleSet()); - CheckValid(); -} - -void SSAGraph::RemoveNode(const mir::Node *node) { - auto pos = std::find_if(node_storage_.begin(), - node_storage_.end(), - [&node](mir::Node &n) { return &n == node; }); - CHECK(pos != node_storage_.end()); - node_storage_.erase(pos); -} - -mir::Node *SSAGraph::Argument(const std::string &name) { - auto it = arguments_.find(name); - CHECK(it != arguments_.end()) << "no argument called " << name; - return it->second; -} - -std::vector SSAGraph::inputs() { - std::vector res; - for (auto &node : node_storage_) { - if (node.inlinks.empty()) { - res.push_back(&node); - } - } - return res; -} - -std::vector SSAGraph::outputs() { - std::vector res; - for (auto &node : node_storage_) { - if (node.outlinks.empty()) { - res.push_back(&node); - } - } - return res; -} - -mir::Node *SSAGraph::RetrieveArgument(const std::string &arg) { - auto it = arguments_.find(arg); - if (it != arguments_.end()) { - return it->second; - } - return nullptr; -} - -bool SSAGraph::CheckNodesRoleSet() { - for (auto &node : mutable_nodes()) { - CHECK_OR_FALSE(node.IsRoleSet()); - } - return true; -} - -bool SSAGraph::CheckLinksRoleSet() { - for (auto &node : mutable_nodes()) { - CHECK_OR_FALSE(node.IsRoleSet()); - if (!node.IsStmt()) continue; - for (auto *x : node.inlinks) { - CHECK_OR_FALSE(x->IsRoleSet()); - CHECK_OR_FALSE(x->IsArg()); - } - for (auto *x : node.outlinks) { - CHECK_OR_FALSE(x->IsRoleSet()); - CHECK_OR_FALSE(x->IsArg()); - } - } - return true; -} - -Node *SSAGraph::NewArgumentNode(const std::string &name) { - node_storage_.emplace_back(); - auto &arg_node = node_storage_.back(); - arg_node.AsArg(name, node_storage_.size() - 1); - return &arg_node; -} - -Node *SSAGraph::NewInstructNode() { - 
node_storage_.emplace_back();
-  return &node_storage_.back();
-}
-
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/mir/ssa_graph.h b/lite/core/mir/ssa_graph.h
deleted file mode 100644
index b5b9fb1cb2..0000000000
--- a/lite/core/mir/ssa_graph.h
+++ /dev/null
@@ -1,144 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <algorithm>
-#include <list>
-#include <map>
-#include <memory>
-#include <set>
-#include <string>
-#include <utility>
-#include <vector>
-#include "lite/core/kernel.h"
-#include "lite/core/mir/node.h"
-#include "lite/core/op_lite.h"
-#include "lite/core/program.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-
-// A graph for the MIR. It is built from a list of Ops and a scope.
-class GraphBase {};
-
-class SSAGraph : GraphBase {
- public:
-  // @param program: the op program
-  // @param valid_places: the valid places user set for the system.
-  void Build(const Program &program, const std::vector<Place> &valid_places);
-  void RemoveNode(const mir::Node *node);
-
-  std::vector<mir::Node *> StmtTopologicalOrder();
-
-  // The inputs of the graph.
-  std::vector<mir::Node *> inputs();
-
-  // The outputs of the graph.
-  std::vector<mir::Node *> outputs();
-
-  const std::list<mir::Node> &nodes() const { return node_storage_; }
-  std::list<mir::Node> &mutable_nodes() { return node_storage_; }
-
-  mir::Node *RetrieveArgument(const std::string &arg);
-
-  Node *NewArgumentNode(const std::string &name);
-  Node *NewInstructNode();
-
-  void CheckValid() {
-    CHECK(CheckBidirectionalConnection());
-    CHECK(CheckNodesRoleSet());
-    CHECK(CheckLinksRoleSet());
-  }
-
-  Node *GraphCreateInstructNode(const std::shared_ptr<OpLite> &op,
-                                const std::vector<Place> &valid_places);
-
-  // Device-related attributes.
-  const std::vector<Place> &valid_places() const { return valid_places_; }
-  void SetValidPlaces(const std::vector<Place> &x) { valid_places_ = x; }
-
- private:
-  mir::Node *Argument(const std::string &name);
-  // Check the bidirectional connection.
-  bool CheckBidirectionalConnection();
-  bool CheckNodesRoleSet();
-  // Check that the role of every item in inlinks and outlinks is set.
-  bool CheckLinksRoleSet();
-
-  void MarkArgumentWeights(const Program &program) {
-    for (const auto &name : program.weights()) {
-      arguments_[name]->AsArg().is_weight = true;
-    }
-  }
-
-  // Build the operator inlink edge table.
-  std::map<mir::Node *, std::set<mir::Node *>> BuildOperationAdjList();
-
-  void SortHelper(const std::map<mir::Node *, std::set<mir::Node *>> &adj_list,
-                  mir::Node *node,
-                  std::set<mir::Node *> *visited,
-                  std::vector<mir::Node *> *ret);
-
- private:
-  std::list<mir::Node> node_storage_;
-  std::map<std::string, mir::Node *> arguments_;
-  std::vector<Place> valid_places_;
-};
-
-// Remove the link a -> b.
-static void RemoveDirectedLink(Node *a, Node *b) {
-  auto it = std::find(b->inlinks.begin(), b->inlinks.end(), a);
-  if (it != b->inlinks.end()) {
-    b->inlinks.erase(it);
-  }
-
-  auto it1 = std::find(a->outlinks.begin(), a->outlinks.end(), b);
-  if (it1 != a->outlinks.end()) {
-    a->outlinks.erase(it1);
-  }
-}
-
-// Link a -> b.
-static void DirectedLink(Node *a, Node *b) {
-  // Eagerly remove first, to avoid a duplicate link.
- RemoveDirectedLink(a, b); - a->outlinks.push_back(b); - b->inlinks.push_back(a); -} - -static void LocalInferenceType(Node *a, Node *b, const std::string &arg_name) { - // instr -> output argument - if (a->IsStmt() && b->IsArg()) { - auto &inst = a->AsStmt(); - auto &output = b->AsArg(); - - if (!output.type) { - output.type = inst.picked_kernel().GetOutputDeclType(arg_name); - } - } - - // input argument -> instr - if (a->IsArg() && b->IsStmt()) { - auto &input = a->AsArg(); - auto &inst = b->AsStmt(); - if (!input.type) { - input.type = inst.picked_kernel().GetInputDeclType(arg_name); - } - } -} - -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/ssa_graph_test.cc b/lite/core/mir/ssa_graph_test.cc deleted file mode 100644 index ef49001ba2..0000000000 --- a/lite/core/mir/ssa_graph_test.cc +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/mir/ssa_graph.h" -#include -#include -#include "lite/api/paddle_use_passes.h" -#include "lite/core/mir/graph_visualize_pass.h" -#include "lite/core/op_registry.h" -#include "lite/core/program_fake_utils.h" -#include "paddle/fluid/framework/program_desc.h" - -namespace paddle { -namespace lite { -namespace mir { - -void BuildFc(framework::ProgramDesc* desc, - const std::string& x, - const std::string& w, - const std::string& b, - const std::string& out) { - auto* fc = desc->MutableBlock(0)->AppendOp(); - fc->SetInput("Input", {x}); - fc->SetInput("W", {w}); - fc->SetInput("Bias", {b}); - fc->SetOutput("Out", {out}); -} - -TEST(SSAGraph, test) { - auto program_faker = ProgramFaker(); - SSAGraph graph; - std::vector places{{TARGET(kHost), PRECISION(kFloat)}}; - auto scope = std::make_shared(); - - lite::Program program(*program_faker.program()->Proto(), scope, places); - graph.Build(program, places); - - Visualize(&graph); -} - -} // namespace mir -} // namespace lite -} // namespace paddle - -USE_LITE_OP(fc); -#ifdef LITE_WITH_X86 -// USE_LITE_KERNEL(fc, kX86, kFloat, kNCHW, def); -#endif diff --git a/lite/core/mir/static_kernel_pick_pass.cc b/lite/core/mir/static_kernel_pick_pass.cc deleted file mode 100644 index 37bcb1e317..0000000000 --- a/lite/core/mir/static_kernel_pick_pass.cc +++ /dev/null @@ -1,136 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
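[Editor's note] Before the kernel-pick pass, one note on the SSAGraph API above: the usual flow is build, validate, then traverse statements in dependency order. A hedged sketch, with `program` and `valid_places` as in the test above:

    SSAGraph graph;
    graph.Build(program, valid_places);  // wires op and argument nodes
    graph.CheckValid();                  // bidirectional links and roles

    // Statements come back in topological order, producers before consumers.
    for (auto* node : graph.StmtTopologicalOrder()) {
      VLOG(4) << node->AsStmt().op_info()->Type();
    }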
- -#include "lite/core/mir/static_kernel_pick_pass.h" -#include -#include -#include -#include -#include "lite/core/mir/pass_registry.h" - -namespace paddle { -namespace lite { -namespace mir { - -bool KernelScoreCmp(const std::pair>& a, - const std::pair>& b) { - return a.first > b.first; -} - -void StaticKernelPickPass::Apply(const std::unique_ptr& graph) { - CHECK(kernel_pick_factors_.any_factor_considered()) - << "kernel_pick_factors should be specified first"; - CHECK(graph) << "graph not valid"; - // sort kernels by the factors. - - for (auto& node : graph->mutable_nodes()) { - if (!node.IsStmt()) continue; - auto& instruct = node.AsStmt(); - - // Get candidate kernels - std::vector>> scored; - CHECK(!instruct.kernels().empty()) << "No kernels found for " - << instruct.op_type(); - for (auto&& kernel : instruct.kernels()) { - size_t score = KernelGrade(*kernel); - scored.emplace_back(score, std::move(kernel)); - } - std::sort(scored.begin(), scored.end(), KernelScoreCmp); - instruct.kernels().clear(); - - if (!instruct.op_info()->HasAttr("enable_int8")) { - // Move kernel back - // Just keep a single best kernel. - // TODO(Superjomn) reconsider this. - instruct.kernels().emplace_back(std::move(scored.front().second)); - VLOG(2) << "pick " << instruct.kernels().front()->name(); - - } else { - bool out_type_int8 = true; - // Only if all ops linked to this op output has enable_int8 attr, - // then the op output type is int8, or fp32. - for (auto* out_n : node.outlinks) { - CHECK(out_n->IsArg()); - for (auto* tmp_op : out_n->outlinks) { - CHECK(tmp_op->IsStmt()); - if (!tmp_op->AsStmt().op_info()->HasAttr("enable_int8")) { - out_type_int8 = false; - break; - } - } - if (!out_type_int8) break; - } - // If the out_type_int8 is true, it turns out that the output type of this - // op can be int8. - // So we need to specify output scale for this op. - if (out_type_int8) { - auto out_node = node.outlinks.front(); - CHECK(out_node->IsArg()); - auto one_adj_op_node = out_node->outlinks.front(); - CHECK(one_adj_op_node->IsStmt()); - auto& one_adj_instruct = one_adj_op_node->AsStmt(); - CHECK(one_adj_instruct.op_info()->HasAttr("enable_int8")); - CHECK(one_adj_instruct.op_info()->HasAttr("input_scale")); - - instruct.mutable_op_info()->SetAttr( - "output_scale", - one_adj_instruct.op_info()->GetAttr("input_scale")); - - auto update_desc = *instruct.mutable_op_info(); - instruct.ResetOp(update_desc, graph->valid_places()); - scored.clear(); - for (auto&& kernel : instruct.kernels()) { - size_t score = KernelGrade(*kernel); - scored.emplace_back(score, std::move(kernel)); - } - std::sort(scored.begin(), scored.end(), KernelScoreCmp); - instruct.kernels().clear(); - } - // If the out_type_int8 is true, we should pick the kernel with the - // int8 input and int8 output. - // If the out_type_int8 is false, we should pick the kernel with the - // int8 input and fp32 output. - auto output_arguments = instruct.op_info()->OutputArgumentNames(); - for (auto& candidate : scored) { - bool all_output_type_match = true; - auto expect_output_type = - out_type_int8 ? 
PRECISION(kInt8) : PRECISION(kFloat); - - for (auto& arg_name : output_arguments) { - const Type* out_arg_ty = - candidate.second->GetOutputDeclType(arg_name); - if (out_arg_ty->precision() != expect_output_type) { - all_output_type_match = false; - } - } - - if (all_output_type_match) { - instruct.kernels().emplace_back(std::move(candidate.second)); - VLOG(2) << "pick " << instruct.kernels().front()->name(); - break; - } - } - CHECK(!instruct.kernels().empty()) << "No kernels found for " - << instruct.op_type(); - } - } -} - -} // namespace mir -} // namespace lite -} // namespace paddle - -REGISTER_MIR_PASS(static_kernel_pick_pass, - paddle::lite::mir::StaticKernelPickPass) - .SetTargets({TARGET(kAny)}); diff --git a/lite/core/mir/static_kernel_pick_pass.h b/lite/core/mir/static_kernel_pick_pass.h deleted file mode 100644 index 3412278229..0000000000 --- a/lite/core/mir/static_kernel_pick_pass.h +++ /dev/null @@ -1,97 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include "lite/core/mir/pass.h" -#include "lite/core/types.h" - -namespace paddle { -namespace lite { -namespace mir { - -/* - * StaticKernelPickPass is a simple strategy for picking the kernel for each - * Operator using operator developer defined rule, there are many other tactics - * such as considering IO or kernel execution latency and we will implement them - * latter. - * - * There are two argument for this pass: - * - place, the target place. - * - kernel_pick_factors, the factors to consider in picking kernels. - * Set them first before execute the pass. - */ -class StaticKernelPickPass : public mir::StmtPass { - public: - void Apply(const std::unique_ptr& graph) override; - - void SetPreferPlace(const Place& place) { place_ = place; } - const Place& place() const { return place_; } - const core::KernelPickFactor& kernel_pick_factors() const { - return kernel_pick_factors_; - } - core::KernelPickFactor* mutable_kernel_pick_factors() { - return &kernel_pick_factors_; - } - - private: - // Score the kernel. 
- size_t KernelGrade(const lite::KernelBase& kernel) { - size_t score{}; - const int kMax = - std::numeric_limits::max(); - - // The more important factor comes first - if (kernel_pick_factors_.IsTargetConsidered() && - (place().target == kernel.target() || kernel.target() == TARGET(kAny) || - place().target == TARGET(kAny))) { - score += - kMax / static_cast(core::KernelPickFactor::Factor::TargetFirst); - } - if (kernel_pick_factors_.IsPrecisionConsidered() && - (place().precision == kernel.precision() || - kernel.precision() == PRECISION(kAny) || - place().precision == PRECISION(kAny))) { - score += kMax / - static_cast(core::KernelPickFactor::Factor::PrecisionFirst); - } - if (kernel_pick_factors_.IsDataLayoutConsidered() && - (place().layout == kernel.layout() || - kernel.layout() == DATALAYOUT(kAny) || - place().layout == DATALAYOUT(kAny))) { - score += kMax / static_cast( - core::KernelPickFactor::Factor::DataLayoutFirst); - } - VLOG(4) << "picker tactic " << kernel_pick_factors_; - VLOG(4) << "kernel place " << kernel.place().DebugString(); - VLOG(4) << "picker place " << place().DebugString(); - VLOG(4) << "score " << score; - - // The data layout is not considered, for the input and output arguments - // might have different data layout. - // TODO(Superjomn) reconsider the idea of taking the data layout as a kernel - // specification. - return score; - } - - private: - core::KernelPickFactor kernel_pick_factors_; - Place place_; -}; - -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/subgraph/CMakeLists.txt b/lite/core/mir/subgraph/CMakeLists.txt deleted file mode 100644 index 9984e202db..0000000000 --- a/lite/core/mir/subgraph/CMakeLists.txt +++ /dev/null @@ -1,34 +0,0 @@ - -lite_cc_library(subgraph_pass - SRCS subgraph_program_pass.cc - DEPS mir_pass types ${mir_fusers}) -lite_cc_test(test_subgraph_pass SRCS subgraph_program_pass_test.cc - DEPS subgraph_pass mir_passes gflags model_parser cxx_api - ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v1 SERIAL) -if (WITH_TESTING) - add_dependencies(test_subgraph_pass extern_lite_download_mobilenet_v1_tar_gz) - add_dependencies(test_subgraph_pass extern_lite_download_mobilenet_v2_relu_tar_gz) - set(LINK_FLAGS "-Wl,--version-script ${PADDLE_SOURCE_DIR}/lite/core/lite.map") - set_target_properties(test_subgraph_pass PROPERTIES LINK_FLAGS "${LINK_FLAGS}") -endif() - -set(subgraph_passes subgraph_pass) - -if(LITE_WITH_NPU) - lite_cc_library(npu_pass SRCS generate_npu_program_pass.cc - DEPS mir_pass types context ${mir_fusers} ${npu_bridges} npu_helper ${npu_ddk_libs} graph_op subgraph_pass) - list(APPEND subgraph_passes npu_pass) - lite_cc_test(test_npu_pass SRCS generate_npu_program_pass_test.cc - DEPS npu_pass cxx_api mir_passes gflags - ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v1 - --optimized_model=${LITE_MODEL_DIR}/lite_npu_model_opt SERIAL) - if (WITH_TESTING) - add_dependencies(test_npu_pass extern_lite_download_mobilenet_v1_tar_gz) - add_dependencies(test_subgraph_pass extern_lite_download_mobilenet_v2_relu_tar_gz) - set(LINK_FLAGS "-Wl,--version-script ${PADDLE_SOURCE_DIR}/lite/core/lite.map") - set_target_properties(test_npu_pass PROPERTIES LINK_FLAGS "${LINK_FLAGS}") - endif() -endif() - -set(subgraph_passes ${subgraph_passes} CACHE INTERNAL "subgraph_passes") -message(STATUS "----> subgraph_passes: ${subgraph_passes}") diff --git a/lite/core/mir/subgraph/generate_npu_program_pass.cc b/lite/core/mir/subgraph/generate_npu_program_pass.cc deleted file mode 100644 index 
76e295c7af..0000000000 --- a/lite/core/mir/subgraph/generate_npu_program_pass.cc +++ /dev/null @@ -1,218 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/mir/subgraph/generate_npu_program_pass.h" -#include -#include -#include -#include -#include -#include "lite/core/mir/graph_visualize_pass.h" -#include "lite/core/mir/pass_registry.h" -#include "lite/core/mir/pattern_matcher.h" - -#include "ai_ddk_lib/include/HiAiModelManagerService.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" // for ge::op::Data -#include "ai_ddk_lib/include/graph/operator_reg.h" -#include "lite/backends/npu/bridge/paddle_use_npu_bridges.h" -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/bridge/utils.h" -#include "lite/backends/npu/npu_helper.h" - -namespace paddle { -namespace lite { -namespace mir { -namespace subgraph { - -std::shared_ptr GenerateNPUProgramPass::CvtVarNode( - lite::mir::Node* var_node, const Scope* scope) { - CHECK(var_node->IsArg()); - const auto& arg = var_node->AsArg(); - VLOG(4) << "Convert var node " << arg.name; - - auto* var = scope->FindVar(arg.name); - CHECK(var); - auto* tensor = var->GetMutable(); - CHECK(tensor); - auto dims = tensor->dims(); - if (arg.is_weight) { - auto wgt = std::make_shared(arg.name); - LOG(INFO) << "in convert const:" << arg.name; - VLOG(4) << dims; - wgt->set_attr_value(lite::npu::bridge::CvtFromLiteTensor(tensor)); - return wgt; - } else { - CHECK_EQ(dims.size(), 4); - LOG(INFO) << "in convert data:" << arg.name; - LOG(INFO) << dims; - // TODO(xxx): support more types and dims size - ge::TensorDesc desc(ge::Shape(dims.Vectorize()), - ge::Format::FORMAT_NCHW, - ge::DataType::DT_FLOAT); - - // auto size = desc.GetShape().GetShapeSize(); - // ge::TensorUtils::SetSize(desc, size*sizeof(float)); - // ge::TensorUtils::SetRealDimCnt(desc, 4); - auto data = std::make_shared(arg.name); - data->update_input_desc_x(desc); - return data; - } - return nullptr; -} - -void GenerateNPUProgramPass::CvtAllOpNodes( - const std::vector& nodes2cvt, - lite::npu::bridge::node_map_type* converted_vars) { - const auto& bridges = lite::npu::bridge::Factory::Instance(); - const auto& cvtfunc_map = bridges.AllFunctions(); - // return record all converted vars - // op node's inputs must be found in converted_vars - for (auto& node : nodes2cvt) { - lite::npu::bridge::node_map_type node_inputs; - auto& stmt = node->AsStmt(); - for (auto& var_node : node->inlinks) { - auto& arg = var_node->AsArg(); - // weight should be handled in the converter, so skip here - if (arg.is_weight) { - continue; - } - auto var_name = arg.name; - if (!converted_vars->count(var_name)) { - converted_vars->insert( - std::make_pair(var_name, CvtVarNode(var_node, stmt.op()->scope()))); - } - node_inputs.insert(*converted_vars->find(var_name)); - } - auto node_outputs = 
cvtfunc_map.at(stmt.op_type())(stmt.op(), node_inputs); - converted_vars->insert(node_outputs.begin(), node_outputs.end()); - } -} - -std::string GenerateNPUProgramPass::BuildNPUGraph( - const std::unordered_set& op_nodes, - const std::unordered_set& in_data_vars, - const std::unordered_set& out_data_vars, - int sub_id) { - auto ordered_nodes = GetTopologicalOrder(op_nodes); - lite::npu::bridge::node_map_type converted_vars; - CvtAllOpNodes(ordered_nodes, &converted_vars); - - std::vector in_var_names; - std::vector out_var_names; - std::vector inputs; - std::vector outputs; - for (auto i : in_data_vars) { - auto argname = i->AsArg().name; - in_var_names.push_back(argname); - inputs.push_back(*converted_vars.at(argname)); - } - for (auto i : out_data_vars) { - auto argname = i->AsArg().name; - out_var_names.push_back(argname); - outputs.push_back(*converted_vars.at(argname)); - } - - std::string model_name("hiai_npu_client_" + std::to_string(sub_id) + ".om"); - if (!npu::BuildNPUClient(inputs, outputs, model_name)) { - LOG(WARNING) << "Build NPU failed subgraph " << sub_id; - throw std::runtime_error("Build NPU failed subgraph."); - } - LOG(INFO) << "[NPU] Build NPU Client success subgraph " << sub_id; - return model_name; -} - -void GenerateNPUProgramPass::GenNPUSubgraph( - const std::unique_ptr& graph, - const std::unordered_set& op_nodes, - int sub_id) { - std::unordered_set in_data_vars; - std::unordered_set in_wgt_vars; - std::unordered_set out_data_vars; - std::unordered_set out_unused_vars; - FindInputOutputVars( - op_nodes, &in_data_vars, &in_wgt_vars, &out_data_vars, &out_unused_vars); - - auto model_name = - BuildNPUGraph(op_nodes, in_data_vars, out_data_vars, sub_id); - - auto any_op = (*op_nodes.begin())->AsStmt().op(); - InsertNewNode(graph, - model_name, - any_op->scope(), - any_op->valid_places(), - in_data_vars, - in_wgt_vars, - out_data_vars, - out_unused_vars); - - auto nodes2rm = GetNode2rm( - op_nodes, {in_data_vars, in_wgt_vars, out_data_vars, out_unused_vars}); - - GraphSafeRemoveNodes(graph.get(), nodes2rm); -} - -void GenerateNPUProgramPass::Apply(const std::unique_ptr& graph) { - LOG(INFO) << "Before NPU Pass \n" << Visualize(graph.get()); - const auto& bridges = lite::npu::bridge::Factory::Instance(); - const auto& op_map = bridges.AllFunctions(); - std::vector supported_op_types; - for (auto& i : op_map) { - LOG(INFO) << "Supported type: " << i.first; - supported_op_types.push_back(i.first); - } - - try { - int num_subgraph = FuseSubgraph(graph, supported_op_types); - InferOnce(graph); - auto op_nodes_all = ClassifySubgraph(graph); - CHECK_EQ(op_nodes_all.size(), num_subgraph); - int id = 1; - for (auto& op_nodes : op_nodes_all) { - LOG(INFO) << "Converting subgraph_id:" << id; - GenNPUSubgraph(graph, op_nodes.second, id); - LOG(INFO) << "After NPU Pass Subgraph " << id << "\n" - << Visualize(graph.get()); - id++; - } - } catch (...) 
{ - LOG(WARNING) << "Build NPU graph failed"; - throw std::runtime_error("Build NPU graph failed"); - } - - for (auto& item : graph->StmtTopologicalOrder()) { - if (item->IsStmt()) { - auto& stmt = item->AsStmt(); - LOG(INFO) << stmt; - insts_.emplace_back(stmt.op(), std::move(stmt.kernels().front())); - } - } -} - -std::unique_ptr GenerateNPUProgramPass::GenProgram() { - LOG(INFO) << "insts.size " << insts_.size(); - std::unique_ptr program( - new RuntimeProgram(std::move(insts_))); - return program; -} - -} // namespace subgraph -} // namespace mir -} // namespace lite -} // namespace paddle - -REGISTER_MIR_PASS(generate_npu_program_pass, - paddle::lite::mir::subgraph::GenerateNPUProgramPass) - .SetTargets({TARGET(kAny)}); diff --git a/lite/core/mir/subgraph/generate_npu_program_pass.h b/lite/core/mir/subgraph/generate_npu_program_pass.h deleted file mode 100644 index 9e030287cb..0000000000 --- a/lite/core/mir/subgraph/generate_npu_program_pass.h +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include "lite/backends/npu/bridge/registry.h" -#include "lite/backends/npu/npu_helper.h" -#include "lite/core/mir/pass.h" -#include "lite/core/mir/subgraph/subgraph_program_pass.h" - -namespace paddle { -namespace lite { -namespace mir { -namespace subgraph { - -class GenerateNPUProgramPass : public SubgraphProgramPass { - public: - using key2nodes_t = std::map; - - void Apply(const std::unique_ptr& graph) override; - std::unique_ptr GenProgram(); - - protected: - // nodes2cvt: op nodes to convert - // return cvted_vars: converted var nodes - void CvtAllOpNodes(const std::vector& nodes2cvt, - lite::npu::bridge::node_map_type* cvted_vars); - - std::shared_ptr CvtVarNode(lite::mir::Node* var_node, - const Scope* scope); - - std::string BuildNPUGraph(const std::unordered_set& op_nodes, - const std::unordered_set& in_data_vars, - const std::unordered_set& out_data_vars, - int sub_id); - - void GenNPUSubgraph(const std::unique_ptr& graph, - const std::unordered_set& op_nodes, - int sub_id); - - private: - std::vector insts_; -}; - -} // namespace subgraph -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/subgraph/generate_npu_program_pass_test.cc b/lite/core/mir/subgraph/generate_npu_program_pass_test.cc deleted file mode 100644 index a1f39441cb..0000000000 --- a/lite/core/mir/subgraph/generate_npu_program_pass_test.cc +++ /dev/null @@ -1,114 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include "lite/core/mir/graph_visualize_pass.h" -#include "lite/core/mir/subgraph/subgraph_program_pass.h" -#include "lite/core/op_registry.h" -#include "lite/core/program.h" -#include "lite/core/tensor.h" - -#include "lite/api/cxx_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/api/test_helper.h" - -#include "lite/model_parser/pb/program_desc.h" - -DEFINE_string(optimized_model, "", "optimized_model"); -DEFINE_int32(batch_size, 1, "batch size"); -DEFINE_int32(im_channel, 3, "im_channel"); - -namespace paddle { -namespace lite { - -void TestModel(lite::Predictor* predictor, - const std::vector& valid_places, - const std::string& model_dir) { - predictor->Build(model_dir, - model_dir + "/model", - model_dir + "/params", - Place{TARGET(kARM), PRECISION(kFloat)}, - valid_places); - - auto* input_tensor = predictor->GetInput(0); - input_tensor->Resize(DDim(std::vector( - {FLAGS_batch_size, FLAGS_im_channel, FLAGS_im_height, FLAGS_im_width}))); - auto* data = input_tensor->mutable_data(); - auto item_size = input_tensor->dims().production(); - for (int i = 0; i < item_size; i++) { - data[i] = 1; - } - - predictor->Run(); - if (model_dir != FLAGS_optimized_model && - std::find(valid_places.begin(), - valid_places.end(), - Place{TARGET(kNPU), PRECISION(kFloat)}) != valid_places.end()) { - predictor->SaveModel(FLAGS_optimized_model); - } -} - -void CompareOutData(const lite::Predictor& tgt, const lite::Predictor& ref) { - auto* tgt_otensor = tgt.GetOutput(0); - auto* ref_otensor = ref.GetOutput(0); - const auto* tgt_pdata = tgt_otensor->data(); - const auto* ref_pdata = ref_otensor->data(); - EXPECT_EQ(tgt_otensor->dims().production(), ref_otensor->dims().production()); - for (size_t i = 0; i < tgt_otensor->dims().production(); ++i) { - auto diff = std::fabs(tgt_pdata[i] - ref_pdata[i]) / - (std::fabs(ref_pdata[i]) + 1e-6); - VLOG(3) << diff; - EXPECT_LT(diff, 0.1); - } -} - -TEST(NPUSubgraph, compare) { - DeviceInfo::Init(); - DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, 1); - - lite::Predictor predictor_arm, predictor_npu, predictor_npu_savedmodel; - std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}}); - - TestModel(&predictor_arm, valid_places, FLAGS_model_dir); - - valid_places.push_back(Place{TARGET(kNPU), PRECISION(kFloat)}); - TestModel(&predictor_npu, valid_places, FLAGS_model_dir); - - CompareOutData(predictor_npu, predictor_arm); - LOG(INFO) << " ================ NPU speed ================== "; - for (int i = 0; i < FLAGS_repeats; ++i) { - auto start = GetCurrentUS(); - predictor_npu.Run(); - LOG(INFO) << i << ", " << GetCurrentUS() - start << "us"; - } - - LOG(INFO) << " =================== ARM CPU speed =================== "; - for (int i = 0; i < FLAGS_repeats; ++i) { - auto start = GetCurrentUS(); - predictor_arm.Run(); - LOG(INFO) << i << ", " << GetCurrentUS() - start << "us"; - } - - TestModel(&predictor_npu_savedmodel, valid_places, FLAGS_optimized_model); - - 
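-  // Reload the model saved by the NPU predictor above and verify that it still matches the pure ARM CPU reference output.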
CompareOutData(predictor_npu_savedmodel, predictor_arm); -} - -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/subgraph/subgraph_program_pass.cc b/lite/core/mir/subgraph/subgraph_program_pass.cc deleted file mode 100644 index 2b6206f891..0000000000 --- a/lite/core/mir/subgraph/subgraph_program_pass.cc +++ /dev/null @@ -1,314 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/mir/subgraph/subgraph_program_pass.h" -#include -#include -#include -#include -#include "lite/core/mir/graph_visualize_pass.h" -#include "lite/core/mir/pass_registry.h" -#include "lite/core/mir/pattern_matcher.h" - -namespace paddle { -namespace lite { -namespace mir { -namespace subgraph { - -std::unordered_map> -SubgraphProgramPass::ClassifySubgraph(const std::unique_ptr& graph) { - std::unordered_map> op_nodes; - for (auto& item : graph->StmtTopologicalOrder()) { - if (!item->IsStmt()) continue; - auto& stmt = item->AsStmt(); - int sub_id = stmt.subgraph_id(); - if (sub_id < 1) continue; - if (!op_nodes.count(sub_id)) { - op_nodes[sub_id] = std::unordered_set(); - } - op_nodes.at(sub_id).insert(item); - } - return op_nodes; -} - -cpp::OpDesc SubgraphProgramPass::GenGraphOpDesc( - const std::string& model_name, - const std::vector& in_var_names, - const std::vector& out_var_names) { - cpp::OpDesc op_desc; - op_desc.SetType("graph_op"); - op_desc.SetInput("Inputs", in_var_names); - op_desc.SetOutput("Outputs", out_var_names); - op_desc.SetAttr("model_name", model_name); - return op_desc; -} - -void SubgraphProgramPass::InsertNewNode( - const std::unique_ptr& graph, - const std::string& model_name, - Scope* scope, - const std::vector& valid_places, - std::unordered_set in_data_vars, - std::unordered_set in_wgt_vars, - std::unordered_set out_data_vars, - std::unordered_set out_unused_vars) { - std::vector in_var_names; - std::vector out_var_names; - for (auto i : in_data_vars) { - in_var_names.push_back(i->AsArg().name); - } - for (auto i : out_data_vars) { - out_var_names.push_back(i->AsArg().name); - } - - auto op_desc = GenGraphOpDesc(model_name, in_var_names, out_var_names); - - auto graph_op = LiteOpRegistry::Global().Create("graph_op"); - graph_op->Attach(op_desc, scope); - auto* new_op_node = graph->GraphCreateInstructNode(graph_op, valid_places); - - for (auto& in_var : in_data_vars) { - IR_NODE_LINK_TO(in_var, new_op_node); - } - for (auto& in_var : in_wgt_vars) { - IR_NODE_LINK_TO(in_var, new_op_node); - } - for (auto& out_var : out_data_vars) { - IR_OP_VAR_LINK(new_op_node, out_var); - } - for (auto& out_var : out_unused_vars) { - IR_OP_VAR_LINK(new_op_node, out_var); - } - - // assign context - auto& inst = new_op_node->AsStmt(); - inst.picked_kernel().SetContext( - ContextScheduler::Global().NewContext(inst.picked_kernel().target())); -} - -void SubgraphProgramPass::SortHelper( - Node* node, - const std::unordered_set& nodes_all, - std::unordered_set* visited_nodes, - std::vector* ret) 
{
-  for (auto& var_node : node->inlinks) {
-    if (var_node->inlinks.empty()) continue;
-    auto* op_node = var_node->inlinks.front();
-    if (nodes_all.count(op_node) && !visited_nodes->count(op_node)) {
-      SortHelper(op_node, nodes_all, visited_nodes, ret);
-    }
-  }
-  ret->push_back(node);
-  visited_nodes->insert(node);
-}
-
-std::vector<Node*> SubgraphProgramPass::GetTopologicalOrder(
-    const std::unordered_set<Node*>& nodes) {
-  std::unordered_set<Node*> visited;
-  std::vector<Node*> ret;
-  for (auto& node : nodes) {
-    if (!node->IsStmt()) continue;
-    if (visited.count(node)) continue;
-    SortHelper(node, nodes, &visited, &ret);
-  }
-  return ret;
-}
-
-void SubgraphProgramPass::FindInputOutputVars(
-    const std::unordered_set<Node*>& op_nodes,
-    std::unordered_set<Node*>* in_data_vars,
-    std::unordered_set<Node*>* in_wgt_vars,
-    std::unordered_set<Node*>* out_data_vars,
-    std::unordered_set<Node*>* out_unused_vars) {
-  for (auto& op_node : op_nodes) {
-    for (auto& in_var : op_node->inlinks) {
-      if (in_var->AsArg().is_weight) {
-        in_wgt_vars->insert(in_var);
-        continue;
-      }
-      if (!in_var->inlinks.empty()) {
-        // a var can only come from one op node, so use front()
-        auto* pre_op_node = in_var->inlinks.front();
-        if (op_nodes.count(pre_op_node)) {
-          continue;
-        }
-      }
-      in_data_vars->insert(in_var);
-    }
-    for (auto& out_var : op_node->outlinks) {
-      if (out_var->outlinks.empty()) {
-        // there is no next op, so this var is actually unused
-        out_unused_vars->insert(out_var);
-        continue;
-      }
-      // a var can have more than one next op node,
-      // so if any of them is in op_nodes, continue
-      bool next_op_in_nodes = false;
-      for (auto& next_op_node : out_var->outlinks) {
-        if (op_nodes.count(next_op_node)) {
-          next_op_in_nodes = true;
-        }
-      }
-      if (next_op_in_nodes) {
-        continue;
-      }
-
-      out_data_vars->insert(out_var);
-    }
-  }
-}
-
-std::unordered_set<const Node*> SubgraphProgramPass::GetNode2rm(
-    const std::unordered_set<Node*>& op_nodes,
-    const std::vector<std::unordered_set<Node*>>& excluded_nodes) {
-  std::unordered_set<const Node*> nodes2rm(op_nodes.begin(), op_nodes.end());
-  for (auto& op_node : op_nodes) {
-    for (auto& in_var : op_node->inlinks) {
-      if (!nodes2rm.count(in_var)) {
-        nodes2rm.insert(in_var);
-      }
-    }
-    for (auto& out_var : op_node->outlinks) {
-      if (!nodes2rm.count(out_var)) {
-        nodes2rm.insert(out_var);
-      }
-    }
-  }
-  // some nodes should not be removed
-  for (auto& e : excluded_nodes) {
-    for (auto& i : e) {
-      if (nodes2rm.count(i)) {
-        nodes2rm.erase(i);
-      }
-    }
-  }
-  return nodes2rm;
-}
-
-void SubgraphProgramPass::InferOnce(const std::unique_ptr<SSAGraph>& graph) {
-  for (auto& item : graph->StmtTopologicalOrder()) {
-    if (!item->IsStmt()) continue;
-    auto& stmt = item->AsStmt();
-    auto& op = stmt.op();
-    op->CheckShape();
-    op->InferShape();
-    // TODO(xxx): remove Launch() eventually
-    auto& kernels = stmt.kernels();
-    if (!kernels.empty()) {
-      auto& kernel = stmt.kernels().front();
-      if (kernel) {
-        kernel->Launch();
-      }
-    }
-  }
-}
-
-void SubgraphProgramPass::InitSubgraphID(
-    const std::unique_ptr<SSAGraph>& graph,
-    const std::vector<std::string>& supported_op_types) {
-  for (auto& item : graph->StmtTopologicalOrder()) {
-    if (!item->IsStmt()) continue;
-    auto& stmt = item->AsStmt();
-    stmt.ClearSubgraphID();
-    if (std::find(supported_op_types.begin(),
-                  supported_op_types.end(),
-                  stmt.op_type()) != supported_op_types.end()) {
-      stmt.SetSubgraphID(0);
-      LOG(INFO) << "supported " << stmt.op_type();
-    } else {
-      LOG(INFO) << "======= not supported " << stmt.op_type();
-    }
-  }
-}
-
-// mark the current node and all supported nodes connected to its outputs
-void SubgraphProgramPass::ChangeAllOutConnectedID(Node* node,
-                                                  int to_id,
-                                                  int from_id) {
-  if (node->IsStmt()) {
-    auto& stmt = node->AsStmt();
-    if (stmt.subgraph_id() == from_id) {
-      stmt.SetSubgraphID(to_id);
-      for (auto& i : node->outlinks) {
-        ChangeAllOutConnectedID(i, to_id, from_id);
-      }
-    } else {
-      LOG(INFO) << "failed op type:" << stmt.op_type();
-      return;
-    }
-  } else {
-    // this is an arg node
-    bool all_out_op_supported = true;
-    for (auto& i : node->outlinks) {
-      if (!i->IsStmt()) return;
-      auto& stmt = i->AsStmt();
-      if (stmt.subgraph_id() < from_id) {
-        all_out_op_supported = false;
-      }
-    }
-    if (!all_out_op_supported) {
-      return;
-    }
-    for (auto& i : node->outlinks) {
-      CHECK(i->IsStmt());
-      auto& stmt = i->AsStmt();
-      if (stmt.subgraph_id() == from_id) {
-        stmt.SetSubgraphID(to_id);
-        for (auto& o : i->outlinks) {
-          ChangeAllOutConnectedID(o, to_id, from_id);
-        }
-      }
-    }
-  }
-}
-
-int SubgraphProgramPass::FuseSubgraphID(
-    const std::unique_ptr<SSAGraph>& graph) {
-  int sub_id = 1;  // ids start from 1, not 0
-  for (auto& item : graph->StmtTopologicalOrder()) {
-    bool inputvar = 0;  // note: computed below but never used
-    if (!item->IsStmt()) continue;
-    auto& stmt = item->AsStmt();
-    if (stmt.subgraph_id() == -1) {
-      for (auto& i : item->outlinks) {
-        for (auto& j : i->outlinks) {
-          if (j->IsStmt()) {
-            auto& jstmt = j->AsStmt();
-            if (jstmt.subgraph_id() == 0) inputvar = 1;
-          }
-        }
-      }
-    }
-    if (stmt.subgraph_id() != 0) continue;
-    ChangeAllOutConnectedID(item, sub_id);
-    sub_id++;
-  }
-  return sub_id - 1;
-}
-
-int SubgraphProgramPass::FuseSubgraph(
-    const std::unique_ptr<SSAGraph>& graph,
-    const std::vector<std::string>& supported_op_types) {
-  InitSubgraphID(graph, supported_op_types);
-  return FuseSubgraphID(graph);
-}
-}  // namespace subgraph
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
-
-REGISTER_MIR_PASS(subgraph_program_pass,
-                  paddle::lite::mir::subgraph::SubgraphProgramPass)
-    .SetTargets({TARGET(kAny)});
diff --git a/lite/core/mir/subgraph/subgraph_program_pass.h b/lite/core/mir/subgraph/subgraph_program_pass.h
deleted file mode 100644
index 51e9367539..0000000000
--- a/lite/core/mir/subgraph/subgraph_program_pass.h
+++ /dev/null
@@ -1,105 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <map>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-#include "lite/core/mir/pass.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-namespace subgraph {
-
-class SubgraphProgramPass : public ProgramPass {
- public:
-  using key2nodes_t = std::map<std::string, Node*>;
-
-  // mark all linked ops in a subgraph with the same subgraph_id;
-  // returns the number of fused subgraphs
-  int FuseSubgraph(const std::unique_ptr<SSAGraph>& graph,
-                   const std::vector<std::string>& supported_op_types);
-
-  void Apply(const std::unique_ptr<SSAGraph>& graph) override {}
-
- protected:
-  void InferOnce(const std::unique_ptr<SSAGraph>& graph);
-
-  // clear all subgraph ids and mark all ops that could be fused with id zero
-  void InitSubgraphID(const std::unique_ptr<SSAGraph>& graph,
-                      const std::vector<std::string>& supported_op_types);
-
-  // mark all linked ops in a subgraph with the same subgraph_id;
-  // returns the number of fused subgraphs
-  int FuseSubgraphID(const std::unique_ptr<SSAGraph>& graph);
-
-  // // GenerateFusedGraph:
-  // std::unique_ptr<SSAGraph> GenerateFusedGraph(const
-  // std::unique_ptr<SSAGraph>& graph, int sub_num);
-  void ChangeAllOutConnectedID(Node* node, int to_id, int from_id = 0);
-
-  // The functions below could be useful in child classes.
-  // classify nodes by subgraph id
-  std::unordered_map<int, std::unordered_set<Node*>> ClassifySubgraph(
-      const std::unique_ptr<SSAGraph>& graph);
-
-  // generate the graph op desc
-  cpp::OpDesc GenGraphOpDesc(const std::string& model_name,
-                             const std::vector<std::string>& in_var_names,
-                             const std::vector<std::string>& out_var_names);
-
-  // insert a new graph op node
-  void InsertNewNode(const std::unique_ptr<SSAGraph>& graph,
-                     const std::string& model_name,
-                     Scope* scope,
-                     const std::vector<Place>& valid_places,
-                     std::unordered_set<Node*> in_data_vars,
-                     std::unordered_set<Node*> in_wgt_vars,
-                     std::unordered_set<Node*> out_data_vars,
-                     std::unordered_set<Node*> out_unused_vars);
-
-  // sort and return the topological order of a node set
-  std::vector<Node*> GetTopologicalOrder(
-      const std::unordered_set<Node*>& nodes);
-
-  // find all input data vars, input weight vars,
-  // output data vars and unused output vars from the nodes
-  void FindInputOutputVars(const std::unordered_set<Node*>& op_nodes,
-                           std::unordered_set<Node*>* in_data_vars,
-                           std::unordered_set<Node*>* in_wgt_vars,
-                           std::unordered_set<Node*>* out_data_vars,
-                           std::unordered_set<Node*>* out_unused_vars);
-
-  // return the nodes to remove from the subgraph
-  std::unordered_set<const Node*> GetNode2rm(
-      const std::unordered_set<Node*>& op_nodes,
-      const std::vector<std::unordered_set<Node*>>& excluded_nodes);
-
- private:
-  // sort nodes into execution order
-  void SortHelper(Node* node,
-                  const std::unordered_set<Node*>& nodes_all,
-                  std::unordered_set<Node*>* visited_nodes,
-                  std::vector<Node*>* ret);
-};
-
-}  // namespace subgraph
-}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/mir/subgraph/subgraph_program_pass_test.cc b/lite/core/mir/subgraph/subgraph_program_pass_test.cc
deleted file mode 100644
index de4acec91d..0000000000
--- a/lite/core/mir/subgraph/subgraph_program_pass_test.cc
+++ /dev/null
@@ -1,223 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
- -#include "lite/core/mir/subgraph/subgraph_program_pass.h" -#include -#include -#include -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/core/mir/graph_visualize_pass.h" -#include "lite/core/mir/ssa_graph.h" -#include "lite/core/program.h" -#include "lite/model_parser/cpp/program_desc.h" -#include "lite/model_parser/model_parser.h" - -DEFINE_string(model_dir, "", "model_dir"); - -namespace paddle { -namespace lite { - -TEST(SubgraphTest, models) { - cpp::ProgramDesc program_desc; - auto scope = std::make_shared(); - // LoadModelPb(FLAGS_model_dir, - // FLAGS_model_dir + "/model", - // FLAGS_model_dir + "/params", - // scope.get(), - // &program_desc, - // true); - LoadModelPb(FLAGS_model_dir, "", "", scope.get(), &program_desc); - std::vector valid_places({ - Place{TARGET(kHost), PRECISION(kFloat)}, -#ifdef LITE_WITH_ARM - Place{TARGET(kARM), PRECISION(kFloat)}, -#endif -#ifdef LITE_WITH_NPU - Place{TARGET(kNPU), PRECISION(kFloat)}, -#endif - }); - lite::Program program(program_desc, scope, valid_places); - auto graph = std::unique_ptr(new mir::SSAGraph()); - graph->Build(program, valid_places); - - std::vector supported_op_types{"concat", - "conv2d", - "depthwise_conv2d", - "batch_norm", - "scale", - "pool2d", - "mul", - "elementwise_add", - "softmax", - "split", - "relu", - "reshape2", - "transpose2"}; - auto* pass = new mir::subgraph::SubgraphProgramPass; - ASSERT_EQ(pass->FuseSubgraph(graph, supported_op_types), 1); - LOG(INFO) << "After NPU Pass \n" << Visualize(graph.get()); -} - -// return output_var_names -std::vector AddFCDesc( - cpp::BlockDesc* block_desc, - const std::shared_ptr& scope, - const std::vector& input_var_names, - const std::vector& wshape) { - CHECK_EQ(input_var_names.size(), 1); - CHECK_EQ(wshape.size(), 2); - static int id = 0; - std::string prefix = "fc_" + std::to_string(id); - auto* op_desc = block_desc->AddOp(); - auto* wgt = block_desc->AddVar(); - auto* bias = block_desc->AddVar(); - auto* out = block_desc->AddVar(); - - wgt->SetName(prefix + "_W"); - bias->SetName(prefix + "_Bias"); - out->SetName(prefix + "_Out"); - std::vector out_var_names{prefix + "_Out"}; - - auto* wtensor = scope->Var(prefix + "_W")->GetMutable(); - wtensor->Resize(wshape); - wtensor->mutable_data(); - - auto* btensor = scope->Var(prefix + "_Bias")->GetMutable(); - btensor->Resize({wshape[1]}); - btensor->mutable_data(); - - scope->Var(prefix + "_Out")->GetMutable(); - - op_desc->SetType("fc"); - op_desc->SetInput("Input", input_var_names); - op_desc->SetInput("W", {prefix + "_W"}); - op_desc->SetInput("Bias", {prefix + "_Bias"}); - op_desc->SetAttr("in_num_col_dims", 1); - op_desc->SetOutput("Out", out_var_names); - id++; - return out_var_names; -} - -std::vector AddElementwiseAddDesc( - cpp::BlockDesc* block_desc, - const std::shared_ptr& scope, - const std::vector& input_X_names, - const std::vector& input_Y_names) { - // CHECK_EQ(input_var_names.size(), 2); - static int id = 0; - std::string prefix = "elementwise_add_" + std::to_string(id); - auto* op_desc = block_desc->AddOp(); - auto* out = block_desc->AddVar(); - - out->SetName(prefix + "_Out"); - std::vector out_var_names{prefix + "_Out"}; - - scope->Var(prefix + "_Out")->GetMutable(); - - op_desc->SetType("elementwise_add"); - op_desc->SetInput("X", input_X_names); - op_desc->SetInput("Y", input_Y_names); - op_desc->SetOutput("Out", out_var_names); - op_desc->SetAttr("axis", -1); - id++; - return out_var_names; -} - -std::vector AddFeedDesc( - cpp::BlockDesc* block_desc, - 
const std::shared_ptr& scope, - const std::vector& input_X_names) { - // CHECK_EQ(input_var_names.size(), 1); - static int id = 0; - std::string prefix = "feed_" + std::to_string(id); - auto* op_desc = block_desc->AddOp(); - auto* out = block_desc->AddVar(); - - out->SetName(prefix + "_Out"); - std::vector out_var_names{prefix + "_Out"}; - - scope->Var(prefix + "_Out")->GetMutable(); - - op_desc->SetType("feed"); - op_desc->SetInput("X", input_X_names); - op_desc->SetOutput("Out", out_var_names); - op_desc->SetAttr("col", 1); - id++; - return out_var_names; -} - -std::vector AddFetchDesc( - cpp::BlockDesc* block_desc, - const std::shared_ptr& scope, - const std::vector& input_X_names) { - // CHECK_EQ(input_var_names.size(), 1); - static int id = 0; - std::string prefix = "fetch_" + std::to_string(id); - auto* op_desc = block_desc->AddOp(); - auto* out = block_desc->AddVar(); - - out->SetName(prefix + "_Out"); - std::vector out_var_names{prefix + "_Out"}; - - scope->Var(prefix + "_Out")->GetMutable(); - - op_desc->SetType("fetch"); - op_desc->SetInput("X", input_X_names); - op_desc->SetOutput("Out", out_var_names); - op_desc->SetAttr("col", 1); - id++; - return out_var_names; -} - -std::unique_ptr BuildSimpleNet( - cpp::ProgramDesc* program_desc, - const std::shared_ptr& scope, - const std::vector& valid_places) { - program_desc->ClearBlocks(); - auto* block_desc = program_desc->AddBlock(); - block_desc->ClearOps(); - block_desc->ClearVars(); - - auto* var_desc = block_desc->AddVar(); - var_desc->SetName("feed_var"); - auto* feed_var = scope->Var("feed_var")->GetMutable(); - feed_var->Resize({1, 4}); - auto fc1_out = AddFCDesc(block_desc, scope, {"feed_var"}, {4, 5}); - auto fc2_out = AddFCDesc(block_desc, scope, fc1_out, {5, 2}); - - lite::Program program(*program_desc, scope, valid_places); - auto graph = std::unique_ptr(new mir::SSAGraph()); - graph->Build(program, valid_places); - - return graph; -} - -TEST(SubGraphTest, SimpleNet) { - cpp::ProgramDesc program_desc; - std::vector places{{TARGET(kHost), PRECISION(kFloat)}}; - auto scope = std::make_shared(); - auto graph = BuildSimpleNet(&program_desc, scope, places); - - std::vector supported_op_types{"fc"}; - auto* pass = new mir::subgraph::SubgraphProgramPass; - ASSERT_EQ(pass->FuseSubgraph(graph, supported_op_types), 1); - - const int num_nodes = graph->nodes().size(); - ASSERT_EQ(graph->nodes().size(), 9); - // LOG(INFO) << "After NPU Pass \n" << Visualize(graph.get()); -} - -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/type_layout_cast_pass.cc b/lite/core/mir/type_layout_cast_pass.cc deleted file mode 100644 index fbd3f9e1d2..0000000000 --- a/lite/core/mir/type_layout_cast_pass.cc +++ /dev/null @@ -1,177 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/core/mir/type_layout_cast_pass.h" -#include -#include -#include -#include -#include -#include "lite/core/mir/graph_visualize_pass.h" -#include "lite/core/mir/pass_registry.h" -#include "lite/utils/string.h" - -namespace paddle { -namespace lite { -namespace mir { - -void TypeLayoutTransformPass::Apply(const std::unique_ptr& graph) { - // Start from inputs of the graph, those should have place set. - std::list nodes; - for (auto& node : graph->mutable_nodes()) { - nodes.push_back(&node); - } - - for (auto& node : nodes) { - if (!node->IsStmt()) continue; - auto inlinks = node->inlinks; - for (auto* in : inlinks) { - ComplementInputs(graph.get(), node, in); - } - } - VLOG(3) << "\n" << Visualize(graph.get()); -} - -void TypeLayoutTransformPass::ComplementInputs(SSAGraph* graph, - Node* inst_node, - Node* in) { - // If this input is out of date. - if (inst_node->inlinks.end() == - std::find(inst_node->inlinks.begin(), inst_node->inlinks.end(), in)) - return; - - CHECK(inst_node->IsStmt()); - auto& inst = inst_node->AsStmt(); - CHECK(in->IsRoleSet()); - CHECK(in->IsArg()); - auto in_arg_name = in->AsArg().name; - std::string tmp; - CHECK(inst.op_info()->GetInputArgname(in_arg_name, &tmp)); - auto decl_arg_type = inst.picked_kernel().GetInputDeclType(tmp); - CHECK(in->AsArg().type); - if (!DataLayoutCompatible(*in->AsArg().type, *decl_arg_type)) { - VLOG(4) << "found Layout unmatched tensor: " << in->AsArg().name - << " for kernel " << inst.op()->DebugString() << " " - << *in->AsArg().type << " -> " << *decl_arg_type; - AddLayoutInst(*in->AsArg().type, - *decl_arg_type, - in, - graph, - inst_node, - graph->valid_places()); - } -} - -void TypeLayoutTransformPass::AddLayoutInst( - const Type& from, - const Type& to, - Node* in, - SSAGraph* graph, - Node* inst_node, - const std::vector& valid_places) { - CHECK(!valid_places.empty()) << "valid_place should be set"; - - CHECK(in->IsArg()); - auto node_id = [&] { return graph->nodes().size(); }; - auto layout_output_name = - string_format("%s/trans/%d", in->AsArg().name.c_str(), node_id()); - auto* layout_output_arg = graph->NewArgumentNode(layout_output_name); - auto* layout_inst = graph->NewInstructNode(); - - bool in_persist = in->AsArg().is_weight || in->AsArg().is_persist; - std::string layout_type = in_persist ? "layout_once" : "layout"; - // create Op and kernels. - auto layout_op = LiteOpRegistry::Global().Create(layout_type); - CHECK(layout_op) << "create op [" << layout_op << "] failed"; - layout_output_arg->AsArg().is_persist = in_persist; - // Create the new var manually. - inst_node->AsStmt().op()->scope()->Var(layout_output_name); - - // Create IoCopy Instruction. 
- cpp::OpDesc op_desc; - op_desc.SetType(layout_type); - op_desc.SetInput("Input", {in->AsArg().name}); - op_desc.SetOutput("Out", {layout_output_name}); - - layout_op->Attach(op_desc, inst_node->AsStmt().op()->scope()); - auto kernels = layout_op->CreateKernels(valid_places); - std::vector> selected_kernels; - bool is_found = false; - for (auto& kernel : kernels) { - const Type* in_arg_ty = kernel->GetInputDeclType("Input"); - const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); - if (TypeCompatible(*in_arg_ty, from)) { - is_found = true; - selected_kernels.emplace_back(std::move(kernel)); - // we pick the kernel - layout_inst->AsStmt(layout_type, std::move(kernels), layout_op); - break; - } - } - CHECK(is_found) << "Can't find a layout kernel for layout op: " << from - << ":" << in->AsArg().name << "->" << to << ":" - << inst_node->AsStmt().op_info()->Type(); - - // Remove the old link - RemoveDirectedLink(in, inst_node); - - // Update the original instruction OpDesc. - // Update its input to the layout_output_name - // Add new link, var -> new_inst, new_inst->newarg, newarg->inst - DirectedLink(in, layout_inst); - DirectedLink(layout_inst, layout_output_arg); - DirectedLink(layout_output_arg, inst_node); - - // reset opdesc and update kernel information - UpdateInputTo(inst_node->AsStmt().op()->mutable_op_info(), - in->AsArg().name, - layout_output_name); - auto original_selected_kernel = - std::move(inst_node->AsStmt().kernels().front()); - auto update_op_info = *inst_node->AsStmt().op_info(); - // ResetOp() will change the Stmt op_info_ value, - // after that the old op_info_ value will be nullified. - // So, we can't pass `*inst_node->AsStmt().op_info()` into ResetOp. - // `update_op_info` is the copy of `*inst_node->AsStmt().op_info(). - // Whenever update the op_info of a stmt, we should call its ResetOp(). - inst_node->AsStmt().ResetOp(update_op_info, graph->valid_places()); - inst_node->AsStmt().kernels().clear(); - inst_node->AsStmt().kernels().emplace_back( - std::move(original_selected_kernel)); - - std::string tmp; - if (inst_node->AsStmt().op_info()->GetInputArgname("a", &tmp)) { - CHECK(false) << "get old a " << tmp; - } - - for (auto& kernel : inst_node->AsStmt().kernels()) { - inst_node->AsStmt().op()->AttachKernel(kernel.get()); - } - - graph->CheckValid(); -} - -void TypeLayoutTransformPass::SetValidPlaces( - const std::vector& valid_places) { - CHECK(!valid_places.empty()); - valid_places_ = valid_places; -} - -} // namespace mir -} // namespace lite -} // namespace paddle - -REGISTER_MIR_PASS(type_layout_cast_pass, - paddle::lite::mir::TypeLayoutTransformPass) - .SetTargets({TARGET(kAny)}); diff --git a/lite/core/mir/type_layout_cast_pass.h b/lite/core/mir/type_layout_cast_pass.h deleted file mode 100644 index bf36214e1d..0000000000 --- a/lite/core/mir/type_layout_cast_pass.h +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include -#include "lite/core/mir/pass.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace mir { - -static void UpdateInputTo(cpp::OpDesc* desc, - const std::string& from, - const std::string& to) { - for (auto& item : *desc->mutable_inputs()) { - for (auto& input : item.second) { - if (input == from) { - input = to; - } - } - } -} - -class TypeLayoutTransformPass : public ProgramPass { - public: - void Apply(const std::unique_ptr& graph) override; - - void ComplementInputs(SSAGraph* graph, Node* inst_node, Node* in); - - void AddLayoutInst(const Type& from, - const Type& to, - Node* in, - SSAGraph* graph, - Node* inst_node, - const std::vector& valid_places); - - void SetValidPlaces(const std::vector& valid_places); - - const std::vector& valid_places() const { return valid_places_; } - - private: - std::vector valid_places_; -}; - -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/type_precision_cast_pass.cc b/lite/core/mir/type_precision_cast_pass.cc deleted file mode 100644 index 7cd22e25ac..0000000000 --- a/lite/core/mir/type_precision_cast_pass.cc +++ /dev/null @@ -1,183 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/mir/type_precision_cast_pass.h" -#include -#include -#include -#include -#include -#include "lite/core/mir/graph_visualize_pass.h" -#include "lite/core/mir/pass_registry.h" - -namespace paddle { -namespace lite { -namespace mir { - -void PrecisionCastPass::Apply(const std::unique_ptr& graph) { - // Start from inputs of the graph, those should have place set. - std::list nodes; - for (auto& node : graph->mutable_nodes()) { - nodes.push_back(&node); - } - - for (auto& node : nodes) { - if (!node->IsStmt()) continue; - auto inlinks = node->inlinks; - for (auto* in : inlinks) { - ComplementInputs(graph.get(), node, in); - } - } -} - -void PrecisionCastPass::ComplementInputs(SSAGraph* graph, - Node* inst_node, - Node* in) { - // If this input is out of date. - if (inst_node->inlinks.end() == - std::find(inst_node->inlinks.begin(), inst_node->inlinks.end(), in)) - return; - - CHECK(inst_node->IsStmt()); - auto& inst = inst_node->AsStmt(); - CHECK(in->IsRoleSet()); - CHECK(in->IsArg()); - auto in_arg_name = in->AsArg().name; - std::string tmp; - CHECK(inst.op_info()->GetInputArgname(in_arg_name, &tmp)); - auto decl_arg_type = inst.picked_kernel().GetInputDeclType(tmp); - CHECK(in->AsArg().type); - VLOG(4) << inst.picked_kernel().name(); - // if (!in->AsArg().is_weight && !PrecisionCompatibleTo(*in->AsArg().type, - // *decl_arg_type)) { - if (!PrecisionCompatibleTo(*in->AsArg().type, *decl_arg_type)) { - VLOG(4) << "found Target unmatched tensor: " << in->AsArg().name - << " for kernel " << inst.op()->DebugString() << " " - << *in->AsArg().type << " -> " << *decl_arg_type; - // Add an Cast instruction to make the input compatible with other dist. 
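-    // i.e. rewrite the edge (var -> inst) into (var -> calib op -> var/trans/<n> -> inst).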
- AddCastInst(*in->AsArg().type, - *decl_arg_type, - in, - graph, - inst_node, - graph->valid_places()); - } -} - -void PrecisionCastPass::AddCastInst(const Type& from, - const Type& to, - Node* in, - SSAGraph* graph, - Node* inst_node, - const std::vector& valid_places) { - CHECK(!valid_places.empty()) << "valid_place should be set"; - - // var -> new_transform_op -> new_var -> inst - // So there will be a new Argument node and a new Cast Statement Node. - CHECK(in->IsArg()); - auto node_id = [&] { return graph->nodes().size(); }; - auto cast_op_output_name = - in->AsArg().name + "/trans/" + std::to_string(node_id()); - auto* cast_op_output_arg = graph->NewArgumentNode(cast_op_output_name); - auto* cast_inst = graph->NewInstructNode(); - - // create Op and kernels. - bool in_persist = in->AsArg().is_weight || in->AsArg().is_persist; - std::string cast_type = in_persist ? "calib_once" : "calib"; - cast_op_output_arg->AsArg().is_persist = in_persist; - auto cast_op = LiteOpRegistry::Global().Create(cast_type); - CHECK(cast_op) << "create op [" << cast_op << "] failed"; - - // Create the new var manually. - inst_node->AsStmt().op()->scope()->Var(cast_op_output_name); - - // Create Calib Instruction. - cpp::OpDesc op_desc; - op_desc.SetType(cast_type); - op_desc.SetInput("Input", {in->AsArg().name}); - op_desc.SetOutput("Out", {cast_op_output_name}); - if (inst_node->AsStmt().op_info()->HasAttr("input_scale")) { - op_desc.SetAttr( - "scale", inst_node->AsStmt().op_info()->GetAttr("input_scale")); - } - cast_op->Attach(op_desc, inst_node->AsStmt().op()->scope()); - auto kernels = cast_op->CreateKernels(valid_places); - std::vector> selected_kernels; - bool is_found = false; - for (auto& kernel : kernels) { - const Type* in_arg_ty = kernel->GetInputDeclType("Input"); - const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); -// TODO(xg): to optimize this -#ifndef LITE_WITH_FPGA - if (in_arg_ty->precision() == from.precision() && - out_arg_ty->precision() == to.precision()) { -#else - if (TypeCompatible(*in_arg_ty, from)) { -#endif - is_found = true; - selected_kernels.emplace_back(std::move(kernel)); - // we pick the kernel - cast_inst->AsStmt(cast_type, std::move(selected_kernels), cast_op); - break; - } - } - - CHECK(is_found) << "Can't find a Cast kernel for Cast op: " << from << ":" - << in->AsArg().name << "->" << to << ":" - << inst_node->AsStmt().op_info()->Type(); - - // Remove the old link - RemoveDirectedLink(in, inst_node); - - // Update the original instruction OpDesc. 
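-  // (In this pass the new input is cast_op_output_name; the io_copy naming in the next comment is a leftover from the target-cast pass.)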
- // Update its input to the io_copy_output_name - - // Add new link, var -> new_inst, new_inst->newarg, newarg->inst - DirectedLink(in, cast_inst); - DirectedLink(cast_inst, cast_op_output_arg); - DirectedLink(cast_op_output_arg, inst_node); - - // reset opdesc and update kernel information - UpdateInputTo(inst_node->AsStmt().op()->mutable_op_info(), - in->AsArg().name, - cast_op_output_name); - - // recreate the op - auto original_selected_kernel = - std::move(inst_node->AsStmt().kernels().front()); - auto updated_op_info = *inst_node->AsStmt().mutable_op_info(); - - inst_node->AsStmt().ResetOp(updated_op_info, graph->valid_places()); - inst_node->AsStmt().kernels().clear(); - inst_node->AsStmt().kernels().emplace_back( - std::move(original_selected_kernel)); - for (auto& kernel : inst_node->AsStmt().kernels()) { - VLOG(4) << "kernel info: " << kernel->name(); - inst_node->AsStmt().op()->AttachKernel(kernel.get()); - } - graph->CheckValid(); -} - -void PrecisionCastPass::SetValidPlaces(const std::vector& valid_places) { - CHECK(!valid_places.empty()); - valid_places_ = valid_places; -} - -} // namespace mir -} // namespace lite -} // namespace paddle - -REGISTER_MIR_PASS(type_precision_cast_pass, - paddle::lite::mir::PrecisionCastPass) - .SetTargets({TARGET(kAny)}); diff --git a/lite/core/mir/type_precision_cast_pass.h b/lite/core/mir/type_precision_cast_pass.h deleted file mode 100644 index 3f55e52ef9..0000000000 --- a/lite/core/mir/type_precision_cast_pass.h +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include "lite/core/mir/pass.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace mir { - -static void UpdateInputTo(cpp::OpDesc* desc, - const std::string& from, - const std::string& to) { - for (auto& item : *desc->mutable_inputs()) { - for (auto& input : item.second) { - if (input == from) { - input = to; - } - } - } -} - -/* - * The pass complement the necessary instruction to make data - * transferring or transformation between different places. - */ -class PrecisionCastPass : public ProgramPass { - public: - void Apply(const std::unique_ptr& graph) override; - - void ComplementInputs(SSAGraph* graph, Node* inst_node, Node* in); - - void AddCastInst(const Type& from, - const Type& to, - Node* in, - SSAGraph* graph, - Node* inst_node, - const std::vector& valid_places); - - void SetValidPlaces(const std::vector& valid_places); - - const std::vector& valid_places() const { return valid_places_; } - - private: - std::vector valid_places_; -}; - -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/type_target_cast_pass.cc b/lite/core/mir/type_target_cast_pass.cc deleted file mode 100644 index 5a07fdd9d9..0000000000 --- a/lite/core/mir/type_target_cast_pass.cc +++ /dev/null @@ -1,183 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/mir/type_target_cast_pass.h" -#include -#include -#include -#include -#include -#include "lite/core/mir/graph_visualize_pass.h" -#include "lite/core/mir/pass_registry.h" -#include "lite/utils/string.h" - -namespace paddle { -namespace lite { -namespace mir { - -void TypeTargetTransformPass::Apply(const std::unique_ptr& graph) { - // Start from inputs of the graph, those should have place set. - std::list nodes; - for (auto& node : graph->mutable_nodes()) { - nodes.push_back(&node); - } - - CHECK(!valid_places_.empty()); - - for (auto& node : nodes) { - if (!node->IsStmt()) continue; - auto inlinks = node->inlinks; - for (auto* in : inlinks) { - ComplementInputs(graph.get(), node, in); - } - } -} - -void TypeTargetTransformPass::ComplementInputs(SSAGraph* graph, - Node* inst_node, - Node* in) { - // If this input is out of date. - if (inst_node->inlinks.end() == - std::find(inst_node->inlinks.begin(), inst_node->inlinks.end(), in)) - return; - - CHECK(inst_node->IsStmt()); - auto& inst = inst_node->AsStmt(); - LOG(INFO) << "found Target tensor: " << in->AsArg().name; - CHECK(in->IsRoleSet()); - CHECK(in->IsArg()); - auto in_arg_name = in->AsArg().name; - std::string tmp; - CHECK(inst.op_info()->GetInputArgname(in_arg_name, &tmp)); - auto decl_arg_type = inst.picked_kernel().GetInputDeclType(tmp); - CHECK(in->AsArg().type); - if (!TargetCompatibleTo(*in->AsArg().type, *decl_arg_type)) { - LOG(INFO) << "found Target unmatched tensor: " << in->AsArg().name - << " for kernel " << inst.op()->DebugString() << " " - << *in->AsArg().type << " -> " << *decl_arg_type; - // Add an IoCopy instruction to make the input compatible with other dist. - AddIoCopyInst( - *in->AsArg().type, *decl_arg_type, in, graph, inst_node, valid_places_); - } -} - -void TypeTargetTransformPass::AddIoCopyInst( - const Type& from, - const Type& to, - Node* in, - SSAGraph* graph, - Node* inst_node, - const std::vector& valid_places) { - CHECK(!valid_places.empty()) << "valid_place should be set"; - // var -> new_transform_op -> new_var -> inst - // So there will be a new Argument node and a new IoCopy Statement Node. - - CHECK(in->IsArg()); - auto node_id = [&] { return graph->nodes().size(); }; - auto io_copy_output_name = - string_format("%s/trans/%d", in->AsArg().name.c_str(), node_id()); - // TODO(MyPandaShaoxiang) should set same place with input? - auto* io_copy_output_arg = graph->NewArgumentNode(io_copy_output_name); - auto* io_copy_inst = graph->NewInstructNode(); - - bool in_persist = in->AsArg().is_weight || in->AsArg().is_persist; - std::string io_copy_type = in_persist ? "io_copy_once" : "io_copy"; - io_copy_output_arg->AsArg().is_persist = in_persist; - // create Op and kernels. - auto io_copy_op = LiteOpRegistry::Global().Create(io_copy_type); - CHECK(io_copy_op) << "create op [" << io_copy_op << "] failed"; - // CHECK(io_copy_op); - // Create the new var manually. 
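-  // The io_copy op's output variable must exist in the scope before Attach() binds the op to it.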
- inst_node->AsStmt().op()->scope()->Var(io_copy_output_name); - - // Create IoCopy Instruction. - cpp::OpDesc op_desc; - op_desc.SetType(io_copy_type); - op_desc.SetInput("Input", {in->AsArg().name}); - op_desc.SetOutput("Out", {io_copy_output_name}); - - io_copy_op->Attach(op_desc, inst_node->AsStmt().op()->scope()); - auto kernels = io_copy_op->CreateKernels(valid_places); - // fix(MyPandaShaoxiang): select kernel that input_dcl_type same as in.type - bool is_found = false; - std::vector> selected_kernels; - for (auto& kernel : kernels) { - const Type* in_arg_ty = kernel->GetInputDeclType("Input"); - const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); - if (TypeCompatible(*in_arg_ty, from)) { - is_found = true; - selected_kernels.emplace_back(std::move(kernel)); - // we pick the kernel - io_copy_inst->AsStmt( - io_copy_type, std::move(selected_kernels), io_copy_op); - break; - } - } - CHECK(is_found) << "Can't find a io_copy kernel for io_copy op: " << from - << ":" << in->AsArg().name << "->" << to << ":" - << inst_node->AsStmt().op_info()->Type(); - - // Remove the old link - RemoveDirectedLink(in, inst_node); - - // Update the original instruction OpDesc. - // Update its input to the io_copy_output_name - // Add new link, var -> new_inst, new_inst->newarg, newarg->inst - DirectedLink(in, io_copy_inst); - DirectedLink(io_copy_inst, io_copy_output_arg); - DirectedLink(io_copy_output_arg, inst_node); - - // reset opdesc and update kernel information - UpdateInputTo(inst_node->AsStmt().op()->mutable_op_info(), - in->AsArg().name, - io_copy_output_name); - auto original_selected_kernel = - std::move(inst_node->AsStmt().kernels().front()); - auto update_op_info = *inst_node->AsStmt().op_info(); - // ResetOp() will change the Stmt op_info_ value, - // after that the old op_info_ value will be nullified. - // So, we can't pass `*inst_node->AsStmt().op_info()` into ResetOp. - // `update_op_info` is the copy of `*inst_node->AsStmt().op_info(). - // Whenever update the op_info of a stmt, we should call its ResetOp(). - inst_node->AsStmt().ResetOp(update_op_info, graph->valid_places()); - inst_node->AsStmt().kernels().clear(); - inst_node->AsStmt().kernels().emplace_back( - std::move(original_selected_kernel)); - - std::string tmp; - if (inst_node->AsStmt().op_info()->GetInputArgname("a", &tmp)) { - CHECK(false) << "get old a " << tmp; - } - - for (auto& kernel : inst_node->AsStmt().kernels()) { - VLOG(4) << "kernel info: " << kernel->name(); - inst_node->AsStmt().op()->AttachKernel(kernel.get()); - } - - graph->CheckValid(); -} - -void TypeTargetTransformPass::SetValidPlaces( - const std::vector& valid_places) { - CHECK(!valid_places.empty()); - valid_places_ = valid_places; -} - -} // namespace mir -} // namespace lite -} // namespace paddle - -REGISTER_MIR_PASS(type_target_cast_pass, - paddle::lite::mir::TypeTargetTransformPass) - .SetTargets({TARGET(kAny)}); diff --git a/lite/core/mir/type_target_cast_pass.h b/lite/core/mir/type_target_cast_pass.h deleted file mode 100644 index 8a8cfaf9f9..0000000000 --- a/lite/core/mir/type_target_cast_pass.h +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include "lite/core/mir/pass.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace mir { - -static void UpdateInputTo(cpp::OpDesc* desc, - const std::string& from, - const std::string& to) { - for (auto& item : *desc->mutable_inputs()) { - for (auto& input : item.second) { - if (input == from) { - input = to; - } - } - } -} - -/* - * IoComplementPass complement the necessary instruction to make data - * transferring or transformation between different places. - */ -class TypeTargetTransformPass : public ProgramPass { - public: - void Apply(const std::unique_ptr& graph) override; - - void ComplementInputs(SSAGraph* graph, Node* inst_node, Node* in); - - void AddIoCopyInst(const Type& from, - const Type& to, - Node* in, - SSAGraph* graph, - Node* inst_node, - const std::vector& valid_places); - - void SetValidPlaces(const std::vector& valid_places); - - const std::vector& valid_places() const { return valid_places_; } - - private: - std::vector valid_places_; -}; - -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/variable_place_inference_pass.cc b/lite/core/mir/variable_place_inference_pass.cc deleted file mode 100644 index 1f8aea8172..0000000000 --- a/lite/core/mir/variable_place_inference_pass.cc +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/mir/variable_place_inference_pass.h" -#include -#include "lite/core/mir/pass_registry.h" - -namespace paddle { -namespace lite { -namespace mir { - -void VariablePlaceInferencePass::Apply(const std::unique_ptr &graph) { - MarkInputPlace(graph.get()); - InferenceArgumentPlace(graph.get()); - CheckAllArgumentTypeDetermined(graph.get()); -} - -} // namespace mir -} // namespace lite -} // namespace paddle - -REGISTER_MIR_PASS(variable_place_inference_pass, - paddle::lite::mir::VariablePlaceInferencePass) - .SetTargets({TARGET(kAny)}); diff --git a/lite/core/mir/variable_place_inference_pass.h b/lite/core/mir/variable_place_inference_pass.h deleted file mode 100644 index d5b0bb8aa6..0000000000 --- a/lite/core/mir/variable_place_inference_pass.h +++ /dev/null @@ -1,157 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <map>
-#include <memory>
-#include <string>
-#include <vector>
-#include "lite/core/mir/pass.h"
-#include "lite/core/target_wrapper.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {
-
-/*
- * Mark the places of the variables in the SSAGraph; a variable's place is
- * inferred from the kernels that output it.
- */
-class VariablePlaceInferencePass : public DebugPass {
- public:
-  void Apply(const std::unique_ptr<SSAGraph>& graph) override;
-
- private:
-  // Mark the place of input arguments.
-  void MarkInputPlace(SSAGraph* graph) {
-    CHECK(!graph->inputs().empty()) << "graph's inputs should be set";
-    for (const auto& v : graph->inputs()) {
-      // the feed op might be among the inputs
-      if (v->IsStmt()) {
-        VLOG(4) << "found kernel in inputs " << v->AsStmt().op_type();
-        continue;
-      }
-    }
-  }
-
-  void CheckAllArgumentTypeDetermined(SSAGraph* graph) {
-    for (auto& node : graph->mutable_nodes()) {
-      if (node.IsArg()) {
-        CHECK(node.AsArg().type) << "node " << node.AsArg().name
-                                 << " type not determined, " << &node;
-      }
-    }
-  }
-
-  // Set the type of the weight
-  void SetWeightType(Node* w, const LiteType& type) {
-// TODO(xg): optimize this
-#ifndef LITE_WITH_FPGA
-    w->AsArg().type =
-        LiteType::GetTensorTy(TARGET(kHost), type.precision(), type.layout());
-#else
-    w->AsArg().type = LiteType::GetTensorTy(
-        TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
-#endif
-  }
-
-  void InferenceArgumentPlace(SSAGraph* graph) {
-    VLOG(3) << "param-type-registry:\n" << ParamTypeRegistry::Global();
-    for (auto& x : graph->StmtTopologicalOrder()) {
-      auto& inst = x->AsStmt();
-// The IoCopyOp is a tool operator and normally takes no part in type
-// inference; on FPGA, however, io_copy+calib+layout tool ops are inserted,
-// so type inference is needed for tool operators there.
-#ifndef LITE_WITH_FPGA
-      if (inst.op_type() == "io_copy") continue;
-#endif
-      // deal with inputs
-      VLOG(4) << "Inferring op " << inst.op_info()->Repr();
-      // TODO(zhaolong): Add a check that the node's name is among the op's
-      // arguments.
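-      // get_argname: map a variable node's name back to the op argument slot (e.g. "X" or "W") it is bound to.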
- - auto get_argname = [&]( - const std::string& node_name, - const std::map>& argname_map) - -> std::string { - for (auto& ele : argname_map) { - auto it = - std::find(ele.second.begin(), ele.second.end(), node_name); - if (it != ele.second.end()) return ele.first; - } - return ""; - }; - - for (auto* x_in : x->inlinks) { - std::string node_name = x_in->AsArg().name; - std::string arg_name = get_argname(node_name, inst.op_info()->inputs()); - CHECK(arg_name.size() > 0) << "can not found op arguments for node " - << node_name; - VLOG(4) << "-- input arg_name " << arg_name - << "-- node name :" << node_name; - auto type = inst.picked_kernel().GetInputDeclType(arg_name); - if (!x_in->AsArg().type) { - VLOG(4) << "set type " << *type << " " << x_in->AsArg().name; - if (x_in->AsArg().is_weight) { - SetWeightType(x_in, *type); - } else { - x_in->AsArg().type = type; - } - } - } - - VLOG(4) << "inst " << inst.op_info()->Repr(); - for (auto* x_out : x->outlinks) { - std::string node_name = x_out->AsArg().name; - std::string arg_name = - get_argname(node_name, inst.op_info()->outputs()); - CHECK(arg_name.size() > 0) << "can not found op arguments for node " - << node_name << " in Inst " - << inst.op_type(); - VLOG(4) << "-- output arg_name " << arg_name; - auto type = inst.picked_kernel().GetOutputDeclType(arg_name); - if (!x_out->AsArg().type) { - VLOG(4) << "set type " << *type << " " << x_out->AsArg().name; - if (x_out->AsArg().is_weight) { - SetWeightType(x_out, *type); - } else { - x_out->AsArg().type = type; - } - } - } - } - } - - // Update me's kUnk fields by other's fields. - void UpdatePlace(Place* me, const Place& other) { - CHECK(other.is_valid()); - if (me->target == TARGET(kUnk)) { - me->target = other.target; - } - if (me->precision == PRECISION(kUnk)) { - me->precision = other.precision; - } - if (me->layout == DATALAYOUT(kUnk)) { - me->layout = other.layout; - } - } - - private: - // The default target for arguments, e.g. load weights to CPU memory for CUDA - // computation by default. - TargetType argument_default_target_{TARGET(kHost)}; -}; - -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/variable_place_inference_pass_test.cc b/lite/core/mir/variable_place_inference_pass_test.cc deleted file mode 100644 index cf86afd590..0000000000 --- a/lite/core/mir/variable_place_inference_pass_test.cc +++ /dev/null @@ -1,101 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include "lite/api/paddle_use_passes.h" -#include "lite/core/optimizer.h" -#include "lite/core/program_fake_utils.h" -#include "lite/kernels/cuda/use_kernels.h" -#include "lite/kernels/host/use_kernels.h" - -namespace paddle { -namespace lite { -namespace mir { - -TEST(variable_place_inference_pass, test) { - std::shared_ptr scope(new lite::Scope); - ProgramFaker program_faker; - program_faker.AddFeed("a", 0); - program_faker.AddMul("a", "W", "a1"); - program_faker.AddMul("a1", "W1", "a2"); - program_faker.AddFetch("a2", 0); - program_faker.CreateVars(scope.get()); - - auto* desc = program_faker.program(); - - Optimizer optimizer; - std::vector places({ - Place{ - TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW), - }, - Place{ - TARGET(kCUDA), PRECISION(kFloat), DATALAYOUT(kNCHW), - }, - Place{ - TARGET(kX86), PRECISION(kFloat), DATALAYOUT(kNCHW), - }, - Place{ - TARGET(kX86), PRECISION(kAny), DATALAYOUT(kAny), - }, - }); - - Program program(*desc->Proto(), scope, places); - - core::KernelPickFactor factor; - factor.ConsiderTarget(); - - std::vector passes({ - "static_kernel_pick_pass", // - "argument_type_display_pass", // - "variable_place_inference_pass", // - "argument_type_display_pass", // - "type_target_cast_pass", // - }); - - Place prefered_place{ -#ifdef PADDLE_WITH_CUDA - TARGET(kCUDA), PRECISION(kFloat), DATALAYOUT(kNCHW), -#else -#ifdef PADDLE_WITH_ARM - TARGET(kARM), PRECISION(kFloat), DATALAYOUT(kNCHW), -#else // X86 - TARGET(kX86), PRECISION(kFloat), DATALAYOUT(kNCHW), -#endif // ARM -#endif - }; - optimizer.KernelPickPreferPlace(prefered_place); - optimizer.Run(std::move(program), places, factor, passes); -} - -} // namespace mir -} // namespace lite -} // namespace paddle - -USE_LITE_OP(mul); -USE_LITE_OP(feed); -USE_LITE_OP(fetch); -USE_LITE_OP(io_copy); - -#ifdef LITE_WITH_X86 -USE_LITE_KERNEL(mul, kX86, kFloat, kNCHW, def); -#endif - -#ifdef LITE_WITH_ARM -USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def); -#endif - -#ifdef LITE_WITH_CUDA -USE_LITE_KERNEL(io_copy, kCUDA, kAny, kAny, host_to_device); -USE_LITE_KERNEL(io_copy, kCUDA, kAny, kAny, device_to_host); -#endif diff --git a/lite/core/naive_test_model.py b/lite/core/naive_test_model.py deleted file mode 100644 index f89a5e115f..0000000000 --- a/lite/core/naive_test_model.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
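A note on the pass order in the test above: `variable_place_inference_pass` has to run before `type_target_cast_pass`, because the cast pass compares the inferred producer and consumer places on every edge to decide where an `io_copy` is needed. Below is a toy sketch of that comparison; the `Edge` struct and the target names are made up for illustration and do not correspond to lite's real graph nodes.

#include <iostream>
#include <string>
#include <vector>

// Toy edge of the graph: a variable produced on one target and consumed on
// another. Illustrative only.
struct Edge {
  std::string var;
  std::string producer_target;
  std::string consumer_target;
};

int main() {
  // Once variable_place_inference_pass has filled the places in, the cast
  // pass only has to compare them edge by edge.
  std::vector<Edge> edges = {{"a1", "kHost", "kCUDA"},
                             {"a2", "kCUDA", "kCUDA"}};
  for (const auto& e : edges) {
    if (e.producer_target != e.consumer_target) {
      std::cout << "insert io_copy for " << e.var << ": "
                << e.producer_target << " -> " << e.consumer_target << "\n";
    }
  }
  return 0;
}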
- -import numpy -import sys, os -import numpy as np -import paddle.fluid as fluid -from paddle.fluid.backward import append_backward - -a = fluid.layers.data(name="a", shape=[2], dtype='float32') -label = fluid.layers.data(name="label", shape=[10], dtype='float32') - -a1 = fluid.layers.fc(input=a, size=3, act=None, bias_attr=False) - -cost = fluid.layers.square_error_cost(a1, label) -avg_cost = fluid.layers.mean(cost) - -optimizer = fluid.optimizer.SGD(learning_rate=0.001) -optimizer.minimize(cost) - -cpu = fluid.core.CPUPlace() -loss = exe = fluid.Executor(cpu) - -exe.run(fluid.default_startup_program()) -with open('startup_program.pb', 'wb') as f: - f.write(fluid.default_startup_program().desc.serialize_to_string()) - -#data_1 = np.array(numpy.random.random([100, 100]), dtype='float32') - -#fluid.default_main_program().desc. - -#prog = fluid.compiler.CompiledProgram(fluid.default_main_program()) -prog = fluid.default_main_program() - -#append_backward(loss) - -with open('main_program.pb', 'wb') as f: - f.write(prog.desc.serialize_to_string()) - -#outs = exe.run(program=prog, feed={'a':data_1, }, fetch_list=[cost]) - -#sys.exit(0) -fluid.io.save_inference_model("./model2", [a.name], [a1], exe) - -#print(numpy.array(outs)) diff --git a/lite/core/op_lite.cc b/lite/core/op_lite.cc deleted file mode 100644 index 412b299339..0000000000 --- a/lite/core/op_lite.cc +++ /dev/null @@ -1,105 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-
-#include "lite/core/op_lite.h"
-#include <memory>
-#include <set>
-#include <string>
-#include <vector>
-#include "lite/core/op_registry.h"
-
-namespace paddle {
-namespace lite {
-
-std::vector<std::unique_ptr<KernelBase>> OpLite::CreateKernels(
-    const std::vector<Place> &places, const std::string &kernel_type) {
-  std::vector<std::unique_ptr<KernelBase>> kernels;
-  CHECK(!op_type_.empty()) << "op_type_ should be set first";
-
-  auto pick_kernel = [&](const Place &place) {
-    auto ks = KernelRegistry::Global().Create(
-        op_type_, place.target, place.precision, place.layout);
-    VLOG(5) << "pick kernel for " << op_info()->Type() << " "
-            << place.DebugString() << " get " << ks.size() << " kernels";
-    for (auto &&it : ks) {
-      AttachKernel(it.get());
-      kernels.emplace_back(std::move(it));
-    }
-  };
-
-  if (!kernel_type.empty()) {
-    Place place;
-    std::string op_type, alias;
-    KernelBase::ParseKernelType(kernel_type, &op_type, &alias, &place);
-    pick_kernel(place);
-    CHECK(!kernels.empty()) << "no kernel for kernel type " << kernel_type;
-    return kernels;
-  }
-
-  std::set<Place> place_set;
-  for (auto place : places) {
-    place_set.insert(place);
-    // Also pick kernels that support any Precision and any DataLayout.
-    place.precision = PRECISION(kAny);
-    place_set.insert(place);
-    place.layout = DATALAYOUT(kAny);
-    place_set.insert(place);
-  }
-
-  std::set<TargetType> targets;
-  for (auto place : place_set) {
-    pick_kernel(place);
-    targets.insert(place.target);
-  }
-
-  VLOG(4) << "op " << op_type_ << " get " << kernels.size() << " kernels";
-  return kernels;
-}
-
-bool OpLite::Run() {
-  CHECK(kernel_);
-  SyncInputEvents();
-
-  kernel_->Launch();
-
-  RecordOutputEvents();
-  return true;
-}
-
-bool OpLite::Attach(const cpp::OpDesc &opdesc, lite::Scope *scope) {
-  // valid_places_.clear();
-  CHECK(scope != nullptr);
-  // CHECK(!op_info_.get());
-  scope_ = scope;
-  op_info_.reset(
-      new OpInfo(opdesc));  // Force clean the out-of-date information.
-  return AttachImpl(*op_info(), scope);
-}
-
-const Tensor *OpLite::GetTensor(lite::Scope *scope,
-                                const std::string &name) const {
-  auto *var = scope->FindVar(name);
-  CHECK(var) << "no variable called " << name << " found";
-  return &var->Get<Tensor>();
-}
-
-Tensor *OpLite::GetMutableTensor(lite::Scope *scope,
-                                 const std::string &name) const {
-  auto *var = scope->FindVar(name);
-  CHECK(var) << "no variable called " << name << " found";
-  return var->GetMutable<Tensor>();
-}
-
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/op_lite.h b/lite/core/op_lite.h
deleted file mode 100644
index f843ef6f2b..0000000000
--- a/lite/core/op_lite.h
+++ /dev/null
@@ -1,231 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <algorithm>
-#include <map>
-#include <memory>
-#include <string>
-#include <vector>
-#include "lite/core/context.h"
-#include "lite/core/kernel.h"
-#include "lite/core/scope.h"
-#include "lite/model_parser/cpp/op_desc.h"
-
-namespace paddle {
-namespace lite {
-
-// For registry factory.
-struct Registry {
-  void Touch() {}
-};
-
-namespace mir {
-class Node;
-class SSAGraph;
-}
-
-class OpInfo;
-
-/*
- * The base class of light-weight operators. It is currently only used for
- * inference, to eliminate the overhead of some operations in the full
- * framework.
- *
- * The Operator is designed as follows:
- * - it can have some members to hold the arguments and some other computation
- *   resources,
- * - it should act like a function call, with no more logic included.
- */
-class OpLite : public Registry {
- public:
-  OpLite() = default;
-  explicit OpLite(const std::string &type) : op_type_(type) {}
-  explicit OpLite(const std::vector<Place> &valid_places)
-      : valid_places_(valid_places) {}
-
-  void SetValidPlaces(const std::vector<Place> &places) {
-    VLOG(3) << "valid places " << valid_places_.size();
-    valid_places_ = places;
-  }
-  const std::vector<Place> &valid_places() const { return valid_places_; }
-  // Check the shape.
-  virtual bool CheckShape() const { return true; }
-  // Infer the outputs' shape.
-  virtual bool InferShape() const { return true; }
-  // Run this operator.
-  virtual bool Run();
-  // Indicate whether the Op runs only once or not
-  virtual bool run_once() const { return false; }
-  std::string Type() { return op_type_; }
-
-  // Link the external execution environment to the internal context.
-  bool Attach(const cpp::OpDesc &opdesc, lite::Scope *scope);
-
-  const OpInfo *op_info() const { return op_info_.get(); }
-  OpInfo *mutable_op_info() { return op_info_.get(); }
-
-  // Human-readable information.
-  virtual std::string DebugString() const = 0;
-
-  const Place &kernel_place() const { return kernel_place_; }
-
-  // Create all the kernels for the valid targets.
-  std::vector<std::unique_ptr<KernelBase>> CreateKernels(
-      const std::vector<Place> &places, const std::string &kernel_type = "");
-
-  lite::Scope *scope() { return scope_; }
-
-  // Assign op param to kernel.
-  virtual void AttachKernel(KernelBase *kernel) = 0;
-  void SetKernel(std::vector<std::unique_ptr<KernelBase>> &kernels) {  // NOLINT
-    kernel_ = std::move(kernels.front());
-    kernel_->SetContext(
-        ContextScheduler::Global().NewContext(kernel_->target()));
-  }
-
-  KernelBase *GetKernel() {  // NOLINT
-    return kernel_.get();
-  }
-
-  virtual ~OpLite() = default;
-
- protected:
-  // Attach it with the runtime environment.
-  virtual bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) = 0;
-
-  // Specify the kernel to run by default. This will specify the value of
-  // `kernel_place_`.
-  virtual void StaticPickKernel(const std::vector<Place> &valid_targets) {
-    auto kernels = CreateKernels(valid_targets);
-    kernel_ = std::move(kernels.front());
-  }
-
-  // Wait until all the inputs' events are ready.
-  void SyncInputEvents() {}
-
-  // Record the output events, which will tell all the dependent operators
-  // that some inputs are ready.
-  void RecordOutputEvents() {}
-
-  const Tensor *GetTensor(lite::Scope *scope, const std::string &name) const;
-  Tensor *GetMutableTensor(lite::Scope *scope, const std::string &name) const;
-
-  friend class mir::Node;
-  friend class mir::SSAGraph;
-
- protected:
-  // some helper functions.
-  template <typename T>
-  const T *GetVar(Scope *scope, const std::string &name) {
-    auto *var = scope->FindVar(name);
-    CHECK(var) << "No var found for " << name;
-    return &var->Get<T>();
-  }
-  template <typename T>
-  T *GetMutableVar(Scope *scope, const std::string &name) {
-    auto *var = scope->FindVar(name);
-    CHECK(var) << "No var found for " << name;
-    return var->GetMutable<T>();
-  }
-
- protected:
-  lite::Scope *scope_{nullptr};
-  std::unique_ptr<KernelBase> kernel_;
-  std::string op_type_;
-  std::vector<Place> valid_places_;
-  Place kernel_place_{TARGET(kHost), PRECISION(kFloat)};
-  std::unique_ptr<OpInfo> op_info_;
-};
-
-/*
- * Operator information, such as some description. It will be shared by all
- * the kernels of the same operator.
- */
-class OpInfo : public cpp::OpDesc {
- public:
-  OpInfo(const OpInfo &) = default;
-  explicit OpInfo(const cpp::OpDesc &other) : cpp::OpDesc(other) {}
-
-  // Collect all the input variables' names.
-  std::vector<std::string> input_names() const {
-    std::vector<std::string> res;
-    for (auto &param : InputArgumentNames()) {
-      for (auto &x : Input(param)) {
-        res.push_back(x);
-      }
-    }
-    return res;
-  }
-
-  // Collect all the output variables' names.
-  std::vector<std::string> output_names() const {
-    std::vector<std::string> res;
-    for (auto &param : OutputArgumentNames()) {
-      for (auto &x : Output(param)) {
-        res.push_back(x);
-      }
-    }
-    return res;
-  }
-
-  std::vector<std::string> input_argnames() const {
-    return InputArgumentNames();
-  }
-
-  std::vector<std::string> output_argnames() const {
-    return OutputArgumentNames();
-  }
-
-  bool GetInputArgname(const std::string &value_name, std::string *out) const {
-    for (auto &item : inputs_) {
-      auto it = std::find(item.second.begin(), item.second.end(), value_name);
-      if (it != item.second.end()) {
-        *out = item.first;
-        return true;
-      }
-    }
-    return false;
-  }
-  bool GetOutputArgname(const std::string &value_name, std::string *out) const {
-    for (auto &item : outputs_) {
-      auto it = std::find(item.second.begin(), item.second.end(), value_name);
-      if (it != item.second.end()) {
-        *out = item.first;
-        return true;
-      }
-    }
-    return false;
-  }
-
-  void UpdateAllInputs(const std::string &from, const std::string &to) {
-    for (auto &item : inputs_) {
-      for (auto &var : item.second) {
-        if (var == from) var = to;
-      }
-    }
-  }
-
-  void UpdateAllOutputs(const std::string &from, const std::string &to) {
-    for (auto &item : outputs_) {
-      for (auto &var : item.second) {
-        if (var == from) var = to;
-      }
-    }
-  }
-};
-
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/op_lite_test.cc b/lite/core/op_lite_test.cc
deleted file mode 100644
index a18607834a..0000000000
--- a/lite/core/op_lite_test.cc
+++ /dev/null
@@ -1,24 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
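An aside on `GetInputArgname`/`GetOutputArgname` above: they do a reverse lookup from a variable's name to the argument slot ("X", "W", ...) it is bound to. The following standalone sketch reproduces that walk over a plain `std::map`; the slot and variable names are illustrative only.

#include <algorithm>
#include <iostream>
#include <map>
#include <string>
#include <vector>

// Standalone rendition of the reverse lookup: map an operand's variable name
// back to the argument slot that holds it. The map layout matches what an
// op desc stores: slot name -> list of variable names.
bool GetArgname(const std::map<std::string, std::vector<std::string>>& args,
                const std::string& value_name, std::string* out) {
  for (const auto& item : args) {
    auto it = std::find(item.second.begin(), item.second.end(), value_name);
    if (it != item.second.end()) {
      *out = item.first;
      return true;
    }
  }
  return false;
}

int main() {
  std::map<std::string, std::vector<std::string>> inputs = {
      {"Input", {"x"}}, {"W", {"fc_w"}}, {"Bias", {"fc_b"}}};
  std::string slot;
  if (GetArgname(inputs, "fc_w", &slot)) std::cout << slot << "\n";  // prints W
  return 0;
}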
-
-#include "lite/core/op_lite.h"
-#include <gtest/gtest.h>
-
-namespace paddle {
-namespace lite {
-
-TEST(OpLite, test) {}
-
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/op_registry.cc b/lite/core/op_registry.cc
deleted file mode 100644
index 53d4afa9ff..0000000000
--- a/lite/core/op_registry.cc
+++ /dev/null
@@ -1,154 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/core/op_registry.h"
-#include <list>
-#include <memory>
-
-namespace paddle {
-namespace lite {
-
-std::list<std::unique_ptr<KernelBase>> KernelRegistry::Create(
-    const std::string &op_type,
-    TargetType target,
-    PrecisionType precision,
-    DataLayoutType layout) {
-  Place place{target, precision, layout};
-  VLOG(5) << "creating " << op_type << " kernel for " << place.DebugString();
-#define CREATE_KERNEL1(target__, precision__)                                \
-  switch (layout) {                                                          \
-    case DATALAYOUT(kNCHW):                                                  \
-      return Create<TARGET(target__),                                        \
-                    PRECISION(precision__),                                  \
-                    DATALAYOUT(kNCHW)>(op_type);                             \
-    case DATALAYOUT(kAny):                                                   \
-      return Create<TARGET(target__),                                        \
-                    PRECISION(precision__),                                  \
-                    DATALAYOUT(kAny)>(op_type);                              \
-    case DATALAYOUT(kNHWC):                                                  \
-      return Create<TARGET(target__),                                        \
-                    PRECISION(precision__),                                  \
-                    DATALAYOUT(kNHWC)>(op_type);                             \
-    default:                                                                 \
-      LOG(FATAL) << "unsupported kernel layout " << DataLayoutToStr(layout); \
-  }
-
-#define CREATE_KERNEL(target__)                         \
-  switch (precision) {                                  \
-    case PRECISION(kFloat):                             \
-      CREATE_KERNEL1(target__, kFloat);                 \
-    case PRECISION(kInt8):                              \
-      CREATE_KERNEL1(target__, kInt8);                  \
-    case PRECISION(kFP16):                              \
-      CREATE_KERNEL1(target__, kFP16);                  \
-    case PRECISION(kAny):                               \
-      CREATE_KERNEL1(target__, kAny);                   \
-    default:                                            \
-      CHECK(false) << "not supported kernel precision " \
-                   << PrecisionToStr(precision);        \
-  }
-
-  switch (target) {
-    case TARGET(kHost): {
-      CREATE_KERNEL(kHost);
-    } break;
-    case TARGET(kX86): {
-      CREATE_KERNEL(kX86);
-    } break;
-    case TARGET(kCUDA): {
-      CREATE_KERNEL(kCUDA);
-    } break;
-    case TARGET(kARM): {
-      CREATE_KERNEL(kARM);
-    } break;
-    case TARGET(kOpenCL): {
-      CREATE_KERNEL(kOpenCL);
-    } break;
-    case TARGET(kNPU): {
-      CREATE_KERNEL(kNPU);
-    } break;
-    case TARGET(kFPGA): {
-      CREATE_KERNEL(kFPGA);
-    } break;
-    default:
-      CHECK(false) << "not supported kernel target " << TargetToStr(target);
-  }
-
-#undef CREATE_KERNEL
-  return std::list<std::unique_ptr<KernelBase>>();
-}
-
-KernelRegistry::KernelRegistry()
-    : registries_(static_cast<int>(TARGET(NUM)) *
-                  static_cast<int>(PRECISION(NUM)) *
-                  static_cast<int>(DATALAYOUT(NUM))) {
-#define INIT_FOR(target__, precision__, layout__)                      \
-  registries_[KernelRegistry::GetKernelOffset<TARGET(target__),        \
-                                              PRECISION(precision__),  \
-                                              DATALAYOUT(layout__)>()] \
-      .set<KernelRegistryForTarget<TARGET(target__),                   \
-                                   PRECISION(precision__),             \
-                                   DATALAYOUT(layout__)> *>(           \
-          &KernelRegistryForTarget<TARGET(target__),                   \
-                                   PRECISION(precision__),             \
-                                   DATALAYOUT(layout__)>::Global());
-  // Currently, just register 2 kernel targets.
- INIT_FOR(kCUDA, kFloat, kNCHW); - INIT_FOR(kCUDA, kInt8, kNCHW); - INIT_FOR(kCUDA, kAny, kNCHW); - INIT_FOR(kCUDA, kAny, kAny); - INIT_FOR(kCUDA, kInt8, kNHWC); - - INIT_FOR(kHost, kFloat, kNCHW); - INIT_FOR(kHost, kAny, kNCHW); - INIT_FOR(kHost, kFloat, kNHWC); - INIT_FOR(kHost, kFloat, kAny); - INIT_FOR(kHost, kAny, kNHWC); - INIT_FOR(kHost, kAny, kAny); - INIT_FOR(kHost, kAny, kNHWC); - INIT_FOR(kHost, kAny, kAny); - - INIT_FOR(kX86, kFloat, kNCHW); - INIT_FOR(kX86, kAny, kNCHW); - INIT_FOR(kX86, kAny, kAny); - - INIT_FOR(kARM, kFloat, kNCHW); - INIT_FOR(kARM, kInt8, kNCHW); - INIT_FOR(kARM, kAny, kNCHW); - INIT_FOR(kARM, kAny, kAny); - - INIT_FOR(kOpenCL, kFloat, kNCHW); - INIT_FOR(kOpenCL, kAny, kNCHW); - INIT_FOR(kOpenCL, kAny, kAny); - - INIT_FOR(kNPU, kFloat, kNCHW); - INIT_FOR(kNPU, kInt8, kNCHW); - INIT_FOR(kNPU, kAny, kNCHW); - INIT_FOR(kNPU, kAny, kAny); - - INIT_FOR(kFPGA, kFP16, kNHWC); - INIT_FOR(kFPGA, kFP16, kAny); - INIT_FOR(kFPGA, kFloat, kNHWC); - INIT_FOR(kFPGA, kAny, kNHWC); - INIT_FOR(kFPGA, kAny, kAny); -#undef INIT_FOR -} - -KernelRegistry &KernelRegistry::Global() { - static auto *x = new KernelRegistry; - return *x; -} - -} // namespace lite -} // namespace paddle diff --git a/lite/core/op_registry.h b/lite/core/op_registry.h deleted file mode 100644 index 5b48c251c8..0000000000 --- a/lite/core/op_registry.h +++ /dev/null @@ -1,306 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
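An aside on the storage scheme behind these registrations: `registries_` is a flat vector sized `TARGET(NUM) * PRECISION(NUM) * DATALAYOUT(NUM)`, and `GetKernelOffset` (defined further below in this header) linearizes a `(target, precision, layout)` triple into it row-major. The following is a small self-check of that indexing scheme; the `kNum*` constants are placeholders, not lite's real enum counts.

#include <cassert>

// Placeholder enum sizes standing in for TARGET(NUM) etc.
constexpr int kNumTargets = 7, kNumPrecisions = 4, kNumLayouts = 3;

// Row-major linearization, matching the GetKernelOffset formula:
// offset = t * P * L + p * L + l.
constexpr int KernelOffset(int target, int precision, int layout) {
  return target * kNumPrecisions * kNumLayouts + precision * kNumLayouts +
         layout;
}

int main() {
  // Every (target, precision, layout) triple maps to a distinct slot in the
  // flat registry vector, with no gaps and no collisions.
  bool seen[kNumTargets * kNumPrecisions * kNumLayouts] = {};
  for (int t = 0; t < kNumTargets; ++t)
    for (int p = 0; p < kNumPrecisions; ++p)
      for (int l = 0; l < kNumLayouts; ++l) {
        const int off = KernelOffset(t, p, l);
        assert(!seen[off]);
        seen[off] = true;
      }
  return 0;
}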
-
-#pragma once
-
-#include <functional>
-#include <list>
-#include <map>
-#include <memory>
-#include <string>
-#include <tuple>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-#include "lite/api/paddle_lite_factory_helper.h"
-#include "lite/core/kernel.h"
-#include "lite/core/op_lite.h"
-#include "lite/core/target_wrapper.h"
-#include "lite/utils/all.h"
-#include "lite/utils/macros.h"
-
-using LiteType = paddle::lite::Type;
-
-namespace paddle {
-namespace lite {
-
-using KernelFunc = std::function<void()>;
-using KernelFuncCreator = std::function<std::unique_ptr<KernelFunc>()>;
-class LiteOpRegistry final : public Factory<OpLite, std::shared_ptr<OpLite>> {
- public:
-  static LiteOpRegistry &Global() {
-    static auto *x = new LiteOpRegistry;
-    return *x;
-  }
-
- private:
-  LiteOpRegistry() = default;
-};
-
-template <typename OpClass>
-class OpLiteRegistor : public Registor<OpClass> {
- public:
-  explicit OpLiteRegistor(const std::string &op_type)
-      : Registor<OpClass>([&] {
-          LiteOpRegistry::Global().Register(
-              op_type, [op_type]() -> std::unique_ptr<OpLite> {
-                return std::unique_ptr<OpLite>(new OpClass(op_type));
-              });
-        }) {}
-};
-
-template <TargetType Target, PrecisionType Precision, DataLayoutType Layout>
-using KernelRegistryForTarget =
-    Factory<KernelBase, std::unique_ptr<KernelBase>>;
-
-class KernelRegistry final {
- public:
-  using any_kernel_registor_t =
-      variant<KernelRegistryForTarget<TARGET(kCUDA), PRECISION(kFloat), DATALAYOUT(kNCHW)> *,  //
-              KernelRegistryForTarget<TARGET(kCUDA), PRECISION(kInt8), DATALAYOUT(kNCHW)> *,   //
-              KernelRegistryForTarget<TARGET(kCUDA), PRECISION(kAny), DATALAYOUT(kNCHW)> *,    //
-              KernelRegistryForTarget<TARGET(kCUDA), PRECISION(kAny), DATALAYOUT(kAny)> *,     //
-              KernelRegistryForTarget<TARGET(kCUDA), PRECISION(kInt8), DATALAYOUT(kNHWC)> *,   //
-              KernelRegistryForTarget<TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)> *,  //
-              KernelRegistryForTarget<TARGET(kHost), PRECISION(kAny), DATALAYOUT(kNCHW)> *,    //
-              KernelRegistryForTarget<TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNHWC)> *,  //
-              KernelRegistryForTarget<TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kAny)> *,   //
-              KernelRegistryForTarget<TARGET(kHost), PRECISION(kAny), DATALAYOUT(kNHWC)> *,    //
-              KernelRegistryForTarget<TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny)> *,     //
-              KernelRegistryForTarget<TARGET(kX86), PRECISION(kFloat), DATALAYOUT(kNCHW)> *,   //
-              KernelRegistryForTarget<TARGET(kX86), PRECISION(kAny), DATALAYOUT(kNCHW)> *,     //
-              KernelRegistryForTarget<TARGET(kX86), PRECISION(kAny), DATALAYOUT(kAny)> *,      //
-              KernelRegistryForTarget<TARGET(kARM), PRECISION(kFloat), DATALAYOUT(kNCHW)> *,   //
-              KernelRegistryForTarget<TARGET(kARM), PRECISION(kInt8), DATALAYOUT(kNCHW)> *,    //
-              KernelRegistryForTarget<TARGET(kARM), PRECISION(kAny), DATALAYOUT(kNCHW)> *,     //
-              KernelRegistryForTarget<TARGET(kARM), PRECISION(kAny), DATALAYOUT(kAny)> *,      //
-              KernelRegistryForTarget<TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)> *,  //
-              KernelRegistryForTarget<TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)> *,    //
-              KernelRegistryForTarget<TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kAny)> *,     //
-              KernelRegistryForTarget<TARGET(kNPU), PRECISION(kFloat), DATALAYOUT(kNCHW)> *,   //
-              KernelRegistryForTarget<TARGET(kNPU), PRECISION(kInt8), DATALAYOUT(kNCHW)> *,    //
-              KernelRegistryForTarget<TARGET(kNPU), PRECISION(kAny), DATALAYOUT(kNCHW)> *,     //
-              KernelRegistryForTarget<TARGET(kNPU), PRECISION(kAny), DATALAYOUT(kAny)> *,      //
-              KernelRegistryForTarget<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> *,   //
-              KernelRegistryForTarget<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kAny)> *,    //
-              KernelRegistryForTarget<TARGET(kFPGA), PRECISION(kFloat), DATALAYOUT(kNHWC)> *,  //
-              KernelRegistryForTarget<TARGET(kFPGA), PRECISION(kAny), DATALAYOUT(kNHWC)> *,    //
-              KernelRegistryForTarget<TARGET(kFPGA), PRECISION(kAny), DATALAYOUT(kAny)> *      //
-              >;
-
-  KernelRegistry();
-
-  static KernelRegistry &Global();
-
-  template <TargetType Target, PrecisionType Precision, DataLayoutType Layout>
-  void Register(
-      const std::string &name,
-      typename KernelRegistryForTarget<Target, Precision, Layout>::creator_t
-          &&creator) {
-    using kernel_registor_t =
-        KernelRegistryForTarget<Target, Precision, Layout>;
-    auto &varient = registries_[GetKernelOffset<Target, Precision, Layout>()];
-    auto *reg = varient.template get<kernel_registor_t *>();
-    CHECK(reg) << "Can not be empty of " << name;
-    reg->Register(name, std::move(creator));
-#ifdef LITE_ON_MODEL_OPTIMIZE_TOOL
-    kernel_info_map_[name].push_back(
-        std::make_tuple(Target, Precision, Layout));
-#endif  // LITE_ON_MODEL_OPTIMIZE_TOOL
-  }
-
-  template <TargetType Target, PrecisionType Precision, DataLayoutType Layout>
-  std::list<std::unique_ptr<KernelBase>> Create(const std::string &op_type) {
-    using kernel_registor_t =
-        KernelRegistryForTarget<Target, Precision, Layout>;
-    return registries_[GetKernelOffset<Target, Precision, Layout>()]
-        .template get<kernel_registor_t *>()
-        ->Creates(op_type);
-  }
-
-  std::list<std::unique_ptr<KernelBase>> Create(const std::string &op_type,
-                                                TargetType target,
-                                                PrecisionType precision,
-                                                DataLayoutType layout);
-
-  // Get a kernel registry offset in all the registries.
- template - static int GetKernelOffset() { - CHECK_LT(static_cast(Target), static_cast(TARGET(NUM))); - CHECK_LT(static_cast(Precision), static_cast(PRECISION(NUM))); - CHECK_LT(static_cast(Layout), static_cast(DATALAYOUT(NUM))); - return static_cast(Target) * static_cast(PRECISION(NUM)) * - static_cast(DATALAYOUT(NUM)) + // - static_cast(Precision) * static_cast(DATALAYOUT(NUM)) + // - static_cast(Layout); - } - - std::string DebugString() const { -#ifndef LITE_ON_MODEL_OPTIMIZE_TOOL - return "No more debug info"; -#else // LITE_ON_MODEL_OPTIMIZE_TOOL - STL::stringstream ss; - ss << "\n"; - ss << "Count of kernel kinds: "; - int count = 0; - for (auto &item : kernel_info_map_) { - for (auto &kernel : item.second) ++count; - } - ss << count << "\n"; - - ss << "Count of registered kernels: " << kernel_info_map_.size() << "\n"; - for (auto &item : kernel_info_map_) { - ss << "op: " << item.first << "\n"; - for (auto &kernel : item.second) { - ss << " - (" << TargetToStr(std::get<0>(kernel)) << ","; - ss << PrecisionToStr(std::get<1>(kernel)) << ","; - ss << DataLayoutToStr(std::get<2>(kernel)); - ss << ")"; - ss << "\n"; - } - } - - return ss.str(); -#endif // LITE_ON_MODEL_OPTIMIZE_TOOL - } - - private: - mutable std::vector registries_; -#ifndef LITE_ON_TINY_PUBLISH - mutable std::map< - std::string, - std::vector>> - kernel_info_map_; -#endif -}; - -template -class KernelRegistor : public lite::Registor { - public: - KernelRegistor(const std::string &op_type, const std::string &alias) - : Registor([=] { - KernelRegistry::Global().Register( - op_type, [=]() -> std::unique_ptr { - std::unique_ptr x(new KernelType); - x->set_op_type(op_type); - x->set_alias(alias); - return x; - }); - }) {} -}; - -} // namespace lite -} // namespace paddle - -// Operator registry -#define LITE_OP_REGISTER_INSTANCE(op_type__) op_type__##__registry__instance__ -#define REGISTER_LITE_OP(op_type__, OpClass) \ - static paddle::lite::OpLiteRegistor LITE_OP_REGISTER_INSTANCE( \ - op_type__)(#op_type__); \ - int touch_op_##op_type__() { \ - return LITE_OP_REGISTER_INSTANCE(op_type__).Touch(); \ - } - -// Kernel registry -#define LITE_KERNEL_REGISTER(op_type__, target__, precision__) \ - op_type__##__##target__##__##precision__##__registor__ -#define LITE_KERNEL_REGISTER_INSTANCE( \ - op_type__, target__, precision__, layout__, alias__) \ - op_type__##__##target__##__##precision__##__registor__instance__##alias__ -#define LITE_KERNEL_REGISTER_FAKE(op_type__, target__, precision__, alias__) \ - LITE_KERNEL_REGISTER_INSTANCE(op_type__, target__, precision__, alias__) - -#define REGISTER_LITE_KERNEL( \ - op_type__, target__, precision__, layout__, KernelClass, alias__) \ - static paddle::lite::KernelRegistor \ - LITE_KERNEL_REGISTER_INSTANCE( \ - op_type__, target__, precision__, layout__, alias__)(#op_type__, \ - #alias__); \ - static KernelClass LITE_KERNEL_INSTANCE( \ - op_type__, target__, precision__, layout__, alias__); \ - int touch_##op_type__##target__##precision__##layout__##alias__() { \ - LITE_KERNEL_INSTANCE(op_type__, target__, precision__, layout__, alias__) \ - .Touch(); \ - return 0; \ - } \ - static bool LITE_KERNEL_PARAM_INSTANCE( \ - op_type__, target__, precision__, layout__, alias__) \ - __attribute__((unused)) = \ - paddle::lite::ParamTypeRegistry::NewInstance( \ - #op_type__ "/" #alias__) - -#define LITE_KERNEL_INSTANCE( \ - op_type__, target__, precision__, layout__, alias__) \ - op_type__##target__##precision__##layout__##alias__ -#define LITE_KERNEL_PARAM_INSTANCE( \ - op_type__, target__, 
precision__, layout__, alias__) \ - op_type__##target__##precision__##layout__##alias__##param_register diff --git a/lite/core/optimizer.cc b/lite/core/optimizer.cc deleted file mode 100644 index 38a64a589f..0000000000 --- a/lite/core/optimizer.cc +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/optimizer.h" -#include -#include "lite/core/mir/static_kernel_pick_pass.h" -#include "lite/core/mir/type_target_cast_pass.h" -#include "lite/model_parser/model_parser.h" -#include "lite/utils/all.h" - -namespace paddle { -namespace lite { - -void Optimizer::SpecifyKernelPickTactic(core::KernelPickFactor factor) { - auto* pass = mir::PassManager::Global().LookUp( - "static_kernel_pick_pass"); - CHECK(pass); - - *pass->mutable_kernel_pick_factors() = factor; -} - -} // namespace lite -} // namespace paddle diff --git a/lite/core/optimizer.h b/lite/core/optimizer.h deleted file mode 100644 index 031ffded45..0000000000 --- a/lite/core/optimizer.h +++ /dev/null @@ -1,213 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include -#include "lite/core/mir/generate_program_pass.h" -#include "lite/core/mir/pass_manager.h" -#include "lite/core/mir/ssa_graph.h" -#include "lite/core/mir/static_kernel_pick_pass.h" -#include "lite/core/mir/type_target_cast_pass.h" -#include "lite/core/program.h" -#include "lite/core/types.h" -#include "lite/model_parser/model_parser.h" -#ifdef LITE_WITH_NPU -#include "lite/core/mir/subgraph/generate_npu_program_pass.h" -#endif - -namespace paddle { -namespace lite { - -/* - * lite::Optimizer optimize a program. It utilize the mir passes to analysis the - * program and export an optimized program. 
- */ -class Optimizer { - public: - void Run(Program&& program, - const std::vector& valid_places, - core::KernelPickFactor kernel_pick_factor, - const std::vector& passes = {}) { - program_ = &program; - valid_places_ = valid_places; - CHECK(!valid_places.empty()) << "At least one valid_place should be set"; - CHECK(!graph_) << "duplicate optimize found"; - graph_.reset(new mir::SSAGraph); - graph_->Build(program, valid_places); - graph_->SetValidPlaces(valid_places); - - SpecifyKernelPickTactic(kernel_pick_factor); - InitTargetTypeTransformPass(); - - if (passes.empty()) { - RunPasses(std::vector{ - {"lite_quant_dequant_fuse_pass", // - "lite_conv_elementwise_fuse_pass", // conv-elemwise-bn - "lite_conv_bn_fuse_pass", // - "lite_conv_elementwise_fuse_pass", // conv-bn-elemwise - // This pass is disabled to force some opencl kernels selected for - // final running, otherwise, they will be fused to ARM fusion - // kernels, and the OpenCL devices will be discarded. - // TODO(Superjomn) Refine the fusion related design to select fusion - // kernels for devices automatically. - "lite_conv_activation_fuse_pass", // - "lite_fc_fuse_pass", // - "lite_shuffle_channel_fuse_pass", // - "lite_transpose_softmax_transpose_fuse_pass", // - "lite_interpolate_fuse_pass", // - "identity_scale_eliminate_pass", // -#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK - "lite_elementwise_add_activation_fuse_pass", // -#endif - "static_kernel_pick_pass", // - "variable_place_inference_pass", // - "argument_type_display_pass", // - - "type_target_cast_pass", // - "variable_place_inference_pass", // - "argument_type_display_pass", // - - "io_copy_kernel_pick_pass", // - "variable_place_inference_pass", // - "argument_type_display_pass", // - - "type_precision_cast_pass", // - "variable_place_inference_pass", // - "argument_type_display_pass", // - - "type_layout_cast_pass", // - "variable_place_inference_pass", // - "argument_type_display_pass", // - - "runtime_context_assign_pass", - "graph_visualze"}}); - } else { - RunPasses(passes); - } - exec_scope_ = program.exec_scope(); - } - - void KernelPickPreferPlace(const Place& place) { - auto* pass = mir::PassManager::Global().LookUp( - "static_kernel_pick_pass"); - CHECK(pass); - pass->SetPreferPlace(place); - } - - const lite::Scope* exec_scope() const { return exec_scope_; } - - // Generate a new program based on the mir graph. - std::unique_ptr GenRuntimeProgram() { -#ifdef LITE_WITH_NPU - if (std::find(valid_places_.begin(), - valid_places_.end(), - Place{TARGET(kNPU), PRECISION(kFloat)}) != - valid_places_.end()) { - CheckInputDimsNotEmpty(exec_scope_); - auto pass = mir::PassManager::Global() - .LookUp( - "generate_npu_program_pass"); - try { - pass->Apply(graph_); - auto program = pass->GenProgram(); - CHECK(exec_scope_); - program->set_exec_scope(exec_scope_); - return program; - } catch (...) 
{ - LOG(WARNING) << "Build NPU graph failed"; - } - } -#endif - auto pass = mir::PassManager::Global().LookUp( - "generate_program_pass"); - pass->Apply(graph_); - auto program = pass->GenProgram(); - CHECK(exec_scope_); - program->set_exec_scope(exec_scope_); - return program; - } - - // check the input dims in the scope, must not be empty - void CheckInputDimsNotEmpty(const lite::Scope* scope) { - CHECK(scope); - auto* feed_var = scope->FindVar("feed"); - CHECK(feed_var) << "no feed variable in exec_scope: " << scope; - auto* feed_tensor_list = feed_var->GetMutable>(); - CHECK_GE(feed_tensor_list->size(), 1); - for (size_t i = 0; i < feed_tensor_list->size(); ++i) { - CHECK(!feed_tensor_list->at(i).dims().empty()) - << "Input " << i << " dims can not be empty."; - } - } - - void InitTargetTypeTransformPass() { - auto* pass = - mir::PassManager::Global().LookUp( - "type_target_cast_pass"); - CHECK(pass); - CHECK(!valid_places_.empty()); - pass->SetValidPlaces(valid_places_); - } - - // Generate C++ code which combines the inference program, model and weights. - void GenCode(const std::string& code_dir); - - const mir::SSAGraph& ssa_graph() const { - CHECK(graph_); - return *graph_; - } - - mir::SSAGraph* mutable_ssa_graph() { - CHECK(graph_); - return graph_.get(); - } - - lite::Scope* exec_scope() { return exec_scope_; } - - protected: - void SpecifyKernelPickTactic(core::KernelPickFactor factor); - - // Specify the passes and run them. - void RunPasses(const std::vector& passes) { - for (auto& x : passes) { - LOG(INFO) << "== Running pass: " << x; - mir::Pass* pass = mir::PassManager::Global().LookUp(x); - CHECK(pass) << "Can not find pass: " << x; - bool supported = false; - for (const auto& place : valid_places_) { - if (pass->is_supported_target(place.target)) { - supported = true; - } - } - if (!supported) { - LOG(WARNING) << "Skip " << x - << " pass because the target does not match."; - } else { - pass->Apply(graph_); - LOG(INFO) << "== Finished running: " << x; - } - } - } - - private: - std::unique_ptr graph_; - std::vector valid_places_; - lite::Scope* exec_scope_{}; - Program* program_{}; -}; - -} // namespace lite -} // namespace paddle diff --git a/lite/core/optimizer_test.cc b/lite/core/optimizer_test.cc deleted file mode 100644 index ba5bc01b58..0000000000 --- a/lite/core/optimizer_test.cc +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
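An aside on `RunPasses` above: a pass is skipped when none of the valid places' targets is supported by it, which is how target-specific passes (e.g. the NPU program generator) stay inert on other builds. The sketch below is a toy rendition of that filtering; the `Pass` struct and the empty-set-means-any rule are simplifications, not lite's real `mir::Pass` interface.

#include <iostream>
#include <set>
#include <string>
#include <vector>

// Toy pass with a supported-target whitelist; an empty set loosely stands in
// for "supported on any target".
struct Pass {
  std::string name;
  std::set<std::string> supported_targets;
  bool Supports(const std::string& target) const {
    return supported_targets.empty() || supported_targets.count(target) > 0;
  }
};

int main() {
  const std::vector<std::string> valid_targets = {"kARM"};
  const std::vector<Pass> passes = {{"static_kernel_pick_pass", {}},
                                    {"generate_npu_program_pass", {"kNPU"}}};
  for (const auto& pass : passes) {
    bool supported = false;
    for (const auto& t : valid_targets) {
      if (pass.Supports(t)) supported = true;
    }
    // Mirrors the warning-and-skip behavior in Optimizer::RunPasses.
    std::cout << pass.name << (supported ? ": run" : ": skip") << "\n";
  }
  return 0;
}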
- -#include "lite/core/optimizer.h" -#include -#include -#include -#include "lite/api/paddle_use_passes.h" -#include "lite/core/mir/generate_program_pass.h" -#include "lite/core/mir/pass_manager.h" -#include "lite/core/mir/static_kernel_pick_pass.h" -#include "lite/core/program_fake_utils.h" - -namespace paddle { -namespace lite { - -TEST(Optimizer, test) { - Optimizer optimizer; - auto program_faker = ProgramFaker(); - program_faker.AddFeed("X", 0); - program_faker.AddFetch("X", 0); - - std::vector places({Place{TARGET(kHost), PRECISION(kFloat)}}); - - core::KernelPickFactor factor; - factor.ConsiderTarget(); - - auto scope = std::make_shared(); - auto program_proto = *program_faker.program()->Proto(); - Program program(program_proto, scope, places); - optimizer.Run(std::move(program), places, factor); - auto runtime_program = optimizer.GenRuntimeProgram(); - LOG(INFO) << "num statements " << runtime_program->num_instructions(); -} - -} // namespace lite -} // namespace paddle - -USE_LITE_OP(fc); -USE_LITE_KERNEL(fc, kHost, kFloat, kNCHW, def); diff --git a/lite/core/profile/CMakeLists.txt b/lite/core/profile/CMakeLists.txt deleted file mode 100644 index de8a60bdc2..0000000000 --- a/lite/core/profile/CMakeLists.txt +++ /dev/null @@ -1,8 +0,0 @@ -if (NOT LITE_WITH_PROFILE) - return() -endif() - -lite_cc_library(basic_profiler SRCS basic_profiler.cc) -lite_cc_test(test_basic_profiler SRCS basic_profiler_test.cc DEPS basic_profiler) - - diff --git a/lite/core/profile/basic_profiler.cc b/lite/core/profile/basic_profiler.cc deleted file mode 100644 index 031b86beb6..0000000000 --- a/lite/core/profile/basic_profiler.cc +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/profile/basic_profiler.h" - -namespace paddle { -namespace lite { -namespace profile { - -const int BasicTimer::data_w = 10; -const int BasicTimer::name_w = 15; - -} // namespace profile -} // namespace lite -} // namespace paddle diff --git a/lite/core/profile/basic_profiler.h b/lite/core/profile/basic_profiler.h deleted file mode 100644 index f55a5764a0..0000000000 --- a/lite/core/profile/basic_profiler.h +++ /dev/null @@ -1,210 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -/* - * This file implements BasicProfile, a profiler that helps to profile the basic - * CPU execution. 
It can display the min, max, and average latency of the execution
- * of each kernel.
- */
-#pragma once
-#include <algorithm>
-#include <chrono>  // NOLINT
-#include <iomanip>
-#include <limits>
-#include <memory>
-#include <string>
-#include <vector>
-#include "lite/utils/cp_logging.h"
-#include "lite/utils/replace_stl/stream.h"
-#include "lite/utils/string.h"
-
-namespace paddle {
-namespace lite {
-namespace profile {
-
-/* Base class of all the profile records */
-template <typename ChildT>
-class TimerBase {
- public:
-  void Start() { self()->Start(); }
-  void Stop() { self()->Stop(); }
-  void Log(uint32_t x) { return self()->Log(x); }
-  std::string basic_repr() const { return const_self()->basic_repr(); }
-
-  void SetId(int id) { self()->SetId(id); }
-  void SetKey(const std::string &key) { self()->SetKey(key); }
-
-  int id() const { return const_self()->id(); }
-
- protected:
-  ChildT *self() { return reinterpret_cast<ChildT *>(this); }
-  const ChildT *const_self() const {
-    return reinterpret_cast<const ChildT *>(this);
-  }
-};
-
-class BasicTimer : TimerBase<BasicTimer> {
-  uint64_t total_{};
-  uint64_t count_{};
-  uint32_t max_{std::numeric_limits<uint32_t>::min()};
-  uint32_t min_{std::numeric_limits<uint32_t>::max()};
-  int id_{-1};
-  std::string key_;
-  uint64_t timer_{};
-
-  // TODO(Superjomn) make static
-  static const int name_w;
-  static const int data_w;
-
- public:
-  BasicTimer() = default;
-  BasicTimer(int id, const std::string &key) : id_(id), key_(key) {}
-
-  void SetId(int id) { id_ = id; }
-  void SetKey(const std::string &key) { key_ = key; }
-  void Start() {
-    timer_ = static_cast<uint64_t>(
-        std::chrono::duration_cast<std::chrono::microseconds>(
-            std::chrono::system_clock::now().time_since_epoch())
-            .count());
-  }
-  void Stop() {
-    auto duration = static_cast<
-        uint64_t>(  // timer unit: microsecond, 1 second = 1e6 microseconds
-        std::chrono::duration_cast<std::chrono::microseconds>(
-            std::chrono::system_clock::now().time_since_epoch())
-            .count() -
-        timer_);
-    Log(duration);
-  }
-
-  int count() const { return count_; }
-
-  void Log(uint32_t timespan) {
-    total_ += timespan;
-    max_ = std::max(max_, timespan);
-    min_ = std::min(min_, timespan);
-    count_++;
-  }
-
-  static std::string basic_repr_header() {
-    STL::stringstream ss;
-    ss << std::setw(name_w) << "kernel"   //
-       << std::setw(data_w) << "average"  //
-       << std::setw(data_w) << "min"      //
-       << std::setw(data_w) << "max"      //
-       << std::setw(data_w) << "count";
-    return ss.str();
-  }
-
-  std::string basic_repr() const {
-    STL::stringstream ss;
-    ss << std::setw(name_w) << key()  //
-       << std::setw(data_w) << ave()  //
-       << std::setw(data_w) << min()  //
-       << std::setw(data_w) << max()  //
-       << std::setw(data_w) << count_;
-    return ss.str();
-  }
-
-  const std::string &key() const { return key_; }
-
-  int id() const {
-    CHECK_GE(id_, 0) << "id is not inited";
-    return id_;
-  }
-
-  double ave() const { return total_ * 1. / count_; }
-  double max() const { return max_; }
-  double min() const { return min_; }
-
-  // BasicRecord(const BasicRecord &) = delete;
-  void operator=(const BasicTimer &) = delete;
-};
-
-/*
- * A basic profiler, where each record logs the total latency.
- */ -template -class BasicProfiler { - public: - explicit BasicProfiler(const std::string &name) : name_(name) {} - using record_t = TimerT; - - static BasicProfiler &Global() { - static std::unique_ptr x(new BasicProfiler("[global]")); - return *x; - } - - record_t &NewRcd(const std::string &key) { - records_.emplace_back(); - records_.back().SetId(records_.size() - 1); - records_.back().SetKey(key); - return records_.back(); - } - - const record_t &record(int id) { - CHECK_LT(id, records_.size()); - CHECK_GE(id, 0); - return records_[id]; - } - - record_t *mutable_record(int id) { - CHECK_GE(id, 0); - CHECK_LT(static_cast(id), records_.size()); - return &records_[id]; - } - - std::string basic_repr() const { - STL::stringstream ss; - for (const auto &rcd : records_) { - ss << rcd.basic_repr() << "\n"; - } - return ss.str(); - } - - ~BasicProfiler() { - LOG(INFO) << "Profile dumps:"; - LOG(INFO) << "\n" + BasicTimer::basic_repr_header() + "\n" + basic_repr(); - } - - private: - std::string name_; - std::vector records_; -}; - -struct ProfileBlock { - explicit ProfileBlock(int id) : id_(id) { - BasicProfiler::Global().mutable_record(id_)->Start(); - } - - ~ProfileBlock() { - BasicProfiler::Global().mutable_record(id_)->Stop(); - } - - private: - int id_{}; -}; - -#define LITE_PROFILE_ONE(key__) \ - static int key__##__profiler_id = \ - ::paddle::lite::profile::BasicProfiler< \ - ::paddle::lite::profile::BasicTimer>::Global() \ - .NewRcd(#key__) \ - .id(); \ - ::paddle::lite::profile::ProfileBlock key__##profiler__(key__##__profiler_id); - -} // namespace profile -} // namespace lite -} // namespace paddle diff --git a/lite/core/profile/basic_profiler_test.cc b/lite/core/profile/basic_profiler_test.cc deleted file mode 100644 index 928fdd61cb..0000000000 --- a/lite/core/profile/basic_profiler_test.cc +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/profile/basic_profiler.h" -#include -#include // NOLINT -#include // NOLINT -#include "lite/utils/cp_logging.h" - -namespace paddle { -namespace lite { -namespace profile { - -TEST(basic_record, init) { - BasicTimer timer; - timer.SetKey("hello"); -} - -TEST(basic_profile, init) { - auto& rcd = BasicProfiler::Global().NewRcd("fc"); - for (int i = 11; i < 100; i++) { - rcd.Log(i); - } - - LOG(INFO) << BasicProfiler::Global().basic_repr(); -} - -TEST(basic_profile, real_latency) { - LITE_PROFILE_ONE(test0); - std::this_thread::sleep_for(std::chrono::milliseconds(1200)); -} - -} // namespace profile -} // namespace lite -} // namespace paddle diff --git a/lite/core/profile/precision_profiler.h b/lite/core/profile/precision_profiler.h deleted file mode 100644 index d9111e5c46..0000000000 --- a/lite/core/profile/precision_profiler.h +++ /dev/null @@ -1,137 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -/* - * This file implements BasicProfile, a profiler that helps to profile the basic - * CPU execution. It can display the min, max, average lantency of the execution - * of each kernel. - */ -#pragma once -#include -#include -#include "lite/core/program.h" - -namespace paddle { -namespace lite { -namespace profile { - -template -static void write_tensorfile(const Tensor* tensor, const std::string& locate) { - if (locate.find('/') != std::string::npos) { - return; - } - FILE* fp = fopen(locate.c_str(), "w"); - if (fp == nullptr) { - LOG(ERROR) << "file open field " << locate; - } else { - const dtype* data = tensor->data(); - for (int i = 0; i < tensor->numel(); ++i) { - fprintf(fp, "[%d] %f \n", i, static_cast(data[i])); - } - } - fclose(fp); -} - -class PrecisionProfiler { - public: - explicit PrecisionProfiler(const Instruction* inst) : inst_(inst) {} - ~PrecisionProfiler() { - LOG(INFO) << ">> Running kernel: " << inst_->op()->op_info()->Repr() - << " on Target " << TargetToStr(inst_->kernel()->target()) << " " - << PrecisionToStr(inst_->kernel()->precision()); - auto tensor_mean = [](const Tensor* in, - PrecisionType ptype, - std::string name = "inst") -> double { - if (!in->data()) { - return -99999; - } - double sum = 0.; - switch (ptype) { - case PRECISION(kFloat): { - auto ptr = in->data(); - // write_tensorfile(in, name); - for (int i = 0; i < in->numel(); ++i) { - sum += ptr[i]; - } - return sum / in->numel(); - } - case PRECISION(kAny): { - auto ptr = in->data(); - // write_tensorfile(in, name); - for (int i = 0; i < in->numel(); ++i) { - sum += ptr[i]; - } - return sum / in->numel(); - } - case PRECISION(kInt8): { - auto ptr = in->data(); - // write_tensorfile(in, name); - for (int i = 0; i < in->numel(); ++i) { - sum += ptr[i]; - } - return sum / in->numel(); - } - case PRECISION(kInt32): { - auto ptr = in->data(); - // write_tensorfile(in, name); - for (int i = 0; i < in->numel(); ++i) { - sum += ptr[i]; - } - return sum / in->numel(); - } - default: - LOG(INFO) << "unsupport data type: " << PrecisionToStr(ptype); - return 0.; - } - }; - if (inst_->op()->op_info()->Type() != "fetch") { - auto op = const_cast(inst_->op()); - auto kernel = inst_->kernel(); - auto op_scope = op->scope(); - auto out_names = op->op_info()->output_names(); - for (auto& out_name : out_names) { - std::string out_arg_name; - op->op_info()->GetOutputArgname(out_name, &out_arg_name); - auto type = kernel->GetOutputDeclType(out_arg_name); - - if (type->IsTensor()) { - auto tout = op_scope->FindVar(out_name)->GetMutable(); - double mean = tensor_mean(tout, type->precision(), out_name); - LOG(INFO) << "output name: " << out_name << ", dims: " << tout->dims() - << ", precision: " << PrecisionToStr(type->precision()) - << ", mean value: " << mean << " shape:" << tout->dims(); - } else if (type->IsTensorList()) { - auto tout = - op_scope->FindVar(out_name)->GetMutable>(); - for (auto& t : *tout) { - double mean = tensor_mean(&t, type->precision(), 
out_name); - LOG(INFO) << "output name: " << out_name << ", dims: " << t.dims() - << ", precision: " << PrecisionToStr(type->precision()) - << ", mean value: " << mean; - } - } - } - } - } - - private: - const Instruction* inst_{nullptr}; -}; - -} // namespace profile -} // namespace lite -} // namespace paddle - -#define LITE_PRECISION_PROFILE(inst) \ - { auto a = paddle::lite::profile::PrecisionProfiler(&inst); } diff --git a/lite/core/program.cc b/lite/core/program.cc deleted file mode 100644 index 179cdf909a..0000000000 --- a/lite/core/program.cc +++ /dev/null @@ -1,208 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/program.h" -#include -#include "lite/model_parser/cpp/block_desc.h" -#include "lite/model_parser/cpp/op_desc.h" -#include "lite/model_parser/cpp/var_desc.h" -#include "lite/operators/while_op.h" -#ifdef LITE_WITH_PROFILE -#include "lite/core/profile/precision_profiler.h" -#endif - -namespace paddle { -namespace lite { - -void RuntimeProgram::SaveOpInfosToProgram(cpp::ProgramDesc* desc) { - CHECK(desc); - // NOTE: RuntimeProgram do not has all meta info, so save model just update - // upon origin model - CHECK(desc->BlocksSize()); - auto& main_block = *desc->GetBlock(0); - main_block.ClearOps(); - for (auto& node : instructions_) { - auto* op = main_block.AddOp(); - *op = *node.op()->op_info(); - op->SetAttr(kKernelTypeAttr, node.kernel()->SerializedKernelType()); - } -} - -// `UpdateVarsOfProgram` will remove unused var_descs and add new created -// vars' descs in the block 0. Now, the type of a new created var can only -// be LOD_TENSOR. 
-void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) { - CHECK(desc); - CHECK(desc->BlocksSize()); - std::unordered_map origin_var_maps; - auto& main_block = *desc->GetBlock(0); - auto var_size = main_block.VarsSize(); - for (int i = 0; i < var_size; i++) { - auto v = main_block.GetVar(i); - auto name = v->Name(); - origin_var_maps.emplace(name, *v); - } - - main_block.ClearVars(); - for (auto& node : instructions_) { - auto* op = const_cast(node.op()); - auto* kernel = node.kernel(); - auto* scope = op->scope(); - auto in_names = op->op_info()->input_names(); - auto out_names = op->op_info()->output_names(); - for (auto& in_name : in_names) { - auto it = origin_var_maps.find(in_name); - if (it != origin_var_maps.end()) { - auto* v = main_block.AddVar(); - v->SetName((it->second).Name()); - v->SetType((it->second).GetType()); - v->SetPersistable((it->second).Persistable()); - } else { - // New created vars must be LOD_TENSOR - auto* v = main_block.AddVar(); - v->SetName(in_name); - v->SetType(cpp::VarDesc::Type::LOD_TENSOR); - std::string in_arg_name; - op->op_info()->GetInputArgname(in_name, &in_arg_name); - auto type = kernel->GetInputDeclType(in_arg_name); - if (type->IsTensor()) { - auto tensor = scope->FindVar(in_name)->GetMutable(); - v->SetPersistable(tensor->persistable()); - } else { - CHECK(false) << "unsupported var type"; - } - } - } - - for (auto& out_name : out_names) { - auto it = origin_var_maps.find(out_name); - if (it != origin_var_maps.end()) { - auto* v = main_block.AddVar(); - v->SetName((it->second).Name()); - v->SetType((it->second).GetType()); - v->SetPersistable((it->second).Persistable()); - } else { - // New created vars must be LOD_TENSOR - auto* v = main_block.AddVar(); - v->SetName(out_name); - v->SetType(cpp::VarDesc::Type::LOD_TENSOR); - std::string out_arg_name; - op->op_info()->GetOutputArgname(out_name, &out_arg_name); - auto type = kernel->GetOutputDeclType(out_arg_name); - if (type->IsTensor()) { - auto tensor = scope->FindVar(out_name)->GetMutable(); - v->SetPersistable(tensor->persistable()); - } else { - CHECK(false) << "unsupported var type"; - } - } - } - } -} - -void RuntimeProgram::Run() { - for (auto& inst : instructions_) { - VLOG(4) << ">> Running kernel: " << inst.op()->op_info()->Repr() - << " on Target " << TargetToStr(inst.kernel()->target()); - - inst.Run(); -#ifdef LITE_WITH_PROFILE -#ifdef LITE_WITH_PRECISION_PROFILE - LITE_PRECISION_PROFILE(inst) -#endif // LITE_WITH_PRECISION_PROFILE -#endif // LITE_WITH_PROFILE - } -} - -void Program::Build(const cpp::ProgramDesc& prog) { - CHECK(ops_.empty()) << "Executor duplicate Build found"; - - // Create operators. 
- auto program = prog; - CHECK(program.BlocksSize()); - auto& main_block = *program.GetBlock(0); - for (size_t i = 0; i < main_block.OpsSize(); ++i) { - auto& op_desc = *main_block.GetOp(i); - auto op_type = op_desc.Type(); - // if (op_type == "feed" || op_type == "fetch") continue; - VLOG(4) << "create Op [" << op_type << "]"; - auto op = LiteOpRegistry::Global().Create(op_type); - CHECK(op) << "no Op found for " << op_type; - if (op_type == "while") { - auto sub_block_idx = op_desc.GetAttr("sub_block"); - auto sub_block = - const_cast(prog).GetBlock( - sub_block_idx); - static_cast(op.get())->SetSubBlock(sub_block); - } - ops_.emplace_back(std::move(op)); - ops_.back()->Attach(op_desc, exec_scope_); - } -} - -void Program::PrepareWorkspace(const cpp::ProgramDesc& prog) { - CHECK(!exec_scope_) << "Duplicate PrepareWorkspace found"; - exec_scope_ = &scope_->NewScope(); - // Create Feed and Fetch var. - scope_->Var("feed")->GetMutable>(); - scope_->Var("fetch")->GetMutable>(); - tmp_vars_.push_back("feed"); - tmp_vars_.push_back("fetch"); - - auto program = prog; - CHECK(program.BlocksSize()); - for (size_t b = 0; b < program.BlocksSize(); ++b) { - auto& main_block = *program.GetBlock(b); - for (size_t i = 0; i < main_block.VarsSize(); ++i) { - auto& var_desc = *main_block.GetVar(i); - if (!var_desc.Persistable()) { - tmp_vars_.push_back(var_desc.Name()); - exec_scope_->Var(var_desc.Name()); - if (b > 0) { - VLOG(4) << "var: " << var_desc.Name(); - } - } else { - if (var_desc.Name() == "feed" || var_desc.Name() == "fetch") continue; - weights_.push_back(var_desc.Name()); - if (var_desc.Persistable()) scope_->Var(var_desc.Name()); - } - } - } -} - -void Instruction::Run() { -#ifdef LITE_WITH_PROFILE - profile::ProfileBlock x(profile_id_); -#endif // LITE_WITH_PROFILE - CHECK(op_) << "op null"; - CHECK(kernel_) << "kernel null"; - if (first_epoch_) { - first_epoch_ = false; - CHECK(op_->CheckShape()); - } - - if (op_->run_once() && has_run_) return; - VLOG(4) << "kernel launch"; - op_->InferShape(); - kernel_->Launch(); - has_run_ = true; -} - -STL::ostream& operator<<(STL::ostream& os, const Instruction& other) { - os << other.kernel_->summary() << "\t(" << other.kernel_->doc() << ")"; - return os; -} - -} // namespace lite -} // namespace paddle diff --git a/lite/core/program.h b/lite/core/program.h deleted file mode 100644 index 1b3c036db5..0000000000 --- a/lite/core/program.h +++ /dev/null @@ -1,156 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
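An aside on `Instruction::Run` above: it gates work on two flags. `CheckShape` fires only on the first epoch, and ops marked `run_once` (feed/fetch style) execute a single time and are then skipped. The following is a minimal standalone rendition of that gating; `ToyOp` and `ToyInstruction` are stubs, not lite's real `OpLite`/`Instruction` types.

#include <iostream>

// Stub op exposing the three hooks the gating logic cares about.
struct ToyOp {
  bool run_once = false;
  void CheckShape() { std::cout << "check shape\n"; }
  void InferShape() { std::cout << "infer shape\n"; }
  void Launch() { std::cout << "launch kernel\n"; }
};

struct ToyInstruction {
  ToyOp op;
  bool first_epoch = true;
  bool has_run = false;
  void Run() {
    if (first_epoch) {
      first_epoch = false;
      op.CheckShape();  // shape check happens once, on the first call
    }
    if (op.run_once && has_run) return;  // run-once ops short-circuit
    op.InferShape();
    op.Launch();
    has_run = true;
  }
};

int main() {
  ToyInstruction feed;
  feed.op.run_once = true;
  feed.Run();  // checks shape, infers, launches
  feed.Run();  // skipped: run-once op has already executed
  return 0;
}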
- -#pragma once -#include -#include -#include -#include -#include -#include "lite/core/kernel.h" -#include "lite/core/op_lite.h" -#include "lite/core/op_registry.h" -#include "lite/model_parser/cpp/program_desc.h" -#ifdef LITE_WITH_PROFILE -#include "lite/core/profile/basic_profiler.h" -#endif // LITE_WITH_PROFILE - -namespace paddle { -namespace lite { - -static const char kKernelTypeAttr[] = "__@kernel_type_attr@__"; - -// A program is used to represent a code program, in Paddle, a code program -// contains: -// - main block, which is a list of OpLite -// - scope: which contains all the weights -struct Program { - public: - explicit Program(const std::shared_ptr& root) { scope_ = root; } - Program(const cpp::ProgramDesc& desc, - const std::shared_ptr& root, - const std::vector& valid_places) - : scope_(root), valid_places_(valid_places), desc_(desc) { - CHECK(scope_) << "scope should be init first"; - VLOG(4) << "prepare work"; - PrepareWorkspace(desc); - VLOG(4) << "build desc"; - Build(desc); - VLOG(4) << "build desc finished"; - } - - std::unique_ptr Clone() const { - std::unique_ptr res(new Program(desc_, scope_, valid_places_)); - return res; - } - - const std::list& weights() const { return weights_; } - const std::list& tmp_vars() const { return tmp_vars_; } - std::list* mutable_weights() { return &weights_; } - std::list* mutable_tmp_vars() { return &tmp_vars_; } - - const std::list>& ops() const { return ops_; } - std::list>* mutable_ops() { return &ops_; } - - lite::Scope* exec_scope() { return exec_scope_; } - lite::Scope* scope() { return scope_.get(); } - - private: - // Build from a program and scope. - void Build(const cpp::ProgramDesc& program); - // Create temporary variables. - void PrepareWorkspace(const cpp::ProgramDesc& program); - - private: - std::list tmp_vars_; - std::list weights_; - std::list> ops_; - // the scope to run the kernels, NOTE this is the execution scope. - std::shared_ptr scope_; - std::vector valid_places_; - // Runtime scope. - lite::Scope* exec_scope_{}; - cpp::ProgramDesc desc_; -}; - -struct Instruction { - Instruction(const std::shared_ptr& op, - std::unique_ptr&& kernel) - : op_(op), kernel_(std::move(kernel)) { -#ifdef LITE_WITH_PROFILE - profile_id_ = profile::BasicProfiler::Global() - .NewRcd(kernel_->SerializedKernelType()) - .id(); -#endif // LITE_WITH_PROFILE - } - - // Run the instruction. - void Run(); - - friend STL::ostream& operator<<(STL::ostream& os, const Instruction& other); - - const OpLite* op() const { return op_.get(); } - const KernelBase* kernel() const { return kernel_.get(); } - KernelBase* mutable_kernel() { return kernel_.get(); } - - private: - std::shared_ptr op_; - std::unique_ptr kernel_; - bool first_epoch_{true}; - bool has_run_{false}; - -#ifdef LITE_WITH_PROFILE - // for profiler - int profile_id_{-1}; -#endif // LITE_WITH_PROFILE -}; - -/* - * A program contains kernels for runtime. - */ -class LITE_API RuntimeProgram { - public: - explicit RuntimeProgram(std::vector&& insts) - : instructions_(std::move(insts)) { - if (instructions_.empty()) { - LOG(FATAL) << "no instructions"; - } - } - - void Run(); - - void set_exec_scope(lite::Scope* x) { exec_scope_ = x; } - lite::Scope* exec_scope() { return exec_scope_; } - - size_t num_instructions() const { return instructions_.size(); } - - const std::vector& instructions() const { return instructions_; } - - // `SaveOpInfosToProgram` will update the op list(ops_) of the block 0 - // in ProgramDesc. 
-  void SaveOpInfosToProgram(cpp::ProgramDesc* desc);
-
-  // `UpdateVarsOfProgram` will update the var list(vars_) of the block 0 in
-  // ProgramDesc. Namely, if a new var is created in some pass, its var_desc
-  // will be added to vars_.
-  void UpdateVarsOfProgram(cpp::ProgramDesc* desc);
-
- private:
-  RuntimeProgram(const RuntimeProgram&) = delete;
-  std::vector<Instruction> instructions_;
-  lite::Scope* exec_scope_{};
-};
-
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/program_fake_utils.cc b/lite/core/program_fake_utils.cc
deleted file mode 100644
index b4d7a00dfa..0000000000
--- a/lite/core/program_fake_utils.cc
+++ /dev/null
@@ -1,22 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/core/program_fake_utils.h"
-#include "lite/core/op_registry.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/program_fake_utils.h b/lite/core/program_fake_utils.h
deleted file mode 100644
index edcbb101aa..0000000000
--- a/lite/core/program_fake_utils.h
+++ /dev/null
@@ -1,142 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
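The `Program`/`RuntimeProgram` split above leans on the variable classification done in `Program::PrepareWorkspace()` shown earlier: persistable vars (except the magic `feed`/`fetch` vars) become weights owned by the root scope, everything else is a temporary created in the execution scope. A dependency-free sketch of that rule, with purely illustrative variable names:

```cpp
#include <iostream>
#include <string>
#include <vector>

// Restates the weight/temporary split from Program::PrepareWorkspace().
struct VarDesc {
  std::string name;
  bool persistable;
};

int main() {
  std::vector<VarDesc> block = {
      {"conv1_w", true}, {"conv1_out", false}, {"feed", true}, {"fetch", true}};
  std::vector<std::string> weights;
  std::vector<std::string> tmp_vars = {"feed", "fetch"};
  for (const auto& v : block) {
    if (!v.persistable) {
      tmp_vars.push_back(v.name);  // lives in exec_scope_
    } else if (v.name != "feed" && v.name != "fetch") {
      weights.push_back(v.name);   // lives in the root scope_
    }
  }
  for (const auto& w : weights) std::cout << "weight:  " << w << "\n";
  for (const auto& t : tmp_vars) std::cout << "tmp var: " << t << "\n";
}
```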
-
-#pragma once
-#include <memory>
-#include <set>
-#include <string>
-#include <utility>
-#include <vector>
-#include "lite/core/mir/ssa_graph.h"
-#include "lite/core/op_registry.h"
-#include "paddle/fluid/framework/program_desc.h"
-
-namespace paddle {
-namespace lite {
-
-Program FakeProgram() {
-  Program program(std::make_shared<lite::Scope>());
-
-  auto add_fc = [&](int id, std::string x) {
-    // create variables
-    std::string w1 = "w" + std::to_string(id);
-    std::string b1 = "b" + std::to_string(id);
-    std::string out1 = "out" + std::to_string(id);
-    auto w1v = program.scope()->Var(w1)->GetMutable<lite::Tensor>();
-    auto b1v = program.scope()->Var(b1)->GetMutable<lite::Tensor>();
-    auto out1v = program.scope()->Var(out1)->GetMutable<lite::Tensor>();
-
-    cpp::OpDesc desc;
-    desc.SetInput("Input", {x});
-    desc.SetInput("W", {w1});
-    desc.SetInput("Bias", {b1});
-    desc.SetOutput("Out", {out1});
-    desc.SetType("fc");
-    desc.SetAttr("in_num_col_dims", 1);
-
-    // add to input
-    program.mutable_tmp_vars()->push_back(w1);
-    program.mutable_tmp_vars()->push_back(b1);
-
-    auto fc_op = LiteOpRegistry::Global().Create("fc");
-    fc_op->Attach(desc, program.scope());
-    program.mutable_ops()->emplace_back(std::move(fc_op));
-
-    w1v->Resize(DDimHvy(std::vector<int64_t>({100, 100})));
-    b1v->Resize(DDimHvy(std::vector<int64_t>({100, 1})));
-    out1v->Resize(DDimHvy(std::vector<int64_t>({100, 100})));
-
-    return out1;
-  };
-
-  // x1, w1, b1 -fc-> out1
-  // out1, w2, b2 -fc-> out2
-
-  std::string x = "x";
-  program.mutable_tmp_vars()->push_back(x);
-  auto* xv = program.scope()->Var(x)->GetMutable<lite::Tensor>();
-  xv->Resize(DDimHvy(std::vector<int64_t>({100, 100})));
-
-  for (int i = 0; i < 3; i++) {
-    x = add_fc(i, x);
-  }
-  return program;
-}
-
-class ProgramFaker {
- public:
-  ProgramFaker() {}
-
-  framework::ProgramDesc* program() {
-    desc_.Flush();
-    return &desc_;
-  }
-
-  void CreateVars(lite::Scope* scope) {
-    for (auto& var : tmp_vars_) {
-      auto* x = scope->Var(var);
-      x->GetMutable<lite::Tensor>();
-    }
-
-    for (auto& x : tmp_vars_) {
-      desc_.MutableBlock(0)->Var(x);
-    }
-  }
-
-  void AddMul(const std::string& X,
-              const std::string& Y,
-              const std::string& out) {
-    tmp_vars_.insert(X);
-    tmp_vars_.insert(Y);
-    tmp_vars_.insert(out);
-
-    auto* block = desc_.MutableBlock(0);
-    auto* op = block->AppendOp();
-    op->SetType("mul");
-    op->SetInput("X", {X});
-    op->SetInput("Y", {Y});
-    op->SetOutput("Out", {out});
-    op->SetAttr("x_num_col_dims", 1);
-    op->SetAttr("y_num_col_dims", 1);
-  }
-
-  void AddFeed(const std::string& Out, int col) {
-    tmp_vars_.insert(Out);
-
-    auto* block = desc_.MutableBlock(0);
-    auto* op = block->AppendOp();
-    op->SetType("feed");
-    op->SetInput("X", {"feed"});
-    op->SetOutput("Out", {Out});
-    op->SetAttr("col", col);
-  }
-
-  void AddFetch(const std::string& Input, int col) {
-    tmp_vars_.insert(Input);
-    auto* block = desc_.MutableBlock(0);
-    auto* op = block->AppendOp();
-    op->SetType("fetch");
-    op->SetInput("X", {Input});
-    op->SetOutput("Out", {"fetch"});
-    op->SetAttr("col", col);
-  }
-
- private:
-  std::set<std::string> tmp_vars_;
-  std::vector<std::string> weight_vars_;
-  framework::ProgramDesc desc_;
-};
-
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/scope.cc b/lite/core/scope.cc
deleted file mode 100644
index 775652e2a0..0000000000
--- a/lite/core/scope.cc
+++ /dev/null
@@ -1,72 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
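The chaining in `FakeProgram()` above is easy to miss in the lambda: each `add_fc(i, x)` consumes the previous output name and returns the new one, producing the `x -> out0 -> out1 -> out2` chain the comment describes. A dependency-free restatement of just that wiring:

```cpp
#include <iostream>
#include <string>

// Mirrors the name-threading in FakeProgram()'s add_fc lambda; no framework
// types involved, output is the dataflow the original comment sketches.
std::string add_fc(int id, const std::string& x) {
  std::string out = "out" + std::to_string(id);
  std::cout << x << ", w" << id << ", b" << id << " -fc-> " << out << "\n";
  return out;
}

int main() {
  std::string x = "x";
  for (int i = 0; i < 3; ++i) x = add_fc(i, x);  // x -> out0 -> out1 -> out2
}
```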
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/core/scope.h"
-
-namespace paddle {
-namespace lite {
-
-Scope::~Scope() {
-  for (auto *x : kids_) {
-    if (x) {
-      delete x;
-    }
-  }
-}
-
-Scope &Scope::NewScope() const {
-  kids_.push_back(new Scope);
-  kids_.back()->parent_ = this;
-  return *kids_.back();
-}
-
-Variable *Scope::Var(const std::string &name) {
-  auto *var = FindVar(name);
-  if (var) return var;
-
-  // create a new variable.
-  vars_.emplace(name, std::unique_ptr<Variable>(new Variable));
-  return vars_[name].get();
-}
-
-Variable *Scope::FindVar(const std::string &name) const {
-  Variable *var{nullptr};
-  var = FindLocalVar(name);
-  const Scope *cur_scope = this;
-  while (!var && cur_scope->parent()) {
-    cur_scope = cur_scope->parent();
-    var = cur_scope->FindLocalVar(name);
-  }
-
-  return var;
-}
-
-Variable *Scope::FindLocalVar(const std::string &name) const {
-  auto it = vars_.find(name);
-  if (it != vars_.end()) {
-    return it->second.get();
-  }
-  return nullptr;
-}
-
-std::vector<std::string> Scope::LocalVarNames() const {
-  std::vector<std::string> keys;
-  for (const auto &item : vars_) {
-    keys.push_back(item.first);
-  }
-  return keys;
-}
-
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/scope.h b/lite/core/scope.h
deleted file mode 100644
index 2593c36522..0000000000
--- a/lite/core/scope.h
+++ /dev/null
@@ -1,79 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <list>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-#include "lite/core/variable.h"
-
-namespace paddle {
-namespace lite {
-
-class Scope final {
- public:
-  Scope() {}
-  // delete below two functions to allow pybind to recognise it cannot make a
-  // copy
-  // link:
-  // https://stackoverflow.com/questions/53807248/pybind11-returning-a-pointer-to-a-container-of-unique-ptr
-  Scope(const Scope&) = delete;
-  Scope& operator=(const Scope&) = delete;
-  ~Scope();
-
-  Scope& NewScope() const;
-
-  Variable* Var(const std::string& name);
-
-  Variable* FindVar(const std::string& name) const;
-
-  Variable* FindLocalVar(const std::string& name) const;
-
-  const Scope* parent() const { return parent_; }
-
-  // Following the legacy scope interface.
-  std::vector<std::string> LocalVarNames() const;
-
-  /// ------------------------------------- helper functions for Tensor
-  /// ----------------------------------
-  // Create a Tensor variable. This will create a new Variable called `name`.
- Tensor* NewTensor(const std::string& name) { - auto* var = Var(name); - return var->GetMutable(); - } - - const Tensor* FindTensor(const std::string& name) { - auto* var = FindVar(name); - if (!var) return nullptr; - return &var->Get(); - } - - Tensor* FindMutableTensor(const std::string& name) { - auto* var = FindVar(name); - if (!var) return nullptr; - return var->GetMutable(); - } - - private: - // Scope in `kids_` are owned by this class. - mutable std::list kids_; - const Scope* parent_{nullptr}; - std::unordered_map> vars_; -}; - -} // namespace lite -} // namespace paddle diff --git a/lite/core/scope_test.cc b/lite/core/scope_test.cc deleted file mode 100644 index 8806e6b1c0..0000000000 --- a/lite/core/scope_test.cc +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/scope.h" -#include - -namespace paddle { -namespace lite { - -TEST(Scope, Var) { - Scope scope; - auto* x = scope.Var("x"); - *x->GetMutable() = 100; - - ASSERT_EQ(x->Get(), 100); -} - -TEST(Scope, FindVar) { - Scope scope; - ASSERT_FALSE(scope.FindVar("x")); - scope.Var("x"); - ASSERT_TRUE(scope.FindVar("x")); -} - -} // namespace lite -} // namespace paddle diff --git a/lite/core/target_wrapper.cc b/lite/core/target_wrapper.cc deleted file mode 100644 index 046336036b..0000000000 --- a/lite/core/target_wrapper.cc +++ /dev/null @@ -1,21 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/target_wrapper.h" -#include -#include "lite/utils/all.h" - -namespace paddle { -namespace lite {} // namespace lite -} // namespace paddle diff --git a/lite/core/target_wrapper.h b/lite/core/target_wrapper.h deleted file mode 100644 index aa7dd6cc12..0000000000 --- a/lite/core/target_wrapper.h +++ /dev/null @@ -1,170 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
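The deleted `Scope` is a straightforward parent-linked chain: `FindVar()` searches upward through parents, `FindLocalVar()` does not, and `NewScope()` hangs an owned child off `kids_`, which is why weights created in the root scope are visible from the execution scope but not vice versa. A toy restatement of the lookup rule, with stand-in types rather than `lite::Scope`:

```cpp
#include <cassert>
#include <map>
#include <memory>
#include <string>

struct Var {};

struct Scope {
  const Scope* parent{nullptr};
  std::map<std::string, std::unique_ptr<Var>> vars;

  Var* Local(const std::string& n) const {         // FindLocalVar()
    auto it = vars.find(n);
    return it == vars.end() ? nullptr : it->second.get();
  }
  Var* Find(const std::string& n) const {          // FindVar(): walks parents
    for (const Scope* s = this; s; s = s->parent)
      if (auto* v = s->Local(n)) return v;
    return nullptr;
  }
  Var* Create(const std::string& n) {              // Var(): create-or-reuse
    if (auto* v = Find(n)) return v;
    return (vars[n] = std::unique_ptr<Var>(new Var)).get();
  }
};

int main() {
  Scope root, child;
  child.parent = &root;
  root.Create("w");         // weight in the root scope
  assert(child.Find("w"));  // visible from the child (execution) scope
  child.Create("tmp");
  assert(!root.Find("tmp"));  // but lookups never descend into children
}
```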
-// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include -#include "lite/api/paddle_place.h" -#include "lite/utils/cp_logging.h" - -#ifdef LITE_WITH_CUDA -#include -#include -#endif // LITE_WITH_CUDA - -namespace paddle { -namespace lite { - -using lite_api::TargetType; -using lite_api::PrecisionType; -using lite_api::DataLayoutType; -using lite_api::PrecisionTypeLength; -using lite_api::TargetToStr; -using lite_api::Place; -using lite_api::PrecisionToStr; -using lite_api::DataLayoutToStr; -using lite_api::TargetRepr; -using lite_api::PrecisionRepr; -using lite_api::DataLayoutRepr; - -// Memory copy directions. -enum class IoDirection { - HtoH = 0, // Host to host - HtoD, // Host to device - DtoH, // Device to host - DtoD, // Device to device -}; - -// This interface should be specified by each kind of target. -template -class TargetWrapper { - public: - using stream_t = StreamTy; - using event_t = EventTy; - - static size_t num_devices() { return 0; } - static size_t maximum_stream() { return 0; } - - static void CreateStream(stream_t* stream) {} - static void DestroyStream(const stream_t& stream) {} - - static void CreateEvent(event_t* event) {} - static void DestroyEvent(const event_t& event) {} - - static void RecordEvent(const event_t& event) {} - static void SyncEvent(const event_t& event) {} - - static void StreamSync(const stream_t& stream) {} - - static void* Malloc(size_t size) { - LOG(FATAL) << "Unimplemented malloc for " << TargetToStr(Target); - return nullptr; - } - static void Free(void* ptr) { LOG(FATAL) << "Unimplemented"; } - - static void MemcpySync(void* dst, - const void* src, - size_t size, - IoDirection dir) { - LOG(FATAL) << "Unimplemented"; - } - static void MemcpyAsync(void* dst, - const void* src, - size_t size, - IoDirection dir, - const stream_t& stream) { - MemcpySync(dst, src, size, dir); - } -}; - -// This interface should be specified by each kind of target. 
-using TargetWrapperHost = TargetWrapper; -using TargetWrapperX86 = TargetWrapperHost; -template <> -class TargetWrapper { - public: - using stream_t = int; - using event_t = int; - - static size_t num_devices() { return 0; } - static size_t maximum_stream() { return 0; } - - static void CreateStream(stream_t* stream) {} - static void DestroyStream(const stream_t& stream) {} - - static void CreateEvent(event_t* event) {} - static void DestroyEvent(const event_t& event) {} - - static void RecordEvent(const event_t& event) {} - static void SyncEvent(const event_t& event) {} - - static void StreamSync(const stream_t& stream) {} - - static void* Malloc(size_t size); - static void Free(void* ptr); - - static void MemcpySync(void* dst, - const void* src, - size_t size, - IoDirection dir); - static void MemcpyAsync(void* dst, - const void* src, - size_t size, - IoDirection dir, - const stream_t& stream) { - MemcpySync(dst, src, size, dir); - } -}; - -#ifdef LITE_WITH_FPGA -template <> -class TargetWrapper { - public: - using stream_t = int; - using event_t = int; - - static size_t num_devices() { return 0; } - static size_t maximum_stream() { return 0; } - - static void CreateStream(stream_t* stream) {} - static void DestroyStream(const stream_t& stream) {} - - static void CreateEvent(event_t* event) {} - static void DestroyEvent(const event_t& event) {} - - static void RecordEvent(const event_t& event) {} - static void SyncEvent(const event_t& event) {} - - static void StreamSync(const stream_t& stream) {} - - static void* Malloc(size_t size); - static void Free(void* ptr); - - static void MemcpySync(void* dst, - const void* src, - size_t size, - IoDirection dir); - static void MemcpyAsync(void* dst, - const void* src, - size_t size, - IoDirection dir, - const stream_t& stream) { - MemcpySync(dst, src, size, dir); - } -}; -#endif - -} // namespace lite -} // namespace paddle diff --git a/lite/core/tensor.cc b/lite/core/tensor.cc deleted file mode 100644 index 4dd4f5319d..0000000000 --- a/lite/core/tensor.cc +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
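The host specialization above declares `Malloc`/`Free`/`MemcpySync` and defines them elsewhere in the tree; note that `MemcpyAsync` simply falls back to `MemcpySync` on targets without real streams. On the host, every `IoDirection` reduces to a plain `memcpy`, while device backends give each direction a real meaning (for example, a CUDA backend would map `HtoD` to a host-to-device copy). A toy analogue, assuming nothing beyond the C++ standard library:

```cpp
#include <cstring>
#include <iostream>

// Host-side stand-in for TargetWrapper's MemcpySync: the direction argument
// exists for API uniformity but is irrelevant when both sides are host memory.
enum class IoDirection { HtoH, HtoD, DtoH, DtoD };

void MemcpySync(void* dst, const void* src, size_t size, IoDirection) {
  std::memcpy(dst, src, size);
}

int main() {
  int src[4] = {1, 2, 3, 4}, dst[4] = {};
  MemcpySync(dst, src, sizeof(src), IoDirection::HtoH);
  std::cout << dst[3] << "\n";  // 4
}
```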
- -#ifndef LITE_WITH_FPGA - -#include "lite/core/tensor.h" -#include -#include "lite/utils/string.h" - -namespace paddle { -namespace lite { - -using value_type = int64_t; - -value_type DDimLite::production() const { - value_type res = 1; - for (size_t i = 0; i < this->size(); i++) { - res *= (*this)[i]; - } - return res; -} - -value_type DDimLite::count(int start, int end) const { - if (start < 0) { - start = 0; - } - if (end > size()) { - end = size(); - } - if (end < start) { - end = start; - } - value_type sum = 1; - for (auto i = start; i < end; ++i) { - sum *= data_[i]; - } - return sum; -} - -DDimLite DDimLite::Slice(int start, int end) const { - std::vector vec; - for (int i = start; i < end; i++) { - vec.push_back((*this)[i]); - } - return DDimLite(vec); -} - -std::string DDimLite::repr() const { - STL::stringstream ss; - if (empty()) { - ss << "{}"; - return ss.str(); - } - ss << "{"; - for (size_t i = 0; i < this->size() - 1; i++) { - ss << (*this)[i] << ","; - } - if (!this->empty()) ss << (*this)[size() - 1]; - ss << "}"; - return ss.str(); -} - -void TensorLite::ShareDataWith(const TensorLite &other) { - buffer_ = other.buffer_; - dims_ = other.dims_; - target_ = other.target_; - lod_ = other.lod_; - memory_size_ = other.memory_size_; -} - -void *TensorLite::mutable_data(size_t memory_size) { - memory_size_ = memory_size; - buffer_->ResetLazy(target_, memory_size_); - return buffer_->data(); -} - -void *TensorLite::mutable_data(TargetType target, size_t memory_size) { - target_ = target; - return mutable_data(memory_size); -} - -void TensorLite::CopyDataFrom(const TensorLite &other) { - dims_ = other.dims_; - target_ = other.target_; - lod_ = other.lod_; - memory_size_ = other.memory_size_; - buffer_->CopyDataFrom(*other.buffer_, memory_size_); -} - -// static LoD TensorLite::ToAbsOffset(const LoD &lod) { -// if (lod.empty() || lod.size() == 1) return lod; -// LoD ret = lod; -// for (int level = static_cast(lod.size()) - 2; level >= 0; --level) { -// for (size_t i = 0; i < lod[level].size(); ++i) { -// size_t index = lod[level][i]; -// result[level][i] = result[level + 1][index]; -// } -// } -//} - -} // namespace lite -} // namespace paddle - -#endif diff --git a/lite/core/tensor.h b/lite/core/tensor.h deleted file mode 100644 index aa4cb1b3c5..0000000000 --- a/lite/core/tensor.h +++ /dev/null @@ -1,249 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
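The `DDimLite` arithmetic deleted above is worth restating on a concrete shape: `production()` multiplies every extent, and `count(start, end)` multiplies the dims in `[start, end)` after clamping both bounds, so out-of-range arguments are safe rather than undefined behavior. A standalone re-implementation with the same semantics:

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

int64_t production(const std::vector<int64_t>& d) {
  int64_t res = 1;
  for (auto v : d) res *= v;
  return res;
}

int64_t count(const std::vector<int64_t>& d, int start, int end) {
  start = std::max(start, 0);                       // clamp as in DDimLite
  end = std::min(end, static_cast<int>(d.size()));
  if (end < start) end = start;
  int64_t sum = 1;
  for (int i = start; i < end; ++i) sum *= d[i];
  return sum;
}

int main() {
  std::vector<int64_t> dims = {2, 3, 4};
  std::cout << production(dims) << "\n";    // 24
  std::cout << count(dims, 1, 3) << "\n";   // 12 (= 3 * 4)
  std::cout << count(dims, 1, 99) << "\n";  // 12 (end clamped to size())
}
```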
- -#pragma once - -#ifdef LITE_WITH_FPGA -#include "lite/backends/fpga/lite_tensor.h" -#endif - -#ifndef LITE_WITH_FPGA - -#include -#include // for multiplies -#include -#include -#include -#include -#include "lite/core/memory.h" -#include "lite/utils/replace_stl/stream.h" - -namespace paddle { -namespace lite { - -class DDimLite; -class TensorLite; - -using DDim = lite::DDimLite; -using Tensor = lite::TensorLite; - -class DDimLite { - public: - using value_type = int64_t; - - DDimLite() = default; - - explicit DDimLite(const std::vector &x) { ConstructFrom(x); } - // DDimLite(std::initializer_list init_list) : - // DDimLite(std::vector(init_list)) {} - - void ConstructFrom(const std::vector &x) { data_ = x; } - - value_type operator[](int offset) const { return data_[offset]; } - value_type &operator[](int offset) { return data_[offset]; } - std::vector Vectorize() const { return data_; } - - size_t size() const { return data_.size(); } - bool empty() const { return data_.empty(); } - - value_type production() const; - - const std::vector &data() const { return data_; } - value_type count(int start, int end) const; - - DDimLite Slice(int start, int end) const; - - DDimLite Flatten2D(int col) const { - return DDimLite(std::vector( - {Slice(0, col).production(), Slice(col, size()).production()})); - } - - std::string repr() const; - - friend STL::ostream &operator<<(STL::ostream &os, const DDimLite &dims) { - os << dims.repr(); - return os; - } - - friend bool operator==(const DDimLite &a, const DDimLite &b) { - if (a.size() != b.size()) return false; - for (size_t i = 0; i < a.size(); i++) { - if (a[i] != b[i]) return false; - } - return true; - } - - friend bool operator!=(const DDimLite &a, const DDimLite &b) { - return !(a == b); - } - - private: - std::vector data_; -}; - -using LoD = std::vector>; - -// A light-weight tensor implementation. -class TensorLite { - public: - TensorLite() : buffer_(std::make_shared()) {} - - template - void Assign(DType *data, const DimT &dim) { - Resize(dim); - auto *dst = mutable_data(Target); - CopySync( - dst, data, dim.production() * sizeof(DType), IoDirection::HtoD); - } - - // T is the data type and R is the return type - // For OpenCL, the return type can be cl::Buffer - // and the data type can be float/int8_t. - // For other devices, T and R may be the same type. - template - const R *data() const { - return reinterpret_cast(static_cast(buffer_->data()) + - offset_); - } - - void Resize(const DDimLite &ddim) { dims_ = ddim; } - void Resize(const std::vector &x) { dims_ = DDimLite(x); } - - const DDimLite &dims() const { return dims_; } - int64_t numel() const { return dims_.production(); } - - const LoD &lod() const { return lod_; } - LoD *mutable_lod() { return &lod_; } - void set_lod(const LoD &lod) { lod_ = lod; } - - PrecisionType precision() const { return precision_; } - void set_precision(PrecisionType precision) { precision_ = precision; } - - bool persistable() const { return persistable_; } - void set_persistable(bool persistable) { persistable_ = persistable; } - - // T is the data type and R is the return type - // For OpenCL, the return type can be cl::Buffer - // and the data type can be float/int8_t. - // For other devices, T and R may be the same type. - template - R *mutable_data(); - - // T is the data type and R is the return type - // For OpenCL, the return type can be cl::Buffer - // and the data type can be float/int8_t. - // For other devices, T and R may be the same type. 
- template - R *mutable_data(TargetType target); - void *mutable_data(size_t memory_size); - void *mutable_data(TargetType target, size_t memory_size); - - const void *raw_data() const { - return static_cast( - (static_cast(buffer_->data()) + offset_)); - } - - size_t data_size() const { return this->dims().production(); } - - size_t memory_size() const { return memory_size_; } - - size_t offset() const { return offset_; } - - bool IsInitialized() const { return buffer_->data(); } - - // Other share data to this. - void ShareDataWith(const TensorLite &other); - - void CopyDataFrom(const TensorLite &other); - - TargetType target() const { return target_; } - - template - TensorLite Slice(int64_t begin, int64_t end) const; - - friend STL::ostream &operator<<(STL::ostream &os, const TensorLite &tensor) { - os << "Tensor:" << '\n'; - os << "dim: " << tensor.dims() << '\n'; - for (int i = 0; i < tensor.dims().production(); i++) { - os << tensor.template data()[i] << " "; - } - os << "\n"; - return os; - } - - private: - TargetType target_{TargetType::kHost}; - // precision_ and persistable_ are only used for persistable vars. - // If your tensor wants to be saved and loaded correctly, you must - // set values of precision_ and persistable_ after updating it. - // If your tensor is just a temp tensor, such as activations, - // you can ignore these two attributes. - PrecisionType precision_{PrecisionType::kUnk}; - bool persistable_{false}; - - DDimLite dims_; - std::shared_ptr buffer_; - LoD lod_; - size_t memory_size_{}; - - /// @brief Buffer may be shared with other tensors - size_t offset_{0}; -}; - -template -R *TensorLite::mutable_data() { - memory_size_ = dims_.production() * sizeof(T); - buffer_->ResetLazy(target_, memory_size_); - return reinterpret_cast(static_cast(buffer_->data()) + offset_); -} - -template -R *TensorLite::mutable_data(TargetType target) { - target_ = target; - memory_size_ = dims_.production() * sizeof(T); - buffer_->ResetLazy(target, memory_size()); - return reinterpret_cast(static_cast(buffer_->data()) + offset_); -} - -template -TensorLite TensorLite::Slice(int64_t begin, int64_t end) const { - CHECK_GE(begin, 0); - CHECK_LE(end, dims_[0]); - CHECK_LT(begin, end); - if (dims_[0] == 1) { - return *this; - } else { - int64_t base = numel() / dims_[0]; - TensorLite dst; - dst.buffer_ = buffer_; - dst.target_ = target_; - auto dst_dims = dims_; - dst_dims[0] = end - begin; - dst.Resize(dst_dims); - dst.offset_ = offset_ + static_cast(begin * base) * sizeof(T); - return dst; - } -} - -template -bool TensorCompareWith(const TensorT &a, const TensorT &b) { - if (a.dims() != b.dims()) return false; - if (memcmp(a.raw_data(), b.raw_data(), a.data_size()) != 0) return false; - return true; -} - -} // namespace lite -} // namespace paddle - -#endif diff --git a/lite/core/type_system.cc b/lite/core/type_system.cc deleted file mode 100644 index 276d0c4a34..0000000000 --- a/lite/core/type_system.cc +++ /dev/null @@ -1,157 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
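The `TensorLite::Slice<T>(begin, end)` defined above is zero-copy: the slice shares `buffer_` and only adjusts `dims_[0]` and the byte `offset_`. The offset arithmetic on a concrete example, assuming a float tensor shaped `{8, 3, 4}`:

```cpp
#include <cstdint>
#include <iostream>

// Slice(2, 5) on a float tensor {8, 3, 4}: new dims {3, 3, 4}, byte offset
// begin * (numel() / dims_[0]) * sizeof(T), no data copied.
int main() {
  int64_t dims0 = 8, rest = 3 * 4;        // shape {8, 3, 4}
  int64_t begin = 2, end = 5;
  int64_t base = (dims0 * rest) / dims0;  // numel() / dims_[0] == 12
  size_t offset = static_cast<size_t>(begin * base) * sizeof(float);
  std::cout << "slice dims {" << (end - begin) << ", 3, 4}, offset "
            << offset << " bytes\n";      // {3, 3, 4}, offset 96 bytes
}
```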
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/type_system.h" -#include "lite/utils/string.h" - -namespace paddle { -namespace lite { - -size_t ParamTypeRegistry::KernelIdTy::hash() const { - std::hash h; - size_t hash = h(kernel_type); - hash = hash_combine(hash, place.hash()); - hash = hash_combine(hash, std::hash()(static_cast(io))); - hash = hash_combine(hash, std::hash()(arg_name)); - return hash; -} - -STL::ostream &operator<<(STL::ostream &os, const Type &other) { - os << other.name(); - return os; -} - -// An map is used to maintain a global repo for types. We don't use -// MACROs with static variables for that the TypeSystem should only used in -// compile time, that is not performance sensitive, and a map-based way is -// easier to implement and maintain. -// -// The map is declared in each Type::GetXXX method other than in the Type class -// so that it will force to construct before any usage. - -const Type *Type::GetTensorTy(TargetType target, - PrecisionType precision, - DataLayoutType layout, - int device) { - static std::map type_repo; - // NOTE quite naive implementation here, but not performance sensitive. - DataType::ID type_id = DataType::ID::Tensor; - -#define HASH_ONE(x) v = hash_combine(v, hasher(static_cast(x))) - - std::hash hasher; - size_t v = hasher(static_cast(type_id)); - HASH_ONE(target); - HASH_ONE(precision); - HASH_ONE(layout); - HASH_ONE(device); -#undef HASH_ONE - - STL::stringstream name; - name << "Tensor<"; - name << TargetToStr(target) << ","; - name << PrecisionToStr(precision) << ","; - name << DataLayoutToStr(layout) << ","; - name << device; - name << ">"; - - if (!type_repo[v]) - // The Types should alive across the process life, no need to delete. - type_repo[v] = - new Type(type_id, name.str(), target, precision, layout, device); - return type_repo[v]; -} - -const Type *Type::GetTensorListTy(TargetType target, - PrecisionType precision, - DataLayoutType layout, - int device) { - static std::map type_repo; - DataType::ID type_id = DataType::ID::TensorList; - -#define HASH_ONE(x) v = hash_combine(v, hasher(static_cast(x))) - - std::hash hasher; - size_t v = hasher(static_cast(type_id)); - HASH_ONE(target); - HASH_ONE(precision); - HASH_ONE(layout); - HASH_ONE(device); -#undef HASH_ONE - - STL::stringstream name; - name << "TensorList<"; - name << TargetToStr(target) << ","; - name << PrecisionToStr(precision) << ","; - name << DataLayoutToStr(layout) << ","; - name << device; - name << ">"; - - if (!type_repo[v]) - // The Types should alive across the process life, no need to delete. 
- type_repo[v] = - new Type(type_id, name.str(), target, precision, layout, device); - return type_repo[v]; -} - -const Type *Type::GetUnsupportedTy() { - static std::map type_repo; - std::hash hasher; - size_t v = hasher(static_cast(DataType::ID::Unsupported)); - if (!type_repo[v]) - type_repo[v] = new Type(DataType::ID::Unsupported, - "Unsupported", - TARGET(kUnk), - PRECISION(kUnk), - DATALAYOUT(kUnk), - -1); - return type_repo[v]; -} - -const Type *Type::GetVoidTy() { - static std::map type_repo; - std::hash hasher; - size_t v = hasher(static_cast(DataType::ID::Void)); - if (!type_repo[v]) - type_repo[v] = new Type(DataType::ID::Void, - "Void", - TARGET(kAny), - PRECISION(kAny), - DATALAYOUT(kAny), - -1); - return type_repo[v]; -} - -const Type *Type::Get(DataType::ID type_id, - TargetType target, - PrecisionType precision, - DataLayoutType layout, - int device) { - switch (type_id) { - case DataType::ID::Void: - return GetVoidTy(); - case DataType::ID::Unsupported: - return GetUnsupportedTy(); - case DataType::ID::Tensor: - return GetTensorTy(target, precision, layout, device); - case DataType::ID::TensorList: - return GetTensorListTy(target, precision, layout, device); - default: - LOG(FATAL) << "Unknown Type found"; - return nullptr; - } -} - -} // namespace lite -} // namespace paddle diff --git a/lite/core/type_system.h b/lite/core/type_system.h deleted file mode 100644 index 722cdca0eb..0000000000 --- a/lite/core/type_system.h +++ /dev/null @@ -1,390 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -// This file contains the file system of the lite system. Every data type in -// Variable should be registered here, and the analysis phase will check the -// data type correction. -// This mechanism is made for keeping our system simpler and more stable, for -// the dubious typed Variables in the Operators' inputs and outputs are disaster -// for analysis and runtime. - -#include -#include -#include -#include -#include -#include -#include "lite/core/tensor.h" -#include "lite/utils/all.h" - -namespace paddle { -namespace lite { - -// Type is the definition of all the types that supported by the Variable that -// represents as the input and output of an operator or kernel. -// The DNN system is simple, just a list of operators, and the architecture -// can not process that many data types as a compiler, or that will turn out to -// a chaos. -// -// We should make sure that the supported data types be registered here, and -// keep the set small and avoid using some special data types as op's -// inputs or outputs, such as some runtime cache, those types can't be processed -// by the MIR. -// -// A tensor with different places(target, precision, data layout or device) -// should be treated as different types. Different types might be compatible -// with each other, for example, the `VoidTy` means any type, so any other types -// can be treated as a `VoidTy`. 
-// -// The Different Types can transform to others by adding some special -// transforming operators, for example, a DataLayoutTransformOp can convert a -// `TensorFp32NCHWTy` to a `TensorFp32NHWCTy`; a IoCopyOp can convert a -// `TensorFp32NCHWTy(kHost)` to `TensorFp32NCHWTy(kCUDA)`. There are many other -// convertions between different Types, but there are some unsupported type -// convertions, for example, there is noway to convert a `UnsupportedTy` to a -// `TensorAnyTy`. -// -// We use Types to declare the definition of a kernel, each inputs' and outputs' -// arguments have a specific Types. -// -// REGISTER_LITE_KERNEL(mul, kHost, kFloat, -// paddle::lite::kernels::host::MulCompute, def) -// .BindInput("X", {paddle::lite::Type::Get( -// TARGET(kHost))}) -// .BindInput("Y", {paddle::lite::Type::Get( -// TARGET(kHost))}) -// .BindOutput("Out", -// {paddle::lite::Type::Get(TARGET(kHost))}) -// .Finalize(); -// -// The above definition will be used in MIR by Type inference and uncompatible -// types check. -// -// TODO(Superjomn) Add operator/kernel-wise static checking to avoid unsupported -// type mixed in the system. -class DataType { - public: - // The Void type can cast to any other type. - // The Unsupported is the data type that developed include in the system, for - // example, some `std::set` is used as input of some operator. It wan't be - // analyzed or optimized by the system, that way results in many bugs in - // previous system, so it should be avoided. - enum class ID : int { - Void = 0, // unknown type that can be cast to any data type. - Unsupported, // Unsupported data type that will not be analyzed. - // Tensor_Any represents a Tensor with any place, data, layout. It is used - // in some IO kernels those doesn't care the data. - Tensor, - // A tensor list, but all the elements should have the same type. - TensorList, - // --------- - NumTypes, // Must remains as last defined ID. - }; - - ID id() const { return id_; } - - // type check. - bool IsVoid() const { return id_ == ID::Void; } - bool IsUnsupported() const { return id_ == ID::Unsupported; } - bool IsTensor() const { return id_ == ID::Tensor; } - bool IsTensorList() const { return id_ == ID::TensorList; } - // Get number of types. - int num_types() const { return static_cast(ID::NumTypes); } - - protected: - // Can only extended by subclass. - explicit DataType(ID id) : id_(id) {} - - ID id_{ID::Unsupported}; -}; - -/* - * Datatype with device info considered. - * NOTE A Type with different device is treated as different DeviceDataType. - */ -class Type : public DataType { - public: - // Can cast to another type. This is heavily used in MIR, by determine whether - // is is possible to add a statement to transform a type to another. - virtual bool TypeCastable(const Type& type) const { return id_ == type.id(); } - - /// Get a Tensor type. - static const Type* GetTensorTy(TargetType target, - PrecisionType precision = PRECISION(kFloat), - DataLayoutType layout = DATALAYOUT(kNCHW), - int device = 0); - /// Get a TensorList type. - static const Type* GetTensorListTy( - TargetType target, - PrecisionType precision = PRECISION(kFloat), - DataLayoutType layout = DATALAYOUT(kNCHW), - int device = 0); - /// Get an Unsupported type. - static const Type* GetUnsupportedTy(); - /// Get an Void type. 
- static const Type* GetVoidTy(); - - static const Type* Get(DataType::ID type_id, - TargetType target = TARGET(kUnk), - PrecisionType precision = PRECISION(kUnk), - DataLayoutType layout = DATALAYOUT(kUnk), - int device = 0); - - TargetType target() const { return place_.target; } - PrecisionType precision() const { return place_.precision; } - DataLayoutType layout() const { return place_.layout; } - int16_t device() const { return place().device; } - const Place& place() const { return place_; } - const std::string& name() const { return name_; } - - bool operator==(const Type& other) { - return id_ == other.id() && place_ == other.place(); - } - friend STL::ostream& operator<<(STL::ostream& os, const Type& other); - - virtual ~Type() = default; - - protected: - /// One should avoid using this construct. - Type(ID id, - const std::string& name, - TargetType target = TargetType::kHost, - PrecisionType precision = PrecisionType::kFloat, - DataLayoutType layout = DataLayoutType::kNCHW, - int16_t device = 0) - : DataType(id), place_{target, precision, layout, device}, name_(name) {} - - Place place_; - const std::string name_; -}; - -// -------------------------------- compatible check --------------------------- -static bool TargetCompatibleTo(const Type& a, const Type& b) { - auto is_host = [](TargetType x) -> bool { - return x == TARGET(kHost) || x == TARGET(kX86) || x == TARGET(kARM); - }; - if (a.IsVoid() || b.IsVoid()) return true; - if (a.IsTensor() || b.IsTensor()) { - if (a.IsTensor() && b.IsTensor()) { - return is_host(a.target()) ? is_host(b.target()) - : a.target() == b.target(); - } - return false; - } - return true; -} - -static bool DataLayoutCompatibleTo(const Type& a, const Type& b) { - return a.IsVoid() || // - ((a.layout() == b.layout() || // - b.layout() == DATALAYOUT(kAny))); -} -static bool DataLayoutCompatible(const Type& a, const Type& b) { - return a.IsVoid() || b.IsVoid() || // - ((a.layout() == b.layout() || // - b.layout() == DATALAYOUT(kAny) || - a.layout() == DATALAYOUT(kAny))); -} - -static bool PrecisionCompatibleTo(const Type& a, const Type& b) { - return a.IsVoid() || // - (((a.IsTensor() && b.IsTensor()) || - (a.IsTensorList() && b.IsTensorList())) && - (a.precision() == b.precision() || // - b.precision() == PRECISION(kAny) || - a.precision() == PRECISION(kAny))); -} -static bool PrecisionCompatible(const Type& a, const Type& b) { - return a.IsVoid() || b.IsVoid() || // - (a.IsTensor() && b.IsTensor() && (a.precision() == b.precision() || // - b.precision() == PRECISION(kAny) || - a.precision() == PRECISION(kAny))); -} - -static bool DeviceCompatibleTo(const Type& a, const Type& b) { - return a.IsVoid() || // - (a.IsTensor() && b.IsTensor() && (a.device() == b.device())); -} - -// Can type 'a' be passed to 'b' directly. -static bool TypeCompatibleTo(const Type& a, const Type& b) { - return TargetCompatibleTo(a, b) && DataLayoutCompatibleTo(a, b) && - PrecisionCompatibleTo(a, b) && DeviceCompatibleTo(a, b); -} -static bool TypeCompatible(const Type& a, const Type& b) { - return TargetCompatibleTo(a, b) && DataLayoutCompatible(a, b) && - PrecisionCompatible(a, b) && DeviceCompatibleTo(a, b); -} - -/* - * ParamType is used to represent a data type of a parameter for the kernel. It - * can represent any Variable data type. - * The element_type_hash is the hash code of the element, it should be - * registered in the `TypeSystem`. 
- */ -struct ParamType { - const Type* type; - - ParamType() = default; - ParamType(const Type* type) : type(type) {} // NOLINT - - std::string DebugString() const { return type->name(); } -}; - -/* - * The data types of kernel parameters. It is used to track the type of kernel's - * inputs and outputs. - */ -struct ParamTypeRecorder { - std::map inputs; - std::map outputs; - - void RegisterInputType(const std::string& arg_name, const ParamType& type) { - Register(&inputs, arg_name, type); - } - - void RegisterOutputType(const std::string& arg_name, const ParamType& type) { - Register(&outputs, arg_name, type); - } - - private: - void Register(std::map* ts, - const std::string& arg_name, - ParamType type) { - (*ts)[arg_name] = type; - } -}; - -/* - * The ParamTypeRegistry help register the input and output data types for all - * the kernels. It is made singleton so that all the objects of the same kernel - * can share the same information. - * - * Usage: - * for register a kernel for FC operator. - * ParamTypeRegistry::Global().Register( - * "fc", {TARGET(kCUDA), PRECISION(kFloat)}, 0, - * {typeid(Tensor), {TARGET(kCUDA)}}); - */ -class ParamTypeRegistry { - public: - enum class IO : int { kInput = 0, kOutput }; - - template - /* - * Helper class for registering a ParamType for a Kernel. - * Usage: - * - * NewInstance("fc") - * .BindInput(0, {typeid(Tensor).hash_code(), {TARGET(kHost)}) - * .BindInput(1, {typeid(Tensor).hash_code(), {TARGET(kHost), - * PRECISION(kFloat)}); - */ - struct NewInstance { - explicit NewInstance(const std::string& kernel_type) - : kernel_type_(kernel_type) {} - - NewInstance& BindInput(const std::string& arg_name, - const ParamType& ptype) { - ParamTypeRegistry::Global().Register( - kernel_type_, Place{target, precision, layout}, arg_name, ptype); - return *this; - } - NewInstance& BindOutput(const std::string& arg_name, - const ParamType& ptype) { - ParamTypeRegistry::Global().Register( - kernel_type_, Place{target, precision, layout}, arg_name, ptype); - return *this; - } - - bool Finalize() { return true; } - - private: - std::string kernel_type_; - }; - - template - void Register(const std::string& kernel_type, - const Place& place, - const std::string& arg_name, - ParamType data_type) { - KernelIdTy key{kernel_type, place, io, arg_name}; - types_[key] = data_type; - CHECK(types_.count(key)); - } - - const ParamType* RetrieveInArgument(const Place& place, - const std::string& op_type, - const std::string& arg_name) { - return Retrieve(place, op_type, arg_name); - } - const ParamType* RetrieveOutArgument(const Place& place, - const std::string& op_type, - const std::string& arg_name) { - return Retrieve(place, op_type, arg_name); - } - - static ParamTypeRegistry& Global() { - static ParamTypeRegistry x; - return x; - } - - friend STL::ostream& operator<<(STL::ostream& os, - const ParamTypeRegistry& other) { - for (auto& item : other.types_) { - os << item.first << " " << item.second.DebugString() << "\n"; - } - return os; - } - - protected: - template - const ParamType* Retrieve(const Place& place, - const std::string& op_type, - const std::string& arg_name) { - KernelIdTy key{op_type, place, io, arg_name}; - auto it = types_.find(key); - if (it == types_.end()) return nullptr; - return &it->second; - } - - private: - ParamTypeRegistry() = default; - - public: - // Identification for a Kernel. 
- struct KernelIdTy { - std::string kernel_type; - Place place; - IO io; - std::string arg_name; - - size_t hash() const; - friend STL::ostream& operator<<(STL::ostream& os, const KernelIdTy& other); - }; - - using key_t = KernelIdTy; - struct KeyCmp { - bool operator()(const key_t& a, const key_t& b) const; - }; - - private: - std::map types_; -}; - -} // namespace lite -} // namespace paddle diff --git a/lite/core/type_system_test.cc b/lite/core/type_system_test.cc deleted file mode 100644 index 224a779fcb..0000000000 --- a/lite/core/type_system_test.cc +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/type_system.h" -#include - -namespace paddle { -namespace lite { - -TEST(TypeSystem, CheckDuplicateGet) { - auto* tensor_ty = - Type::GetTensorTy(TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); - auto* tensor_ty1 = - Type::GetTensorTy(TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); - - ASSERT_EQ(tensor_ty, tensor_ty1); - - ASSERT_EQ(tensor_ty->target(), TARGET(kHost)); - ASSERT_EQ(tensor_ty->precision(), PRECISION(kFloat)); - ASSERT_EQ(tensor_ty->layout(), DATALAYOUT(kNCHW)); -} - -} // namespace lite -} // namespace paddle diff --git a/lite/core/types.cc b/lite/core/types.cc deleted file mode 100644 index ec89e83e58..0000000000 --- a/lite/core/types.cc +++ /dev/null @@ -1,95 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
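`type_system_test.cc` above checks exactly the property `Type::GetTensorTy()` is built around: each descriptor is interned in a function-local static map, so equal descriptors yield the same pointer and types can be compared by identity. The scheme in miniature, with a toy string key rather than the real hash-combined descriptor:

```cpp
#include <cassert>
#include <map>
#include <string>

struct Ty {
  std::string name;
};

// One canonical Ty per key; entries deliberately live for the whole process,
// matching the "no need to delete" comment in the original.
const Ty* GetTy(const std::string& key) {
  static std::map<std::string, const Ty*> repo;
  auto& slot = repo[key];
  if (!slot) slot = new Ty{key};
  return slot;
}

int main() {
  assert(GetTy("Tensor<kHost,kFloat,kNCHW>") ==
         GetTy("Tensor<kHost,kFloat,kNCHW>"));  // same pointer, as in the test
  assert(GetTy("Tensor<kHost,kFloat,kNCHW>") !=
         GetTy("Tensor<kARM,kFloat,kNCHW>"));   // distinct place, distinct type
}
```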
- -#include "lite/core/types.h" - -namespace paddle { -namespace lite { -namespace core { - -KernelPickFactor& KernelPickFactor::ConsiderDataLayout() { - data_ |= static_cast(Factor::DataLayoutFirst); - return *this; -} -KernelPickFactor& KernelPickFactor::ConsiderPrecision() { - data_ |= static_cast(Factor::PrecisionFirst); - return *this; -} -KernelPickFactor& KernelPickFactor::ConsiderTarget() { - data_ |= static_cast(Factor::TargetFirst); - return *this; -} -KernelPickFactor& KernelPickFactor::ConsiderDevice() { - data_ |= static_cast(Factor::DeviceFirst); - return *this; -} -bool KernelPickFactor::IsPrecisionConsidered() const { - return data_ & static_cast(Factor::PrecisionFirst); -} -bool KernelPickFactor::IsTargetConsidered() const { - return data_ & static_cast(Factor::TargetFirst); -} -bool KernelPickFactor::IsDataLayoutConsidered() const { - return data_ & static_cast(Factor::DataLayoutFirst); -} -bool KernelPickFactor::IsDeviceConsidered() const { - return data_ & static_cast(Factor::DeviceFirst); -} - -STL::ostream& operator<<(STL::ostream& os, const KernelPickFactor& k) { - std::stack bits; - auto data = k.data_; - while (data) { - bits.push(data % 2); - data /= 2; - } - int nbits = bits.size(); - for (size_t i = 0; i < sizeof(data) * 8 - nbits; i++) { - os << 0; - } - while (!bits.empty()) { - os << bits.top(); - bits.pop(); - } - return os; -} - -template <> -Type StdTypeToRepr() { - return Type::_int32; -} -template <> -Type StdTypeToRepr() { - return Type::_int64; -} -template <> -Type StdTypeToRepr() { - return Type::_float32; -} -template <> -Type StdTypeToRepr() { - return Type::_float64; -} -template <> -Type StdTypeToRepr() { - return Type::_string; -} -template <> -Type StdTypeToRepr() { - return Type::_bool; -} - -} // namespace core -} // namespace lite -} // namespace paddle diff --git a/lite/core/types.h b/lite/core/types.h deleted file mode 100644 index efb8a096e5..0000000000 --- a/lite/core/types.h +++ /dev/null @@ -1,147 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include "lite/api/paddle_place.h" -#include "lite/utils/all.h" - -namespace paddle { -namespace lite { -namespace core { - -/* - * Type representations used to represent standard types. - */ -// TODO(Superjomn) unify all the type representation across the lite framework. -enum class Type { - _unk = -1, - // primary types - _int32, - _int64, - _float32, - _float64, - _bool, - _string, - // primary list types - _list, - // enum type - _enum, - _float16, - // number of types - __num__, -}; - -enum class FluidType { - // Pod Types - BOOL = 0, - INT16 = 1, - INT32 = 2, - INT64 = 3, - FP16 = 4, - FP32 = 5, - FP64 = 6, - // Tensor is used in C++. 
- SIZE_T = 19, - UINT8 = 20, - INT8 = 21, - - // Other types that may need additional descriptions - LOD_TENSOR = 7, - SELECTED_ROWS = 8, - FEED_MINIBATCH = 9, - FETCH_LIST = 10, - STEP_SCOPES = 11, - LOD_RANK_TABLE = 12, - LOD_TENSOR_ARRAY = 13, - PLACE_LIST = 14, - READER = 15, - // Any runtime decided variable type is raw - // raw variables should manage their own allocations - // in operators like nccl_op - RAW = 17, - TUPLE = 18, -}; - -template -Type StdTypeToRepr() { - return Type::_unk; -} -template <> -Type StdTypeToRepr(); -template <> -Type StdTypeToRepr(); -template <> -Type StdTypeToRepr(); -template <> -Type StdTypeToRepr(); -template <> -Type StdTypeToRepr(); - -// Factors that impact the kernel picking strategy. Multiple factors can be -// considered together by using statement like 'factor1 | factor2' -class KernelPickFactor { - public: - using value_type = unsigned char; - enum class Factor : int { - // The following factors are sorted by priority. - TargetFirst = 1, - PrecisionFirst = 1 << 1, - DataLayoutFirst = 1 << 2, - DeviceFirst = 1 << 3, - }; - - // Has any factors considered. - bool any_factor_considered() const { return data_; } - - KernelPickFactor& ConsiderTarget(); - // Prefer a specific target, e.g. prefer CUDA kernels. - KernelPickFactor& ConsiderPrecision(); - KernelPickFactor& ConsiderDataLayout(); - KernelPickFactor& ConsiderDevice(); - - bool IsTargetConsidered() const; - bool IsPrecisionConsidered() const; - bool IsDataLayoutConsidered() const; - bool IsDeviceConsidered() const; - - friend STL::ostream& operator<<(STL::ostream& os, const KernelPickFactor& k); - - private: - unsigned char data_{}; - lite_api::TargetType target_{TARGET(kUnk)}; -}; - -struct dim2 { - int x{}; - int y{}; - - dim2(int x, int y) : x(x), y(y) {} -}; - -struct dim3 { - int x{}; - int y{}; - int z{}; - - dim3(int x, int y, int z) : x(x), y(y), z(z) {} -}; - -using byte_t = uint8_t; - -} // namespace core -} // namespace lite -} // namespace paddle diff --git a/lite/core/types_test.cc b/lite/core/types_test.cc deleted file mode 100644 index 9b7e5b6f05..0000000000 --- a/lite/core/types_test.cc +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
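Before the tests that follow, it helps to restate how `KernelPickFactor` above packs its decisions: each `Consider*()` call ORs one priority bit into `data_`, and the matching `Is*Considered()` accessor tests that bit, so factors compose freely. A standalone sketch with the same bit layout (enum values copied from the deleted header):

```cpp
#include <iostream>

enum Factor : unsigned char {
  TargetFirst = 1,
  PrecisionFirst = 1 << 1,
  DataLayoutFirst = 1 << 2,
  DeviceFirst = 1 << 3,
};

int main() {
  unsigned char data = 0;
  data |= TargetFirst;     // KernelPickFactor().ConsiderTarget()
  data |= PrecisionFirst;  //                   .ConsiderPrecision()
  std::cout << ((data & TargetFirst) != 0) << "\n";      // 1: considered
  std::cout << ((data & DataLayoutFirst) != 0) << "\n";  // 0: not considered
}
```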
- -#include "lite/core/types.h" -#include - -namespace paddle { -namespace lite { -namespace core { - -TEST(KernelPickFactor, Default) { - KernelPickFactor factor; - ASSERT_FALSE(factor.IsTargetConsidered()); - ASSERT_FALSE(factor.IsPrecisionConsidered()); - ASSERT_FALSE(factor.IsDataLayoutConsidered()); -} - -TEST(KernelPickFactor, Set) { - KernelPickFactor factor; - factor.ConsiderTarget(); - ASSERT_TRUE(factor.IsTargetConsidered()); - factor.ConsiderPrecision(); - ASSERT_TRUE(factor.IsPrecisionConsidered()); - factor.ConsiderDataLayout(); - ASSERT_TRUE(factor.IsDataLayoutConsidered()); - - LOG(INFO) << "factor " << factor; -} - -} // namespace core -} // namespace lite -} // namespace paddle diff --git a/lite/core/variable.cc b/lite/core/variable.cc deleted file mode 100644 index a344da63f1..0000000000 --- a/lite/core/variable.cc +++ /dev/null @@ -1,19 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/variable.h" - -namespace paddle { -namespace lite {} // namespace lite -} // namespace paddle diff --git a/lite/core/variable.h b/lite/core/variable.h deleted file mode 100644 index 2c1e737a93..0000000000 --- a/lite/core/variable.h +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include -#include "lite/core/tensor.h" -#include "lite/utils/all.h" - -namespace paddle { -namespace lite { - -using FeedFetchList = std::vector; - -class Variable { - public: - template - const T& Get() const { - return blob_.get(); - } - - template - T* GetMutable() { - if (!blob_.is()) blob_.set(); - return blob_.get_mutable(); - } - - template - bool IsType() { - return blob_.type() == typeid(T).hash_code(); - } - - private: - // variant blob_; - variant> - blob_; -}; - -} // namespace lite -} // namespace paddle diff --git a/lite/core/workspace.cc b/lite/core/workspace.cc deleted file mode 100644 index 196536f955..0000000000 --- a/lite/core/workspace.cc +++ /dev/null @@ -1,15 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/core/workspace.h" diff --git a/lite/core/workspace.h b/lite/core/workspace.h deleted file mode 100644 index 117b80aaa7..0000000000 --- a/lite/core/workspace.h +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include "lite/core/memory.h" -#include "lite/core/types.h" -#include "lite/utils/macros.h" - -namespace paddle { -namespace lite { - -/* - * WorkSpace is a container that help to manage the temporary memory that are - * shared across kernels during the serial execution. - * - * Due to the mobile library size limit, a complex allocator or GC algorithm is - * not suitable here, one need to carefully manage the workspace inside a single - * kernel. - * - * NOTE - * - * For kernel developers, one need to call the workspace as follows: - * - * - call `WorkSpace::Global().Alloc()` if needed to allocate some temporary - * buffer. - */ -class WorkSpace { - public: - // Reset the workspace, and treat the workspace as empty. - void AllocReset() { cursor_ = 0; } - - // Allocate a memory buffer. 
-  core::byte_t* Alloc(size_t size) {
-    buffer_.ResetLazy(target_, cursor_ + size);
-    auto* data = static_cast<core::byte_t*>(buffer_.data()) + cursor_;
-    cursor_ += size;
-    return data;
-  }
-
-  static WorkSpace& Global_Host() {
-    thread_local std::unique_ptr<WorkSpace> x(new WorkSpace(TARGET(kHost)));
-    return *x;
-  }
-
-#if defined(LITE_WITH_X86)
-  static WorkSpace& Global_X86() { return Global_Host(); }
-#endif
-
-#if defined(LITE_WITH_ARM)
-  static WorkSpace& Global_ARM() { return Global_Host(); }
-#endif
-
-#if defined(LITE_WITH_CUDA)
-  static WorkSpace& Global_CUDA() {
-    thread_local std::unique_ptr<WorkSpace> x(new WorkSpace(TARGET(kCUDA)));
-    return *x;
-  }
-#endif
-
- private:
-  explicit WorkSpace(TargetType x) : target_(x) {}
-
-  TargetType target_;
-  Buffer buffer_;
-  size_t cursor_;
-
-  DISALLOW_COPY_AND_ASSIGN(WorkSpace);
-};
-
-}  // namespace lite
-}  // namespace paddle
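As the NOTE in the deleted header says, kernels carve temporary buffers out of a shared per-thread arena instead of allocating ad hoc. A minimal sketch of that pattern, using only names from the header above (`Global_Host`, `AllocReset`, `Alloc`); the two buffer sizes are placeholders:

```cpp
#include "lite/core/workspace.h"

// Sketch only: reset the cursor at the start of a kernel run, then take
// scratch buffers from the per-thread host workspace. ResetLazy grows the
// backing Buffer only when cursor_ + size exceeds the current capacity.
void KernelScratchSketch(size_t im2col_bytes, size_t acc_bytes) {
  auto& ws = paddle::lite::WorkSpace::Global_Host();
  ws.AllocReset();  // treat the workspace as empty for this run

  auto* im2col_buf = ws.Alloc(im2col_bytes);  // cursor advances by im2col_bytes
  auto* acc_buf = ws.Alloc(acc_bytes);        // second buffer follows the first
  (void)im2col_buf;
  (void)acc_buf;
}
```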
diff --git a/lite/demo/cxx/Makefile.def b/lite/demo/cxx/Makefile.def
deleted file mode 100644
index f0a0ec1dcb..0000000000
--- a/lite/demo/cxx/Makefile.def
+++ /dev/null
@@ -1,35 +0,0 @@
-CXX_DEFINES = -DARM_WITH_OMP -DHPPL_STUB_FUNC -DLITE_WITH_ARM -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK \
-              -DLITE_WITH_LINUX -DPADDLE_DISABLE_PROFILER -DPADDLE_NO_PYTHON -DPADDLE_WITH_TESTING
-LDFLAGS = -latomic -pthread -ldl
-
-SYSROOT_COMPILE = --sysroot=/opt/android-ndk-r17c/sysroot
-
-THIRD_PARTY_LIBS = ../../../third_party/gflags/lib/libgflags.a
-
-SYSTEM_INCLUDES = -I/opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/include \
-                  -I/opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++abi/include \
-                  -I/opt/android-ndk-r17c/sources/android/support/include \
-                  -I/opt/android-ndk-r17c/sysroot/usr/include
-
-THIRD_PARTY_INCLUDES = -I../../../third_party/gflags/include
-
-ifeq ($(ARM_ABI), arm8)
-    CC = /opt/android-ndk-r17c/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-g++
-    CXX_FLAGS = -funwind-tables -no-canonical-prefixes -D__ANDROID_API__=22 -fexceptions -frtti -std=c++11 -fopenmp -O3 -DNDEBUG -fPIE
-    CXXFLAGS_LINK = $(CXX_FLAGS) -pie -Wl,--gc-sections
-    SYSROOT_LINK = --sysroot=/opt/android-ndk-r17c/platforms/android-24/arch-arm64
-    SYSTEM_LIBS = /opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/libs/arm64-v8a/libc++_static.a \
-                  /opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/libs/arm64-v8a/libc++abi.a
-    INCLUDES = $(SYSTEM_INCLUDES) -I/opt/android-ndk-r17c/sysroot/usr/include/aarch64-linux-android $(THIRD_PARTY_INCLUDES)
-else
-    CC = /opt/android-ndk-r17c/toolchains/arm-linux-androideabi-4.9/prebuilt/linux-x86_64/bin/arm-linux-androideabi-g++
-    CXX_FLAGS = -march=armv7-a -mthumb -mfpu=neon -mfloat-abi=softfp -funwind-tables -no-canonical-prefixes \
-                -D__ANDROID_API__=22 -fexceptions -frtti -std=c++11 -fopenmp -O3 -DNDEBUG -fPIE
-    CXXFLAGS_LINK = $(CXX_FLAGS) -pie -Wl,--fix-cortex-a8 -Wl,--gc-sections -Wl,-z,nocopyreloc
-    SYSROOT_LINK = --sysroot=/opt/android-ndk-r17c/platforms/android-22/arch-arm
-    SYSTEM_LIBS = /opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/libs/armeabi-v7a/libc++_static.a \
-                  /opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/libs/armeabi-v7a/libc++abi.a \
-                  /opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/libs/armeabi-v7a/libandroid_support.a \
-                  /opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/libs/armeabi-v7a/libunwind.a
-    INCLUDES = $(SYSTEM_INCLUDES) -I/opt/android-ndk-r17c/sysroot/usr/include/arm-linux-androideabi $(THIRD_PARTY_INCLUDES)
-endif
diff --git a/lite/demo/cxx/README.md b/lite/demo/cxx/README.md
deleted file mode 100644
index ec72c044e3..0000000000
--- a/lite/demo/cxx/README.md
+++ /dev/null
@@ -1,42 +0,0 @@
-# C++ Demo
-1. Build the docker image from `lite/tools/Dockerfile.mobile`.
-2. Start and enter the docker container, then run `wget http://paddle-inference-dist.bj.bcebos.com/lite_release/r0.1/inference_lite_lib.android.armv8.tar.gz` to download the demo environment. (For the armv7 demo, download `wget http://paddle-inference-dist.bj.bcebos.com/lite_release/r0.1/inference_lite_lib.android.armv7.tar.gz` instead.)
-3. Unpack the downloaded archive: `tar zxvf inference_lite_lib.android.armv8.tar.gz`
-4. Prepare the emulator environment with the following commands:
-```shell
-# armv8
-adb kill-server
-adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done
-echo n | avdmanager create avd -f -n paddle-armv8 -k "system-images;android-24;google_apis;arm64-v8a"
-echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv8 -noaudio -no-window -gpu off -port 5554 &
-sleep 1m
-```
-```shell
-# armv7
-adb kill-server
-adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done
-echo n | avdmanager create avd -f -n paddle-armv7 -k "system-images;android-24;google_apis;armeabi-v7a"
-echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv7 -noaudio -no-window -gpu off -port 5554 &
-sleep 1m
-```
-5. Prepare the model, then build and run the full-API demo:
-```shell
-cd inference_lite_lib.android.armv8/demo/cxx/mobile_full
-wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz
-tar zxvf mobilenet_v1.tar.gz
-make
-adb -s emulator-5554 push mobilenet_v1 /data/local/tmp/
-adb -s emulator-5554 push mobilenetv1_full_api /data/local/tmp/
-adb -s emulator-5554 shell chmod +x /data/local/tmp/mobilenetv1_full_api
-adb -s emulator-5554 shell "/data/local/tmp/mobilenetv1_full_api --model_dir=/data/local/tmp/mobilenet_v1 --optimized_model_dir=/data/local/tmp/mobilenet_v1.opt"
-```
-On success, the console prints the predicted probabilities for the top 10 classes.
-
-6. Build and run the light-API demo:
-```shell
-cd ../mobile_light
-make
-adb -s emulator-5554 push mobilenetv1_light_api /data/local/tmp/
-adb -s emulator-5554 shell chmod +x /data/local/tmp/mobilenetv1_light_api
-adb -s emulator-5554 shell "/data/local/tmp/mobilenetv1_light_api --model_dir=/data/local/tmp/mobilenet_v1.opt"
-```
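Step 5 above spells out only the armv8 package; the armv7 package from step 2 follows the same flow. A sketch, assuming the armv7 tarball unpacks to the matching `inference_lite_lib.android.armv7` directory and the armv7 emulator from step 4 is running:

```shell
cd inference_lite_lib.android.armv7/demo/cxx/mobile_full
wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz
tar zxvf mobilenet_v1.tar.gz
make
adb -s emulator-5554 push mobilenet_v1 /data/local/tmp/
adb -s emulator-5554 push mobilenetv1_full_api /data/local/tmp/
adb -s emulator-5554 shell chmod +x /data/local/tmp/mobilenetv1_full_api
adb -s emulator-5554 shell "/data/local/tmp/mobilenetv1_full_api --model_dir=/data/local/tmp/mobilenet_v1 --optimized_model_dir=/data/local/tmp/mobilenet_v1.opt"
```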
diff --git a/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv7 b/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv7
deleted file mode 100644
index f795b41d46..0000000000
--- a/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv7
+++ /dev/null
@@ -1,22 +0,0 @@
-ARM_ABI = arm7
-export ARM_ABI
-
-include ../Makefile.def
-
-LITE_ROOT=../../../
-
-CXX_INCLUDES = $(INCLUDES) -I$(LITE_ROOT)/cxx/include
-
-CXX_LIBS = $(THIRD_PARTY_LIBS) $(LITE_ROOT)/cxx/lib/libpaddle_api_full_bundled.a $(SYSTEM_LIBS)
-
-mobilenetv1_full_api: mobilenetv1_full_api.o
-	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobilenetv1_full_api.o -o mobilenetv1_full_api $(CXX_LIBS) $(LDFLAGS)
-
-mobilenetv1_full_api.o: mobilenetv1_full_api.cc
-	$(CC) $(SYSROOT_COMPILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o mobilenetv1_full_api.o -c mobilenetv1_full_api.cc
-
-
-.PHONY: clean
-clean:
-	rm -f mobilenetv1_full_api.o
-	rm -f mobilenetv1_full_api
diff --git a/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv8 b/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv8
deleted file mode 100644
index d0767145b0..0000000000
--- a/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv8
+++ /dev/null
@@ -1,22 +0,0 @@
-ARM_ABI = arm8
-export ARM_ABI
-
-include ../Makefile.def
-
-LITE_ROOT=../../../
-
-CXX_INCLUDES = $(INCLUDES) -I$(LITE_ROOT)/cxx/include
-
-CXX_LIBS = $(THIRD_PARTY_LIBS) $(LITE_ROOT)/cxx/lib/libpaddle_api_full_bundled.a $(SYSTEM_LIBS)
-
-mobilenetv1_full_api: mobilenetv1_full_api.o
-	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobilenetv1_full_api.o -o mobilenetv1_full_api $(CXX_LIBS) $(LDFLAGS)
-
-mobilenetv1_full_api.o: mobilenetv1_full_api.cc
-	$(CC) $(SYSROOT_COMPILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o mobilenetv1_full_api.o -c mobilenetv1_full_api.cc
-
-
-.PHONY: clean
-clean:
-	rm -f mobilenetv1_full_api.o
-	rm -f mobilenetv1_full_api
diff --git a/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv7 b/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv7
deleted file mode 100644
index d235d6e25f..0000000000
--- a/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv7
+++ /dev/null
@@ -1,22 +0,0 @@
-ARM_ABI = arm7
-export ARM_ABI
-
-include ../Makefile.def
-
-LITE_ROOT=../../../
-
-CXX_INCLUDES = $(INCLUDES) -I$(LITE_ROOT)/cxx/include
-
-CXX_LIBS = $(THIRD_PARTY_LIBS) $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS)
-
-mobilenetv1_light_api: mobilenetv1_light_api.o
-	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobilenetv1_light_api.o -o mobilenetv1_light_api $(CXX_LIBS) $(LDFLAGS)
-
-mobilenetv1_light_api.o: mobilenetv1_light_api.cc
-	$(CC) $(SYSROOT_COMPILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o mobilenetv1_light_api.o -c mobilenetv1_light_api.cc
-
-
-.PHONY: clean
-clean:
-	rm -f mobilenetv1_light_api.o
-	rm -f mobilenetv1_light_api
diff --git a/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv8 b/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv8
deleted file mode 100644
index b91aadcef8..0000000000
--- a/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv8
+++ /dev/null
@@ -1,22 +0,0 @@
-ARM_ABI = arm8
-export ARM_ABI
-
-include ../Makefile.def
-
-LITE_ROOT=../../../
-
-CXX_INCLUDES = $(INCLUDES) -I$(LITE_ROOT)/cxx/include
-
-CXX_LIBS = $(THIRD_PARTY_LIBS) $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS)
-
-mobilenetv1_light_api: mobilenetv1_light_api.o
-	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobilenetv1_light_api.o -o mobilenetv1_light_api $(CXX_LIBS) $(LDFLAGS)
-
-mobilenetv1_light_api.o: mobilenetv1_light_api.cc
-	$(CC) $(SYSROOT_COMPILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o mobilenetv1_light_api.o -c mobilenetv1_light_api.cc
-
-
-.PHONY: clean
-clean:
-	rm -f mobilenetv1_light_api.o
-	rm -f mobilenetv1_light_api
diff --git a/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc b/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc
deleted file mode 100644
index 18167e3ca1..0000000000
--- a/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc
+++ /dev/null
@@ -1,83 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <gflags/gflags.h>
-#include <stdio.h>
-#include <vector>
-#include "paddle_api.h"          // NOLINT
-#include "paddle_use_kernels.h"  // NOLINT
-#include "paddle_use_ops.h"      // NOLINT
-#include "paddle_use_passes.h"   // NOLINT
-
-using namespace paddle::lite_api;  // NOLINT
-
-DEFINE_string(model_dir, "", "Model dir path.");
-DEFINE_string(optimized_model_dir, "", "Optimized model dir.");
-DEFINE_bool(prefer_int8_kernel, false, "Prefer to run model with int8 kernels");
-
-int64_t ShapeProduction(const shape_t& shape) {
-  int64_t res = 1;
-  for (auto i : shape) res *= i;
-  return res;
-}
-
-void RunModel() {
-  // 1. Set CxxConfig
-  CxxConfig config;
-  config.set_model_dir(FLAGS_model_dir);
-  std::vector<Place> valid_places{Place{TARGET(kARM), PRECISION(kFloat)}};
-  if (FLAGS_prefer_int8_kernel) {
-    valid_places.push_back(Place{TARGET(kARM), PRECISION(kInt8)});
-    config.set_preferred_place(Place{TARGET(kARM), PRECISION(kInt8)});
-  } else {
-    config.set_preferred_place(Place{TARGET(kARM), PRECISION(kFloat)});
-  }
-  config.set_valid_places(valid_places);
-
-  // 2. Create PaddlePredictor by CxxConfig
-  std::shared_ptr<PaddlePredictor> predictor =
-      CreatePaddlePredictor<CxxConfig>(config);
-
-  // 3. Save the optimized model
-  // WARN: The `predictor->SaveOptimizedModel` method must be executed
-  // before the `predictor->Run` method, because some kernels' `PrepareForRun`
-  // methods may change some parameters' values.
-  predictor->SaveOptimizedModel(FLAGS_optimized_model_dir,
-                                LiteModelType::kNaiveBuffer);
-
-  // 4. Prepare input data
-  std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(0)));
-  input_tensor->Resize(shape_t({1, 3, 224, 224}));
-  auto* data = input_tensor->mutable_data<float>();
-  for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
-    data[i] = 1;
-  }
-
-  // 5. Run predictor
-  predictor->Run();
-
-  // 6. Get output
-  std::unique_ptr<const Tensor> output_tensor(
-      std::move(predictor->GetOutput(0)));
-  printf("Output dim: %d\n", static_cast<int>(output_tensor->shape()[1]));
-  for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) {
-    printf("Output[%d]: %f\n", i, output_tensor->data<float>()[i]);
-  }
-}
-
-int main(int argc, char** argv) {
-  google::ParseCommandLineFlags(&argc, &argv, true);
-  RunModel();
-  return 0;
-}
diff --git a/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc b/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc
deleted file mode 100644
index e1833814ca..0000000000
--- a/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc
+++ /dev/null
@@ -1,65 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <gflags/gflags.h>
-#include <stdio.h>
-#include <vector>
-#include "paddle_api.h"          // NOLINT
-#include "paddle_use_kernels.h"  // NOLINT
-#include "paddle_use_ops.h"      // NOLINT
-
-using namespace paddle::lite_api;  // NOLINT
-
-DEFINE_string(model_dir, "", "Model dir path.");
-
-int64_t ShapeProduction(const shape_t& shape) {
-  int64_t res = 1;
-  for (auto i : shape) res *= i;
-  return res;
-}
-
-void RunModel() {
-  // 1. Set MobileConfig
-  MobileConfig config;
-  config.set_model_dir(FLAGS_model_dir);
-
-  // 2. Create PaddlePredictor by MobileConfig
-  std::shared_ptr<PaddlePredictor> predictor =
-      CreatePaddlePredictor<MobileConfig>(config);
-
-  // 3. Prepare input data
-  std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(0)));
-  input_tensor->Resize({1, 3, 224, 224});
-  auto* data = input_tensor->mutable_data<float>();
-  for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
-    data[i] = 1;
-  }
-
-  // 4. Run predictor
-  predictor->Run();
-
-  // 5. Get output
-  std::unique_ptr<const Tensor> output_tensor(
-      std::move(predictor->GetOutput(0)));
-  printf("Output dim: %d\n", static_cast<int>(output_tensor->shape()[1]));
-  for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) {
-    printf("Output[%d]: %f\n", i, output_tensor->data<float>()[i]);
-  }
-}
-
-int main(int argc, char** argv) {
-  google::ParseCommandLineFlags(&argc, &argv, true);
-  RunModel();
-  return 0;
-}
diff --git a/lite/demo/java/README.md b/lite/demo/java/README.md
deleted file mode 100644
index 904726d744..0000000000
--- a/lite/demo/java/README.md
+++ /dev/null
@@ -1,118 +0,0 @@
-# Java Android Demo
-
-To build and run the Android demo app PaddlePredictor under the ./android folder, you need:
-
-1. An Android phone that can run Android apps
-2. A development machine with Android Studio
-
-## Build
-
-First, inside the PaddleLite development docker image, pull the latest PaddleLite code and build
-the inference library for your phone's architecture; below we use the arm8 architecture as the
-example. Enter the paddlelite directory and run the following cmake and make commands:
-
-```
-mkdir -p build.lite.android.arm8.gcc
-cd build.lite.android.arm8.gcc
-
-cmake .. \
--DWITH_GPU=OFF \
--DWITH_MKL=OFF \
--DWITH_LITE=ON \
--DLITE_WITH_JAVA=ON \
--DLITE_WITH_CUDA=OFF \
--DLITE_WITH_X86=OFF \
--DLITE_WITH_ARM=ON \
--DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \
--DWITH_TESTING=OFF \
--DLITE_SHUTDOWN_LOG=ON \
--DLITE_ON_TINY_PUBLISH=ON \
--DARM_TARGET_OS=android -DARM_TARGET_ARCH_ABI=armv8 -DARM_TARGET_LANG=gcc
-
-make publish_inference -j4
-```
-
-After make finishes, check that the following two files exist:
-```
-build.lite.android.arm8.gcc/lite/api/android/jni/native/libpaddle_lite_jni.so
-build.lite.android.arm8.gcc/lite/api/android/jni/PaddlePredictor.jar
-```
-They are the PaddleLite C++ shared library and the Java jar package. The jar contains the
-PaddleLite Java API, which the Android Java code below will use.
-
-## Prepare the other files the demo needs
-
-Besides the code, the demo needs the JNI .so library (the `libpaddle_lite_jni.so` mentioned
-above), the Java .jar package (the `PaddlePredictor.jar` mentioned above), and the model files.
-We provide both an automated script and a manual-copy method; choose whichever fits your needs:
-
-### Script method
-
-Enter `build.lite.android.armv8/inference_lite_lib.android.armv8/demo/java/android/`, where we
-provide a script, `prepare_demo.bash`. The script takes one argument: the name of the
-architecture folder matching the .so you want to copy.
-
-For example, run
-```
-bash prepare_demo.bash armv8
-```
-The script downloads and unpacks the model files, copies the .jar into the demo, and copies the
-generated .so into `PaddlePredictor/app/src/main/jniLibs/<architecture folder>`; in this example,
-armv8 is the architecture folder. Note: a demo built this way runs correctly on armv8 phones.
-For the demo to also run on other phone architectures (such as armv7), you need to add those
-architectures as well.
-
-### Manual copy method
-
-Next we describe the manual copy; if you used the script, you can skip this part.
-
-### Copy the .so shared library and the .jar into the Android demo app:
-
-Load demo/PaddlePredictor from this folder into Android Studio. Copy the `libpaddle_lite_jni.so`
-from the step above into `PaddlePredictor/app/src/main/jniLibs/<architecture folder>`; for
-example, the arm8 folder must contain that .so file.
-Copy the `PaddlePredictor.jar` from the step above into `PaddlePredictor/app/libs`.
-
-### Copy the model files the demo uses into the app:
-
-Download our 5 model files and unpack them into the `PaddlePredictor/app/src/main/assets` folder.
-Model files to copy and their download URLs:
-
-    inception_v4_simple_opt.nb http://paddle-inference-dist.bj.bcebos.com/inception_v4_simple_opt.nb.tar.gz
-    lite_naive_model_opt.nb    http://paddle-inference-dist.bj.bcebos.com/lite_naive_model_opt.nb.tar.gz
-    mobilenet_v1_opt.nb        http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1_opt.nb.tar.gz
-    mobilenet_v2_relu_opt.nb   http://paddle-inference-dist.bj.bcebos.com/mobilenet_v2_relu_opt.nb.tar.gz
-    resnet50_opt.nb            http://paddle-inference-dist.bj.bcebos.com/resnet50_opt.nb.tar.gz
-
-After downloading, the assets folder must contain the five unpacked model folders above; the
-original compressed .tar.gz archives do not need to be kept in the demo.
-
-## Running the Android app
-
-Once the preparation above is done, you can build, install, and run the Android demo app. When
-you run the PaddlePredictor app, it takes roughly 10 seconds, after which you should see output
-like:
-
-    lite_naive_model output: 50.213173, -28.872887
-    expected: 50.2132, -28.8729
-
-    inception_v4_simple test:true
-    time: xxx ms
-
-    resnet50 test:true
-    time: xxx ms
-
-    mobilenet_v1 test:true
-    time: xxx ms
-
-    mobilenet_v2 test:true
-    time: xxx ms
-
-The demo runs our 5 models. For the first model it prints the first two numbers of the actual
-output, with the expected correct values attached on the second line; the error between them
-should be below 0.001. For the remaining four models, the `test:true` line means the model
-output passed the checks built into the demo, and `time` is how long that test took.
-
-## Instrumented Test for the Android demo
-
-This section is for testers who want to drive the demo from the command line.
-
-To run the demo on a phone from the command line, enter the demo's `PaddlePredictor` folder and
-run
-```
-./gradlew init
-```
-This command only needs to be run once; it initializes the tasks the demo needs. Afterwards, run
-our tests with
-```
-./gradlew connectedAndroidTest
-```
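The jar built above exposes the Java API the demo app uses. Reduced to its core, the load-and-run flow looks like the sketch below; every class and method name is taken from the `MainActivity.java` in this patch, while the standalone class name and the model path are placeholders:

```java
import com.baidu.paddle.lite.MobileConfig;
import com.baidu.paddle.lite.PaddlePredictor;
import com.baidu.paddle.lite.PowerMode;
import com.baidu.paddle.lite.Tensor;

import java.util.Arrays;

// Sketch only: mirrors the runModel() helper in MainActivity below.
public class MinimalPredictorSketch {
    public static float[] run(String modelDir) {
        // Point the light-weight runtime at an optimized (.nb) model folder.
        MobileConfig config = new MobileConfig();
        config.setModelDir(modelDir);
        config.setPowerMode(PowerMode.LITE_POWER_HIGH);
        config.setThreads(1);
        PaddlePredictor predictor = PaddlePredictor.createPaddlePredictor(config);

        // Fill a 1x3x224x224 input with ones, as the image demos do.
        Tensor input = predictor.getInput(0);
        input.resize(new long[]{1, 3, 224, 224});
        float[] buffer = new float[3 * 224 * 224];
        Arrays.fill(buffer, 1f);
        input.setData(buffer);

        predictor.run();
        return predictor.getOutput(0).getFloatData();
    }
}
```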
diff --git a/lite/demo/java/android/PaddlePredictor/.gitignore b/lite/demo/java/android/PaddlePredictor/.gitignore
deleted file mode 100644
index 2b75303ac5..0000000000
--- a/lite/demo/java/android/PaddlePredictor/.gitignore
+++ /dev/null
@@ -1,13 +0,0 @@
-*.iml
-.gradle
-/local.properties
-/.idea/caches
-/.idea/libraries
-/.idea/modules.xml
-/.idea/workspace.xml
-/.idea/navEditor.xml
-/.idea/assetWizardSettings.xml
-.DS_Store
-/build
-/captures
-.externalNativeBuild
diff --git a/lite/demo/java/android/PaddlePredictor/app/.gitignore b/lite/demo/java/android/PaddlePredictor/app/.gitignore
deleted file mode 100644
index 796b96d1c4..0000000000
--- a/lite/demo/java/android/PaddlePredictor/app/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-/build
diff --git a/lite/demo/java/android/PaddlePredictor/app/build.gradle b/lite/demo/java/android/PaddlePredictor/app/build.gradle
deleted file mode 100644
index b86d2f8e3d..0000000000
--- a/lite/demo/java/android/PaddlePredictor/app/build.gradle
+++ /dev/null
@@ -1,28 +0,0 @@
-apply plugin: 'com.android.application'
-
-android {
-    compileSdkVersion 28
-    defaultConfig {
-        applicationId "com.baidu.paddle.lite"
-        minSdkVersion 23
-        targetSdkVersion 28
-        versionCode 1
-        versionName "1.0"
-        testInstrumentationRunner "android.support.test.runner.AndroidJUnitRunner"
-    }
-    buildTypes {
-        release {
-            minifyEnabled false
-            proguardFiles getDefaultProguardFile('proguard-android-optimize.txt'), 'proguard-rules.pro'
-        }
-    }
-}
-
-dependencies {
-    implementation fileTree(dir: 'libs', include: ['*.jar'])
-    implementation 'com.android.support:appcompat-v7:28.0.0'
-    implementation 'com.android.support.constraint:constraint-layout:1.1.3'
-    testImplementation 'junit:junit:4.12'
-    androidTestImplementation 'com.android.support.test:runner:1.0.2'
-    androidTestImplementation 'com.android.support.test.espresso:espresso-core:3.0.2'
-}
diff --git a/lite/demo/java/android/PaddlePredictor/app/proguard-rules.pro b/lite/demo/java/android/PaddlePredictor/app/proguard-rules.pro
deleted file mode 100644
index f1b424510d..0000000000
--- a/lite/demo/java/android/PaddlePredictor/app/proguard-rules.pro
+++ /dev/null
@@ -1,21 +0,0 @@
-# Add project specific ProGuard rules here.
-# You can control the set of applied configuration files using the
-# proguardFiles setting in build.gradle.
-#
-# For more details, see
-#   http://developer.android.com/guide/developing/tools/proguard.html
-
-# If your project uses WebView with JS, uncomment the following
-# and specify the fully qualified class name to the JavaScript interface
-# class:
-#-keepclassmembers class fqcn.of.javascript.interface.for.webview {
-#   public *;
-#}
-
-# Uncomment this to preserve the line number information for
-# debugging stack traces.
-#-keepattributes SourceFile,LineNumberTable
-
-# If you keep the line number information, uncomment this to
-# hide the original source file name.
-#-renamesourcefileattribute SourceFile
diff --git a/lite/demo/java/android/PaddlePredictor/app/src/androidTest/java/com/baidu/paddle/lite/ExampleInstrumentedTest.java b/lite/demo/java/android/PaddlePredictor/app/src/androidTest/java/com/baidu/paddle/lite/ExampleInstrumentedTest.java
deleted file mode 100644
index ca40855be7..0000000000
--- a/lite/demo/java/android/PaddlePredictor/app/src/androidTest/java/com/baidu/paddle/lite/ExampleInstrumentedTest.java
+++ /dev/null
@@ -1,114 +0,0 @@
-package com.baidu.paddle.lite;
-
-import android.content.Context;
-import android.support.test.InstrumentationRegistry;
-import android.support.test.runner.AndroidJUnit4;
-
-import org.junit.Test;
-import org.junit.runner.RunWith;
-
-import java.util.ArrayList;
-
-import static org.junit.Assert.*;
-
-/**
- * Lite example instrumented test.
- */
-@RunWith(AndroidJUnit4.class)
-public class ExampleInstrumentedTest {
-    @Test
-    public void naiveModel_isCorrect() {
-        Context appContext = InstrumentationRegistry.getTargetContext();
-        ArrayList<Tensor> result = MainActivity.setInputAndRunNaiveModel("lite_naive_model", appContext);
-        Tensor output = result.get(0);
-        long[] shape = output.shape();
-        assertEquals(2, shape.length);
-        assertEquals(100L, shape[0]);
-        assertEquals(500L, shape[1]);
-
-        float[] outputBuffer = output.getFloatData();
-        assertEquals(50000, outputBuffer.length);
-        assertEquals(50.2132f, outputBuffer[0], 1e-4f);
-        assertEquals(-28.8729f, outputBuffer[1], 1e-4f);
-    }
-
-    @Test
-    public void inceptionV4Simple_isCorrect() {
-        Context appContext = InstrumentationRegistry.getTargetContext();
-        ArrayList<Tensor> result = MainActivity.setInputAndRunImageModel("inception_v4_simple", appContext);
-        float[] expected = {0.0011684548f, 0.0010390386f, 0.0011301535f, 0.0010133048f,
-                0.0010259597f, 0.0010982729f, 0.00093195855f, 0.0009141837f,
-                0.00096620916f, 0.00089982944f, 0.0010064574f, 0.0010474789f,
-                0.0009782845f, 0.0009230255f, 0.0010548076f, 0.0010974824f,
-                0.0010612885f, 0.00089107914f, 0.0010112736f, 0.00097655767f};
-        assertImageResult(expected, result);
-    }
-
-    @Test
-    public void mobilenetV1_isCorrect() {
-        Context appContext = InstrumentationRegistry.getTargetContext();
-        ArrayList<Tensor> result = MainActivity.setInputAndRunImageModel("mobilenet_v1", appContext);
-        float[] expected = {0.00019130898f, 9.467885e-05f, 0.00015971427f, 0.0003650665f,
-                0.00026431272f, 0.00060884043f, 0.0002107942f, 0.0015819625f,
-                0.0010323516f, 0.00010079765f, 0.00011006987f, 0.0017364529f,
-                0.0048292773f, 0.0013995157f, 0.0018453331f, 0.0002428986f,
-                0.00020211363f, 0.00013668182f, 0.0005855956f, 0.00025901722f};
-        assertImageResult(expected, result);
-    }
-
-    @Test
-    public void mobilenetV2Relu_isCorrect() {
-        Context appContext = InstrumentationRegistry.getTargetContext();
-        ArrayList<Tensor> result = MainActivity.setInputAndRunImageModel("mobilenet_v2_relu", appContext);
-        float[] expected = {0.00017082224f, 5.699624e-05f, 0.000260885f, 0.00016412718f,
-                0.00034818667f,
-                0.00015230637f, 0.00032959113f, 0.0014772735f,
-                0.0009059976f, 9.5378724e-05f, 5.386537e-05f, 0.0006427285f,
-                0.0070957416f, 0.0016094646f, 0.0018807327f, 0.00010506048f,
-                6.823785e-05f, 0.00012269315f, 0.0007806194f, 0.00022354358f};
-        assertImageResult(expected, result);
-    }
-
-    @Test
-    public void resnet50_isCorrect() {
-        Context appContext = InstrumentationRegistry.getTargetContext();
-        ArrayList<Tensor> result = MainActivity.setInputAndRunImageModel("resnet50", appContext);
-        float[] expected = {0.00024139918f, 0.00020566184f, 0.00022418296f, 0.00041731037f,
-                0.0005366107f, 0.00016948722f, 0.00028638865f, 0.0009257241f,
-                0.00072681636f, 8.531815e-05f, 0.0002129998f, 0.0021168243f,
-                0.006387163f, 0.0037145028f, 0.0012812682f, 0.00045948103f,
-                0.00013535398f, 0.0002483765f, 0.00076759676f, 0.0002773295f};
-        assertImageResult(expected, result);
-    }
-
-    public void assertImageResult(float[] expected, ArrayList<Tensor> result) {
-        assertEquals(2, result.size());
-        assertEquals(20, expected.length);
-
-        Tensor tensor = result.get(0);
-        Tensor tensor1 = result.get(1);
-        long[] shape = tensor.shape();
-        long[] shape1 = tensor1.shape();
-
-        assertEquals(2, shape.length);
-        assertEquals(2, shape1.length);
-
-        assertEquals(1L, shape[0]);
-        assertEquals(1L, shape1[0]);
-        assertEquals(1000L, shape[1]);
-        assertEquals(1000L, shape1[1]);
-
-        float[] output = tensor.getFloatData();
-        float[] output1 = tensor1.getFloatData();
-
-        assertEquals(1000, output.length);
-        assertEquals(1000, output1.length);
-        for (int i = 0; i < output.length; ++i) {
-            assertEquals(output[i], output1[i], 1e-6f);
-        }
-        int step = 50;
-        for (int i = 0; i < expected.length; ++i) {
-            assertEquals(output[i * step], expected[i], 1e-6f);
-        }
-    }
-}
-
diff --git a/lite/demo/java/android/PaddlePredictor/app/src/main/AndroidManifest.xml b/lite/demo/java/android/PaddlePredictor/app/src/main/AndroidManifest.xml
deleted file mode 100644
index 240078a587..0000000000
--- a/lite/demo/java/android/PaddlePredictor/app/src/main/AndroidManifest.xml
+++ /dev/null
@@ -1,21 +0,0 @@
-[21 lines of manifest XML; markup lost in extraction]
\ No newline at end of file
diff --git a/lite/demo/java/android/PaddlePredictor/app/src/main/assets/README.txt b/lite/demo/java/android/PaddlePredictor/app/src/main/assets/README.txt
deleted file mode 100644
index 14aace8f9b..0000000000
--- a/lite/demo/java/android/PaddlePredictor/app/src/main/assets/README.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-After building PaddleLite in your build folder, download and decompress the
-following models into this directory:
-
-inception_v4_simple_opt.nb http://paddle-inference-dist.bj.bcebos.com/inception_v4_simple_opt.nb.tar.gz
-lite_naive_model_opt.nb http://paddle-inference-dist.bj.bcebos.com/lite_naive_model_opt.nb.tar.gz
-mobilenet_v1_opt.nb http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1_opt.nb.tar.gz
-mobilenet_v2_relu_opt.nb http://paddle-inference-dist.bj.bcebos.com/mobilenet_v2_relu_opt.nb.tar.gz
-resnet50_opt.nb http://paddle-inference-dist.bj.bcebos.com/resnet50_opt.nb.tar.gz
diff --git a/lite/demo/java/android/PaddlePredictor/app/src/main/java/com/baidu/paddle/lite/MainActivity.java b/lite/demo/java/android/PaddlePredictor/app/src/main/java/com/baidu/paddle/lite/MainActivity.java
deleted file mode 100644
index e8eb01bd55..0000000000
--- a/lite/demo/java/android/PaddlePredictor/app/src/main/java/com/baidu/paddle/lite/MainActivity.java
+++ /dev/null
@@ -1,206 +0,0 @@
-package com.baidu.paddle.lite;
-
-import android.content.Context;
-import android.support.v7.app.AppCompatActivity;
-import android.os.Bundle;
-import android.widget.TextView;
-
-import java.io.BufferedOutputStream;
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.Date;
-
-public class MainActivity extends AppCompatActivity {
-
-    @Override
-    protected void onCreate(Bundle savedInstanceState) {
-        super.onCreate(savedInstanceState);
-        setContentView(R.layout.activity_main);
-
-        String textOutput = "";
-        Tensor output;
-        output = setInputAndRunNaiveModel("lite_naive_model_opt.nb", this);
-        textOutput += "lite_naive_model output: " + output.getFloatData()[0] + ", "
-                + output.getFloatData()[1] + "\n";
-        textOutput += "expected: 50.2132, -28.8729\n";
-
-        Date start = new Date();
-        output = setInputAndRunImageModel("inception_v4_simple_opt.nb", this);
-        Date end = new Date();
-        textOutput += "\ninception_v4_simple test: " + testInceptionV4Simple(output) + "\n";
-        textOutput += "time: " + (end.getTime() - start.getTime()) + " ms\n";
-
-        start = new Date();
-        output = setInputAndRunImageModel("resnet50_opt.nb", this);
-        end = new Date();
-        textOutput += "\nresnet50 test: " + testResnet50(output) + "\n";
-        textOutput += "time: " + (end.getTime() - start.getTime()) + " ms\n";
-
-        start = new Date();
-        output = setInputAndRunImageModel("mobilenet_v1_opt.nb", this);
-        end = new Date();
-        textOutput += "\nmobilenet_v1 test: " + testMobileNetV1(output) + "\n";
-        textOutput += "time: " + (end.getTime() - start.getTime()) + " ms\n";
-
-        start = new Date();
-        output = setInputAndRunImageModel("mobilenet_v2_relu_opt.nb", this);
-        end = new Date();
-        textOutput += "\nmobilenet_v2 test: " + testMobileNetV2Relu(output) + "\n";
-        textOutput += "time: " + (end.getTime() - start.getTime()) + " ms\n";
-
-        TextView textView = findViewById(R.id.text_view);
-        textView.setText(textOutput);
-    }
-
-    public static String copyFromAssetsToCache(String modelPath, Context context) {
-        String newPath = context.getCacheDir() + "/" + modelPath;
-        // String newPath = "/sdcard/" + modelPath;
-        File desDir = new File(newPath);
-
-        try {
-            if (!desDir.exists()) {
-                desDir.mkdir();
-            }
-            for (String fileName : context.getAssets().list(modelPath)) {
-                InputStream stream = context.getAssets().open(modelPath + "/" + fileName);
-                OutputStream output = new BufferedOutputStream(new FileOutputStream(newPath + "/" + fileName));
-
-                byte data[] = new byte[1024];
-                int count;
-
-                while ((count = stream.read(data)) != -1) {
-                    output.write(data, 0, count);
-                }
-
-                output.flush();
-                output.close();
-                stream.close();
-            }
-
-        } catch (Exception e) {
-            throw new RuntimeException(e);
-        }
-
-        return desDir.getPath();
-    }
-
-    public static Tensor runModel(String modelName, long[] dims, float[] inputBuffer, Context context) {
-        String modelPath = copyFromAssetsToCache(modelName, context);
-
-        MobileConfig config = new MobileConfig();
-        config.setModelDir(modelPath);
-        config.setPowerMode(PowerMode.LITE_POWER_HIGH);
-        config.setThreads(1);
-        PaddlePredictor predictor = PaddlePredictor.createPaddlePredictor(config);
-
-        Tensor input = predictor.getInput(0);
-        input.resize(dims);
-        input.setData(inputBuffer);
-        predictor.run();
-
-        Tensor output = predictor.getOutput(0);
-
-        return output;
-    }
-
-
-    public static Tensor setInputAndRunNaiveModel(String modelName, Context context) {
-        long[] dims = {100, 100};
-        float[] inputBuffer = new float[10000];
-        for (int i = 0; i < 10000; ++i) {
-            inputBuffer[i] = i;
-        }
-        return runModel(modelName, dims, inputBuffer, context);
-    }
-
-    /**
-     * Input size is 3 * 224 * 224
-     *
-     * @param modelName name of the model folder under assets
-     * @param context   context used to load the model from assets
-     * @return the first output tensor of the model
-     */
-    public static Tensor setInputAndRunImageModel(String modelName, Context context) {
-        long[] dims = {1, 3, 224, 224};
-        int item_size = 3 * 224 * 224;
-        float[] inputBuffer = new float[item_size];
-        for (int i = 0; i < item_size; ++i) {
-            inputBuffer[i] = 1;
-        }
-        return runModel(modelName, dims, inputBuffer, context);
-    }
-
-    public boolean equalsNear(float a, float b, float delta) {
-        return a >= b - delta && a <= b + delta;
-    }
-
-    public boolean expectedResult(float[] expected, Tensor result) {
-        if (expected.length != 20) {
-            return false;
-        }
-
-        long[] shape = result.shape();
-
-        if (shape.length != 2) {
-            return false;
-        }
-
-        if (shape[0] != 1 || shape[1] != 1000) {
-            return false;
-        }
-
-        float[] output = result.getFloatData();
-
-        if (output.length != 1000) {
-            return false;
-        }
-
-        int step = 50;
-        for (int i = 0; i < expected.length; ++i) {
-            if (!equalsNear(output[i * step], expected[i], 1e-6f)) {
-                return false;
-            }
-        }
-
-        return true;
-    }
-
-    public boolean testInceptionV4Simple(Tensor output) {
-        float[] expected = {0.0011684548f, 0.0010390386f, 0.0011301535f, 0.0010133048f,
-                0.0010259597f, 0.0010982729f, 0.00093195855f, 0.0009141837f,
-                0.00096620916f, 0.00089982944f, 0.0010064574f, 0.0010474789f,
-                0.0009782845f, 0.0009230255f, 0.0010548076f, 0.0010974824f,
-                0.0010612885f, 0.00089107914f, 0.0010112736f, 0.00097655767f};
-        return expectedResult(expected, output);
-    }
-
-    public boolean testResnet50(Tensor output) {
-        float[] expected = {0.00024139918f, 0.00020566184f, 0.00022418296f, 0.00041731037f,
-                0.0005366107f, 0.00016948722f, 0.00028638865f, 0.0009257241f,
-                0.00072681636f, 8.531815e-05f, 0.0002129998f, 0.0021168243f,
-                0.006387163f, 0.0037145028f, 0.0012812682f, 0.00045948103f,
-                0.00013535398f, 0.0002483765f, 0.00076759676f, 0.0002773295f};
-        return expectedResult(expected, output);
-    }
-
-    public boolean testMobileNetV1(Tensor output) {
-        float[] expected = {0.00019130898f, 9.467885e-05f, 0.00015971427f, 0.0003650665f,
-                0.00026431272f, 0.00060884043f, 0.0002107942f, 0.0015819625f,
-                0.0010323516f, 0.00010079765f, 0.00011006987f, 0.0017364529f,
-                0.0048292773f, 0.0013995157f, 0.0018453331f, 0.0002428986f,
-                0.00020211363f, 0.00013668182f, 0.0005855956f, 0.00025901722f};
-        return expectedResult(expected, output);
-    }
-
-    public boolean testMobileNetV2Relu(Tensor output) {
-        float[] expected = {0.00017082224f, 5.699624e-05f, 0.000260885f, 0.00016412718f,
-                0.00034818667f, 0.00015230637f, 0.00032959113f, 0.0014772735f,
-                0.0009059976f, 9.5378724e-05f, 5.386537e-05f, 0.0006427285f,
-                0.0070957416f, 0.0016094646f, 0.0018807327f, 0.00010506048f,
-                6.823785e-05f, 0.00012269315f, 0.0007806194f, 0.00022354358f};
-        return expectedResult(expected, output);
-    }
-
-}
-
diff --git a/lite/demo/java/android/PaddlePredictor/app/src/main/res/drawable-v24/ic_launcher_foreground.xml b/lite/demo/java/android/PaddlePredictor/app/src/main/res/drawable-v24/ic_launcher_foreground.xml
deleted file mode 100644
index 1f6bb29060..0000000000
--- a/lite/demo/java/android/PaddlePredictor/app/src/main/res/drawable-v24/ic_launcher_foreground.xml
+++ /dev/null
@@ -1,34 +0,0 @@
-[34 lines of vector-drawable XML; markup lost in extraction]
diff --git a/lite/demo/java/android/PaddlePredictor/app/src/main/res/drawable/ic_launcher_background.xml b/lite/demo/java/android/PaddlePredictor/app/src/main/res/drawable/ic_launcher_background.xml
deleted file mode 100644
index 0d025f9bf6..0000000000
--- a/lite/demo/java/android/PaddlePredictor/app/src/main/res/drawable/ic_launcher_background.xml
+++ /dev/null
@@ -1,170 +0,0 @@
-[170 lines of vector-drawable XML; markup lost in extraction]
diff --git a/lite/demo/java/android/PaddlePredictor/app/src/main/res/layout/activity_main.xml b/lite/demo/java/android/PaddlePredictor/app/src/main/res/layout/activity_main.xml
deleted file mode 100644
index 0d1e60b97e..0000000000
--- a/lite/demo/java/android/PaddlePredictor/app/src/main/res/layout/activity_main.xml
+++ /dev/null
@@ -1,19 +0,0 @@
-[19 lines of layout XML; markup lost in extraction]
\ No newline at end of file
diff --git a/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-anydpi-v26/ic_launcher.xml b/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-anydpi-v26/ic_launcher.xml
deleted file mode 100644
index eca70cfe52..0000000000
--- a/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-anydpi-v26/ic_launcher.xml
+++ /dev/null
@@ -1,5 +0,0 @@
-[5 lines of adaptive-icon XML; markup lost in extraction]
\ No newline at end of file
diff --git a/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-anydpi-v26/ic_launcher_round.xml b/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-anydpi-v26/ic_launcher_round.xml
deleted file mode 100644
index eca70cfe52..0000000000
--- a/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-anydpi-v26/ic_launcher_round.xml
+++ /dev/null
@@ -1,5 +0,0 @@
-[5 lines of adaptive-icon XML; markup lost in extraction]
\ No newline at end of file
diff --git a/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-hdpi/ic_launcher.png b/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-hdpi/ic_launcher.png
deleted file mode 100644
index 898f3ed59ac9f3248734a00e5902736c9367d455..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 2963
[binary PNG data omitted]
diff --git a/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-hdpi/ic_launcher_round.png b/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-hdpi/ic_launcher_round.png
deleted file mode 100644
index dffca3601eba7bf5f409bdd520820e2eb5122c75..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4905
[binary PNG data omitted]
diff --git a/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-xhdpi/ic_launcher_round.png b/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-xhdpi/ic_launcher_round.png
deleted file mode 100644
index 14ed0af35023e4f1901cf03487b6c524257b8483..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 6895
[binary PNG data omitted]
diff --git a/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-xxhdpi/ic_launcher.png b/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-xxhdpi/ic_launcher.png
deleted file mode 100644
index b0907cac3bfd8fbfdc46e1108247f0a1055387ec..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 6387
[binary PNG data omitted]
diff --git a/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-xxhdpi/ic_launcher_round.png b/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-xxhdpi/ic_launcher_round.png
deleted file mode 100644
index d8ae03154975f397f8ed1b84f2d4bf9783ecfa26..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 10413
[binary PNG data omitted]
zplw$Td3)1=B;S71raVS|C4XCE+i!)Y)YsxC zwr{1D2jEFPc?7RGyqCV#udVzd$BRCC0H?lu6o-;y!s{o=UxTz0REZZH+>J9|JAt3s zzmvYE+Eq#889~}zMJ*4&lX>bSjy`sXzE)_;9zIn!*Yltns(4batkeI%Q%T*?_v-l- zwzrm3eQo2^eRVjbFzZgQkn!Qr)?Qv-9>(^*n!7QC+Pie_+=cw@9hkfB2xJx-vh}yA zTVn@TmEvJ#1=R8YJWubbp>9m4%JS)VG&LMlUV!KB-HunhxDSsc$As6z%h&U3vo;k{ zO$HcWI*2C`VCj2X3Q12&RYlshwMk%k0G`!-Fx?$J^uSaSsW%wXr8mn$ z;~AVgF)0R8iD^b{(GvruXp?%J)1xrGDF!ki=FyCE)MFsSVjfM6Au&)Wu}Bi=^k|QH z6l$achszhr(CFcFXd8EPGdXzH1jvCdyxFM(++21qTCwm28srMxgw9+m)jJWN4erJ$ zfHVLZMJ&MMe#UxB{gzxExlj?R><7D^?>gd zIsvP#Th0rRf$)HO7NyhMYMKBt93Bp!1R5YW1IR#lv;!2+Z+#M@Fq;1OKH8?<-rZ>% zn<;qKH8R~3_2@bhB`p7*PXFr}owme&VS;Ayb&TsY1IP$?02pEJib{@y9PbYJ9-F0^9DWM#x0cd9E8d{Nhwu7<=K>8+N^$ZNE0c0dR zf&mgRx77?FBjITdP&~i&$sz#7EWzl}kQ~~U7Pda>u@Fr0w?{q5-~J?^euK+yOKh+@ zK-wS@FtV&4AYl`uO#r1C4No(GOn|2epc(>Df)>{$ZJ_HW%?-am+He4COHWJ0KH7U^ zJ}zBh%m57^@+5I(e{q>?{I1NR0BKHp2%Oha0+beGG(36%GGJC+2~b6`N$@BEs@DQg zX1pBgOSE*}Efmy$I&DJ>^}KXhp?36ES5Hqr^0%LO&a^z*cv>b}Ee=pNt0)6z*0lp< zSV{&gYQPJSfhidrK-D||#TlBCfycn$tyX}D>xy2C#ZNx60osnWp*w3+F|xu#VTHJL zgq)pW3H*WRxp}YA%HipiSp^_NAR?fQ+R6uz;rTqg02z_b!w-<*@IW1C1t<%~d{$u5 ztf~K`ZN{~oH)~6)SfAzrbq8wx0#N79V@ObTnO>*{L{8A*)}e#1H3DaS0kwz1l{q{-VIh)6$u;94s{*9U z5~XMZ$oNb`HGoXWBy0kx#3Xo{0hGz&9?~NdEngrPj~y9BU6+T4KW#fJ1kU3zQ!wON-a=10NQ87wwb%6LRQHnNzVok~O}hUVsF`(;T3r*TuC}N0kXv5o)1FlPiM+Bqt}hut8}4Q~S}Hl}cCEA^@pEl%fTo9TnOE z5;!qR0U`~r9Ux&7qZFX$wE$!QJWT-AasYwrihB-=rayj^whh-tom(<6q$B9d zZUq^P7R@|EduBNavK9kK0a0o+4?xA*0Wx4#9hQ{S4v_F!bx8Vx+?{3s83>O8AUKu; z7R5-2!lIdB=SZ6jp>5M1b)#+7g073t3W?bexF?D1dr=>Y&`=aP=RG=KRF>NSOQy95 zK)et|<53k_05UKoLpwl*rDX5|WCT1=*3s1jpuM#X5*RF;GwnaH88>Ycu5CP3rYl6q zMjop1khimkM{gLVb|XErK`9BJ!`9JjPoHdbLU(bm z;eEj(uqd?P&>oz1`XpVG5SEpLMGg41O+(c*@m(RvVTLqR$Rvb$EPmC{;Fw=5eU(@q zfM-E*{{K4m?)@;dfs>DWA9{;2*ESMcghxGlkqgj#6g@N7fPjz(bJITSk)MJkc}X&3 zx1n||Scj*RSZZ`#x$)as6IUTgi=&nY;DLm932`IpiqozPb@`WM;c2AddJtCz%c<}x zlTT7LK>|GFFhd$DOoH+&LAOZEBO#raL9xrfVDKn#VxV-BG6@wi5acWy8uM^nb<*3C zF2kbP(>^3_>j4H&AJ*e?wdPcXIU#bR%Y(SN^(B7;+qG*q9Lts!hUfDDKvSRB0+0c->J*@QZ2-mV0!U8Bd1526=;cl}bkQ8tzni+Ng#wO^Uu3(L_tPcUJ2^F{|sY8r}6)1CKU{y0Ag40i>Wq#8V$DMynRd zXk`mr#M7(*DR#7h*J;LQ680?4Yz~kS`8@mp>4Aq_pJ?eknRs%@Ca6=I+r!mym(~ss zA4IM+m~%${$kj2BJP&es;J(Eua`v~}s5PX5=yquq0SGoEfnRZ&amirK05UQetT{mO z+VYs?G@CFn3XA4Hby++zco~HU>eLzaW&yLSEe#Z!GbVCj-N~NF)fFHbEb;NWAI%Ow z1wNeH15|rvqs0JH3^oD)2Bu^v0V+y2DU+}Xpi&+1NE_($Rg19bsnD~MPM#C!sK1x% zAX=wf-MX~Km`A83YRASRU?Q&vfoLGi&p=!xesa=!(en8>x#^F@M!Hf~mK6a~LS$G< zhHij_&#Ef{sw!;`4kW-spbWV@OXl1ZKNeC#V@a6X;(mxdSet;y4)0u*1N9VQ6mnIhyQEZyBO%Gb%x{I6!oXH>p9h>Ks5dJOCM%k^un0ed6UHP%Pb8m@^LR*1I5nOkq_hdUc^+S%FHIjIFJs_SQx=R!_ z{|}V3f?1%o4b%2-m&4)?76nK(Cekx8+8iL`lEGk!m8tc$a$f-|$Uu0~PAo}G2sF?{mwdqxbK&cGQ$%gni}UaT%W z>{iFH*vN(TF1pf6baWg*dmhXpN!;AVi65PqEqZ491+;wOpOAS+8#RZ)#91aeU3opr zM1U0TES(RaEFAz5U^3zeEO9c{qvEDbq@;7OZ2q63IpG(?4?U1W%5uNL;yAjv45nq} z!0F2Bz~yd^b&Rz}5@xDhSt1nNKIG>}ewB_*u5Bn$utQM)S>h>^Dn$#P{*b_Qi}v2A zWlB&7DvMeu3e}jpavVlt4oQvyTVrcNloqGbjn8N#ujME$ULBYWcGoQFO`)jyw?y-1 zd?*fmxYA*8|JiWuY&?g$Do4)Z__4Bjv$8v>bkFVZm;oftBGK_9@@pl%lXjej!A!LC zh#}9ohCi{{ZQ-mp-B&KY>P}({57N+{xyjh8FctPfr+T!$Mn30oz09XHQwIB^dljb1 z$^SVOsXW(wZ+)uVGjE;TvtW(PvtX@k@RmZ^+(Uch12(V6o&_nG{11DO9u@4h`w=yp@yLR7+-F_P_1>{dzv%Vc z{4?EWO|R#D_cC>41Q@6rEpfZPY}Qsw(iu+VtM zk?VfLxt-`8D*o)6RH0G0sdlU^c5qq%Bu%TN3R6ec{q<$PcmS#o?ctDy1vk>p({m{8 zE>kOk6c$U>a;ZxBKlm)ODnpQ`%TPxJEO2ZmdS9GBJEt$ZhK?H0Xj&UPI5rAX2R88L z$%0cK7N~Y(7NHkw?B3M1K;whO01!A0WE#NW=*IvFVBhg)$LPV1*_EBco1N2*U4tE( zRtl2?YqWMOIBn0yR9sp7qyVcUb1gnBpzXq7P*oT9KOgqljw+zIvtzojb2zbcN;KS) z9hz1SlqysTupC)~JF~`b&#VTY6#sW--*Hp{MHLo1Fn0-5nsA9VKvNapXEcv<*FF9Z XdJ+W}DiIkV00000NkvXXu0mjfKBlg6 diff 
--git a/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-xxxhdpi/ic_launcher.png b/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-xxxhdpi/ic_launcher.png deleted file mode 100644 index 2c18de9e66108411737e910f5c1972476f03ddbf..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 9128 zcmb`NcT^K!5btji2)!5SAPPuNq)Ls56s4*38hVo^(nUfO6%ZAH(6N9hNR=iCp@USV zNUs_|I-wKc#ou}5-}laWIcKxU$(_yIot@8o_s%{sGSH@@=As4w(CO-E-X`sF|29fE z>HYT9T?zm$_~>e0H4dIw&!!4C9vSZxNlr9*d^_s#H!1R~WS_6MVYz@X@%G!e zXHz-tb|VivQj`iFZDUWNj>i`*9rwT8VC9f`)ww2)D0tG&WBFX^J|oMigqUy#_eV)Q z<3?;pz6pkr(;Z)thNWZ3Tu^XIU(m2~K2{iFEAS`~Gy5VW_tC>i*Cl0kv`b9xtW+!e zPD_a1*)E4YGCWy+8(ZVrP7}Y9URLg*>8E8fyY^0u;VQCkoBQJ<_5zdXl(d!zb~b;b z)6|dkG)>oK`*erN6Q98nTc z*T4b)onLqyA@?UYxy_MYQjd+D&|e(Pm(0oT&BjWQ4@?kFIoB**?M#(;rSUW9SnG<- zSt-|WaL6iG_P3uZd9eIpr{TtNWC*$Hh2Qz?uBS}bIbRfO#e{zRE!IEy&YexD%F}@N zL-y@k#YdI*GK@^S9Mw$gu9^2z1mSnEkrdxz+MPN|ZNhhS)_oYvhM)cLTYGn3J-&{3 z*gO%dE$+F=!pgEJp;TQOxUvmXY0MZXd)l&aIQ@q%&TOO4FwrA~ak$>;=zXV4zzr%` z=0~OcyNxrVAu`L~2ctf1)jOUXrl5QhI{u_3cR4;2>t?n_c`o(TMz?xA14+Wh$Va%BY0&2$WKO9mM2sYf3h-OCY*=ZOJ$Ngw)1D_iorRZXHQZi4&2K7qT927nQC0Lrg3 z(#lL522bDvLQQ|!4#s}u&v;Yf6v=QytSm1*VR`JzNHPFHGlJ!`WMgHC3lNnE^`=*0 zy?^9tJWsJlLSn+d=%5(DNQYCcv%)omexK}hyZmUHWQF=7JRFKXB_b-*?UD4{x!=dVwazRjll3YN!e1GQ6{ViI{ zhkd)N+MWKT`q_V0)j;tA_oAca{;nI(Y$Pb7t7Zgb7)DUREOEf@igE4Q;TqcgkX-wd zJ;8G+7!?>DALr#bk)GNchOvQs{BBN~iU1F0&RMR&ou$CHl>C|ZrZ@PkAenI@K>Al% zQ7|N8uxRTq4vM*lnm?oa%}HLn-3G$yJC_b75?=65k%LM)%(H@{N`65=i4pdO>Mz+= zLeav25B?f086=X6O6;%!2@%ZP1|;Nvbnj_2aSc+8ZOx$k{x3Drh^ zc*UWh!@lFm$>1}Uo>u2rUqXSar;=W-2Mqo41Pl(rQD;>HWC;@e#W@Z29HUt(caNqC zC&6BqG(7E8;B^rX*m6|Ejm>-6L>RWQs{?%J*!{N&Cn3FMX$DmBS8~(Emio*Dj(^J_ zk~mE@d*561epZk|Er>78iC#q_4Sp0Y3GD6B@JKKrmyoJG4WGBh)HqTZZw>kH>(OJH zlp#iE)N?g*Z@4^*MV+s+H!!1LJlIN*`JxC#o-v0{2|BS}}kDUMqX8%d%;Zo1pF*{G_rVrzNd`M2ya!T0DJTesuRVwL9u7n&PS ze_~l@1G?`(riUCq#<3T)^gi`sw~pk^JSP})C#_iBKTD*{^N7d0$A0wJ3#IRYe;0q4 zA*$YJb_LE1lo-`!M^fB~U00SLiLywh>%-_CXgSb{ju=7v+FzB+78O;y>TeZvRv&RoWxTLP?d+9Zi&Ypua2+{3 z?&P=TOQKt{%~L~p0$j8^;iia9j_>fKovkcwq%sUQ@nh>Z!)%cfJ0$;z4CPrz6I0OU z@+^ZT$qbq`@V*LyaM7l>CZ1ZQo!IplAN5a81(Tt~ztAbYc(d{@u2@?f2YdnGcoX!#60Ixw-Nvix#$k1X*NJg)beTLqL8^6*<{2f@@ns|Q}RjZ!$JIHK8NbS8xrmu#@ z6ulfiVr7xxNb~dV#acSrSX_pQm;bUeyjdV!{OZy#M4(A` zwu81?V`O!?oZ`D{REMi+x!1hB*6Cy(I?k8T%kET=uKQWo39E}=ca$my=uHTEyP8y z54Nz1YH*)(w%#ztIo^C*PQOjte`Hel~gpFN_jZaXoFZnUzuu<)94E6T<5ZU?s4>c zpU3Uo@d?+!hgYmVil!6X(ly;KNm*OwbI8{z3v|%I_4HT>Nt&7^q0@@SPXaA`iAvAR zSr*v1muELwpeL3wqu$P7L5q4m)-N%|J6fE`4!V+xyrOkr+X2!LT$k#tFYksHJH=n z3F!I2Qe4B5pnFmAer;+($yQcgD*uHlDurPx@2dd)1-RjhQe(5`*~SLS`q|S9v+`3~ zQ>IMi+hcTX^%}_YWT=}koWlGSwSH~mOvRNJ&Sfrc>H__ux(6*kTUubhdoQN>V2}J< zR)ymBx4g=I%zlp1J+QjI7joltSLskIt}qG%d@lfB@0(d>+A&l+Glwv&La86NxDmfT zNv>`p7eT?@iBSF8R6M^wCx1D;HRt!F#6s8>2mF;&B-MF;2m~@G4CaiZ!p=4aG-$V0 zYR+PtSNvY$YwW0OPYxL-i+8&!G0&s(?(IcQ&Iv2 z0Nx*-7_~pZT6#2L-so8nF7QMgH5}#22w+dCGMyllm->HAO8q%eYuJ_BHB7343cyG+ zgo9$W05T7{CPl`Zw^P=q+#rx_`T2%M zMCeCJLfZT%fI{csusPnQ7Xv@XSzVNmPU{iX2w134>~=VfgQ82*rq^p^97wA647vgT`a# z85e!NpbSl#8uA*dnopv4RMby4F4MY{UFn^r{Li3l%Ume;QtBh5?8wCixw0*zSQ${* z6)@M`djm|Nz;H2K_j1ACvx90`pqKN#`9b8Cd=@J|$6R{ZYc5yw){(D1GtABWH=Zy` z-HxQuV(8LOB`UjI4iAOJ34LY@KVEmPb@XIC)FfA6m5B&*8T*hQyR{mweAL1#*kA9n z;O}eZUE%DcD;yjrQM!F!8~hPzPrCH2Fvr-ItjJE$$pV*gv9>ye(q2lsB=uQP$h%X% zlekK6q~fP4niGy&O9mR~_I;)G@;?e;L8#rja{}{3_rR(d$+fAsX?PiFx`2ashkOGP zw9A><#);kE3G}H}!W&WxH1$sg*P@*n!{=#L{PK)y~GHI;RsgpA$#8cpY~ zct*9kjG$l!k{*0T43n={dVV!idt6Zw;lPW%!2K;#E>?J>D|V%r^A`&*)MdYZJT>jL 
z*;x5TTDFevc8OARtqyN`Wyt;0MTTO-DDG|wtNxUqM1$~ye0&&wUtZ&eqI0=0|Y{WT*|Ia1An)J!bjzf9y3P874R^|FamuD zD47YqkS6Zsd3^fEq_zq1i3zN7fM#ldxb7Z@0Y;<&n|qFI`e8q;TO3t$s`geh?U*oK zp&F$0CKJFD-a%BYO^4KA!5J4T1f9rK@Izkpt4qui#^S_s8AE_pvL7$dKQ z*TXfMJYx+MCq$g?pCj@15ZQdjbAm~v`@A?MCg`$$;e!iKvcv423 z^QOF{_mgOGh3-cDZ={Gyr z_&&UYqVw>f(5K`SHp~Mm5XB0N9$~=XOXd$uQNj=bO95ChnZX9K@n&#T?vXPDfqt07xJZVvBuujM>H*4hP6HvbJ~#$K=z-vNQnRCryVz5?3YqR02@1#K{#%aX?h4VQ45b zcmM<+1V?|eCnx}P7(IWh<1mpP1d4*Z4r1WAfB;C4dhrfKPC^**Pz;nD$YOJ0I9i3T zdQ`v*UjtnCM$WL`J8L<$;~1_X+Oyzj(IKG(tLOn!YS8Vny{ z@>lc1XCA-~hhrD7h1@0O)T))gw+GcvsVwxcnaCv{EQzu|qcwKGyiwb`TTP(}njGXHh$KxOryTWq$B1F6I8!hh2O<$rL^FOXZoKME=~3M&0eN93bd- zfpL<(mU)+asMc@#Mvb?Ws^Rw;E;iny$Mb$bu)1ovt0lOm4f(~cAmY<65o0ePN*$EX zrmHUhGI1J_t=@d`{#mmFd?eV^Q&jw>g^;Pf)7JHdLzQB*87{77?Kto0xMvGjC=&M5EOW+c zXpXOY6|Uf)0am19ZLde+hX5J6c11*#mSinvk^A4NWc#m5P)?v~|Bppv*0~T;-^rI9{w3{`~5)bC}`nF?zGx z#@S`#(Q@kl-1Fmze)A@u^#@9=c>MA>$*eslP^G`Zvb5N|sKK{mQ*V?4eX_x+nT?*N zalRRl;P=w1HG57g+d^AJQCZh4&g{?mbJZuj*>jJpGL#!`*C>{MRd4-HML#+BNUG#EHx5`rs8QUMda13u9eMG(lKCYTHCS2gO0L&PIU zkkI-^jv5$aR|blKRsJ6xJ^?au7%A7>eD6+l!ALkEL&*RPl442Nll#UeUv)cn5=YV~ zP)$eQ=SZYMG+hSAy@o*c95}KXP7(~*M%`ovFuZos#RM5t0XkRn?DdjD!7zh+HMGoz6C^Gk*}xdzg{VaE0-2L4An_I# z_)DVjA|u=a+{fkuUkWg+!HA~@f87&ENbQ{u_}}LPin9T}}BZ5K1W#~XT5z0gcc+cy7@$?+tH6Ta*1qVBL@ zBwd%m=LAwRv8~~Cx3MfLmwax@N%=M`ciGYizcDPi#Qug{`#^)V(iZGpR*3ayNFiWv zCT;%Yg?Tn;SO3Pvyu6Dolgt$Pq@8;O(nD{uHM<__6!t9UUP@K#N73GQB){T~9Hpci z<4P6T>Kb;ktBMTne4`e~@)E&sIdENQj5G9OYu`7~bvsRTeRl1z?i^aI{)?VNlekCC zXJKVy+B;Z0|Abe1cpfcW)93y`*4%NW#+1!-OVtut{#3Q5fvBQ-b<*gu4x4f6pmz-x)Q8wc+4G^!kGq??b_{28Zdu9+dS0=wgR`1Va^@f*j96v zE?=;Q{AtjKXi>F3-EkrPfL<`s@S z(Cl$t|NBt^_k;7j{U(%~9iLt{7g5yFfhq?^mE$`_Z>W$9l{seeXUdzmz8$X$3_fz0 zNc_d*naeGkU7&S83}C%)Owd-QTjWCq)4F3puS?Y*tOH3*JX`9t7=HyB%;}BFw)~fX zP3M8Ef?E#|5Tf;EuVktd)#&vh7trJcyxkI{{O|eok{tE^hzi3_4LW$*rN)J?Qmy@$ z@GmJ)5nOLC0(h_C(Ayd(aO3hP5pxuMsRZfvoFgBCNNrsu!(1gLl_W1XDWi)1KiM4& z4TFIN4Z44?71-@F^TGn<^DjNF#jfDTD;qdJ36mB3{oK$>kk1T9x32)H^4{v<&J$?GFZQeeKn zog^e?9JHCkaVAg{99*Xytpn)yWZ-y+!;hT(I=Fwaat_Fckc87LJ*r7!)y;@7k^fUK zxl{eySNWG_U%a8X+L`q+Pwk<%iyJN!iw;Q%=1>$p(4~A8CwtPS13^pt$BA_79TEm3 z!hx@gB4KmstaCTszUdc8*ch3y0f@{;*awP0cxYg(J0u?XLQsFzBA;#(`vHd`I*lBM z;(99!j{626=)R8+$DgEz-MfuzaGI&_b*%9#-BUQaw^>IHgp<=gob@UA0r`@#>-qw0 zpfFP4HZ?#}t^J2jFG?J|6<^ALo3?t>Oz5`IuInteCESw+$NTFo3L77A?}>NbqA$vz z-v81kRTwtLT8^1Hkf#X&iRsn`fKmr-Mu&N{*qwp;$qBXyT}BAQ@L;wB^UWEXX)3_b zh&*ke8czIhFd!IxCi_N!jnrKGIQpfPR2xJo1%*JNF^PvDwB;>G~7@ zQVZ23Q}9_P0C|)?QPY(DS0!&Y!!b^`S|XCy zKNy*Kil!;HIXgI}+mn{ko*V0S7_|JPJm`{p{nOe9Vi^>B;a*toh zNY>_;v-=$AgIA44ebwp@a!75wJN7K9j;+SW z8uoQjVUb03=55d=@#Y_9`Fs=Ut|9xs?0ce>@0mn&q+oSJdb^!tTO8;mb$%l));(4- zKPebA@3lPn z@G1otTd9DCo-AAllf-ruy4anJn=H{RXLG>6j;g|@m(&__Lzek=U-sRZzRO1lOrtOJ zm+5k9slTfFKsku7%a$T6ENphjA3uy9eG=kh6ii90n}D&mc!E$-XY)ycsx6qljq9PY zpDzzbG!`4}xmvrE+7f*Jx351b!!}L5XmvDjt;&0$*g9U$nbVZwscA2!5>S?vG~K*d zPzXIIrnkt|yfEO5^dk>cVc0*&Hh$%zYA8nPL(Hwwk?vVuZpJ+&#LxCsujZ^dalGUq zk8X*2y(traI^+1KZEu-(_j%t<)w?tI>hVd#CUfisw!-|mSM{#>X=67C83>oRW^)Nc z_@hYvV5!q}p#c+`qTV9*kqk5GkA6Z;&)MXHw7m;gzS)ito45k#Ejt_oX>5cfTLfXUX@_N^+#UicK@ zbUwcCAj!Nyi??H{sraN8NiTB?aleSuG-iy_c^*{zg2xn*m1e+7rBnP~o!PuP9z$Gcf(C!4f_G&|`v9JI zHr460gE4qwW4yYiYMyx4c#(d_<1JDCcBZLe=D9DE4fC#q8)2D2Dpnaszf0h1)i*7) zxyKd8y*&dyiKySsH2Uj5(~gfdkoWmaI$)6ycN3CquawfZ+R8$$x+k;L>%Fd*;XYy0 zkq~3{maC~f(~h3ZUsXWo-EodvK!+KO{DW8g|IOnpPq%l@9Ky`Dd0%sz0@6$Ox`Aei I20H400LcNok^lez diff --git a/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-xxxhdpi/ic_launcher_round.png 
b/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-xxxhdpi/ic_launcher_round.png deleted file mode 100644 index beed3cdd2c32af5114a7dc70b9ef5b698eb8797e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 15132 zcmZvDWmr_-8||54h>`B@4yC)hOQZ#cM!EzfhmdZRPLWXQlpaz*O1gvrk&^D_^84TW z@jlOq4`=WFp4extwb#3MjEilFPELs0YL1Js)Fn* zzr}qsbfZ_wbNOa4S@vf>;bE~>+%RD!>v%IFV#WTd^7(B=#T|Xno7mV6xS4f=u6692 zQq~7{i;;}Y46D{(Y+R?~SpnS3W=+e#JKDJX-SSUi>9(#}mwE5Tv-r0dn5ZY||9_k1 zWM~Q&Gt=O&6oAqZ3T;9&9$g)JWBOFs0NWF6vYJZJ24_?zn}`jXIHjr$^?F69z!2p< zy%t?XyTRP;!zMXPY^&6kR$$J?UW%?3bCC4XDqr@?ukqAzCEf6lUi%~QE1bZLYf8h# zNIFjy{z&gk+iBasaZQZklPN%Bhl~H-pewWJX`t_4w;I)?=gcrEWq1%u$-pwhg=Fn& zj3nJfbY`j%G4F^8@$CZRg?Lweh*w;b>{2YdOIAi*x9?W^yUNovn|q?NJ#6TPeU_fVowC-#v9#b~gYH6zAw5m28>MUeJ4Tj* znIVgljj#XhW$ zhiz?z_2X4xbgPrk6@%1I-IDPigjXj6D_rk=N!MHKhrgxgN|sX9wAG{r8mKBc5uYx! zD6;oWKPFPVaeKY+;_tfGk8dnA3*mxhD6c6ylsqfXvWFU-T3PF_*(Y_!aR4ycp@UiK zL{0B(1-*H{F=ezF{RJj(g)4PzJx50@A1Bg2>XU|TM&*KjHze0G!vbN}?9#L0`)Mh& zSDg1vm!sTu701b=n&--{Q{n2DpuDb{%No!D^gwg^bAW&J!~L20v4&-T0QrdY*80B?ozklkW% z0rk7=VB9&#oB_RdT&RhUD^ z<%mehua9i+?=)hn7$VmdJdx(xObB8b; zd)9+r z`yz+r{dSM5hDz=4ys1#(+WoWqC+KtBRNG8x2R zkNK+s#C-E*)s>kZCpyIRfB`}hQ6FwUXyKlgYs)!v{kjY>{yEe5^Qr5JEe^d*zcU@; zK#oE%1w&_PZ%A@P#G}S>`1qbU0tkHPO<2-5_Uhe0Y6$FovD9c;Ov~qVD?l$$zpcmn z8BGk}4~3UeEkzOUc<9FqtY1TqoY%qGS&?kSM=O3g}NY85}H(VQS~6J6eJsX=%$ zf%etV-q-i9X(#Qm$6xDNs6>@0-*1b4*6TC?1v|R@FkpbQLy%N<#0-I&1swvEMn?Y( zQKWmqz2#a=uq>R|^cdhnkaB3z*DB@@Q=Jpj%9EBXLuo{WDl~W0E}qH^aARnpD#`Dn zAO=+iepMRRSE1j%9nTDc{=3ACQK(De^37Zvsl54F9`aO8G+M-hmV$3r9l|3HavVov z=cO%-IOVsvo}L%}Jm> zX9gR60KV3P&h$KA;XH%c12K@uFzJy5i9S6?U7BKXLk4&WhD>E$HbfP_Ojp5OF9rfm zT$`)n#dWaGB<22Cl)AZ@Gv7i0;!*>IUJv7##H1X4+Wx!Jki<;jka&jGH6W2$nzJ4> z6yD|%yOMzcBZj~}DSWA5Qj5Q$P>edSrrCzs=X;k&irN=Q9KBAfO4RZ>klxjm*H%`2m5c(y7Pw zcP@DyYA!WftG!MB6T>V!I>_ym+&LEFyikRHI`-j@U5hGl(;JWZbO|orN^1|6{D4+0 z>5k@1pQ`!&UM0WB;(#4ds`}Zu6)B_YebI)X)jZRhJn}_frc0jF4SFi~JHS=t;knPP z&yEu(+8%qK>YIlcGahTfF6Ze^7edgT$J`6#2qm|n26OTFDY|d8s~3hl zpLtuXp@mq2GW8<6|E)D{#yU2)#iuPY!=|5Hmo-<*yo(QYr$3HQqx#%vtHjS|I7NiRxC6lDQq< zTXIalFx_Ncd(TZ(!iRaFymyh~tc4h-VJo_vaMKP(y_b-@V9j{@6aA&=*?g2r3#HBa z-Q(IP$--;P*a%%PO{^%D$`G{5nl&>sUgEN|s^PG}Jh>ISvD%;O|psp}p`-pKAK?pbIHTV?a9?u}(q*GCDRrVm> z0lC9`wd;C96R!Yg%?DnK2`W*_@jf%9IPnwdr@BgGxWS)z)J>cDasy)mt3Y7)p=txP zM)#~H^+!85n&7b%$l{U`iUrdD?1+BT#+yClM)OQek##8!6GFE0paMGl~ znJT5wR_VzqeBv^?U47rJ0!hXwG=8QSN^}EyUNDp2J?(D#FGFgCo^@;lRCMe2zczB^ zM%9XHn3ccHp;wqZ^Uy8mD<>D6R1W$5gqQ>%@AfWuiX0~?SIt2=9&6BS)f-v(V+-C6 zBfbm+ypV$sk2v=A1#JUeO~Sbved*o%-1Huvn%MCF?%m%fP5;xCPP|-(b1@laO;e4- zd6?k_0KN;j`6NXEVgi#X0MXBw38O@O`lZ=y4(f@Vx@QT9*Vpgk{{$@lzYwyh%?NrN zGtU^kn)F6?fKBPA{djTaw^L#(7F&HK0b>+C#os)3 zXBq#MC^QE6lzK^4733pD>UE36G;-{`GpU&0a|`(V-vTwp@G~>2EL6F$*&3YMPp-<3 z$pGu8`_-xR9b-}m{9;+irLXejrTbK_!ep%zGnh;U{^iGo^_=F2)RW>Gnr99OXB*dm zfO+ugGg0L-0>cKR_lG&~a#|_x2{kD1`&ncdCyi6M^Lm931EU`O+-XCCFYRAnjs5f6 zUa^V+z|fk5UB$rN`lRE$u7^I~$Cjw-;Cp6f)HA(2LU;};f)pd4T8-D?I2up+3G(m$&;vg0~+JOD};L`gqqk*eJg+xpbq{T}SE4${0xj>in~=ldQi1rE&?>CiYw2 z#vg0Xtv2hPZfP@t{cR}nkn`imMzN%Ni-Y?Fuhn*~A(k1`mx6vQI)vLRy&;WKU0n}B z@ZJ|)Fn=>TPu!<>B>2~#eYSLuW5D_)A)V?!{Y4XguE!i#eiyl1d{uE|RTBFea zM(g%RB^85qT#!n$qYwxcyR1CEXmt{nlJiLD0Zs8{OI%+d`MxVXSwT?e&2t6`t3 za4o!LrCv}!1now|E(qC6Hf>E@-0qF^3NbW7_qjxU<9CDT$8j)VXDt{8H;2Pzmw@Nb zJ}1NB7;d^GlLw5^EU`sTe0n9Pg~GmQIXwnxEAeh@zS%X#f?&FG!fvUXW1I^%m4Huq zFb9-|D>sEz%pg}Dy}4S#5$%jBg@1FfhQKlNSk?MlP{oDv8s=i*#C%7KTfKRpT((!vAA*0?h5%4doY~|3yq_DA32&6T2RHbNq-AItD)b&W z5)Ng>T|a!hlRxqb6(lwy3n#TR>Q{5$zoTQ(7Yp23btrx0L6lb;lMIld_ZsBm;X65W zhL~-DK~O*?iR1lG`e>ZDti=^0@Hu{22rk-ri$|Mhlfjx 
zz}x1wtNp{S65T4sftJev1F_{RMAe{B#a1+VB3lE#HN&bH7Rc8 z9d*c27p;2oA4ZYZSk)abazBuwEu8=L?5J?TG~{R3V8o868I?F z#Lt>o_|ohZd7psYl9Vtz6-np(@R&^Q6yKF@# zKK_Phwv=G^eE6%t(B0N4(**az{Z$|8Nab8SLz)m@0bPk@Wo;!3I&BJu}Fl z{}e^!Iy||DQ~DlD9=@%{OB>I8fpV4ZTC})4v8^-k&+wR4`hMI|wtCe3@xtk*M_gV& zT7}a{1ERd3c8RiWPPBvInQ4k+GPxSExF}CJt9v>(EoD>AsA|3ioYaprn4PVQ}7|zFbK2=iyU{SL8K#I2+N-*;IUC zGNwTD;XDPHkYcjzxc(jT?|J#?A9c3l*&Jc_`dkI4Rs7QC{PM6ty6TzkxCMvgm=@WZ zf59SoAflkydVV7?TYoT5`U(N`-HxGa2z_V)YRIz`HRRE3`12J1-lEtmojvMCPtH+1 z)V=IiqG9TR@`K%FOk2#6!1{1OD;*%xRAYo%)EDc|<)I;%EXi}?^()_B6K`pYE*`4Sg)tmZ&*^v8jAGJgK-rh(nO znii&AGyPojK+Ee9+EI?hH-rm&m>=`lAO7{E>D1JKm7n{&r&z%Cwi})WQZ*k0bJ6u=B0Pn1}ek~+ch_lXwn zuc_uu@YRZb$iGWq5BG|g|^Wd_oh(t2hEHAQ>~0CE_L3eNN1(NZ={TZ z*Q&K4gY{whUfZO+x8Pi73^^HTU(N+4u|z~}-7IGjQufEje1K4zazaTk96zyU#Oomt z{bZ_BZ#I(ren>G~3QNkj-ElHS()&+TCR+bjq4vO-*_o`jyU7mwVd?J!edfIxKubK~ znqmum7Gd^m1|fh?4|kW$?Yo6*!cTvq_fNlm%+Olmz3Wf^I(4mQ zO~z#3)9fPojD(VbPK-c6xq)}DM$borMa#X!P?x0&SBqzQG-BST1On6bd~bfeDWpmL zg;dMkgsT6muQ^9L>bR6T?+9!G07EA3XvMR&Q}8^MSfgNeA zEzFXFyts}my(yK#E3|dx>wH+PW-82HFn_p_ z{;sH%Izw2f?je+3ZGMKbJJ%-MUk6I$Q3lW`X#vZ{OC+X9zuDb|vQX4W2a2z2W*Oj)w$<7+lPbGYqEE4!Y z5j4*J(;o`UAc^wryi7M1qZAX{UySopT5y$cT@|8wdo0j-F+*z55(QN4-0X9E2(%0w z->Pj3_BQrPW?JjaUyorsqkqgQ;wow+pkug_qLB3byas`FE+^x`c+_Iv!A2o)GczmY zAV6d5;m~?7FDJ}pHp;5ORZwuDRq(s2BNghbg+aq0nsM$z_3LiUp~h}O&p9WQTkF%8 zM=j%0_<0RSBT*koU?wS=bWkoexJwQclztyKASoPa^=_gN4ebgz`-%PQ4pC%-=4Vq0 zfe#O}LUsDlrtPI4qXRa|3{g~nzfS$+u@EI(83`y$`zM*F4ZrP)V>J3FyYXx}ZGKDg zcnAHvt{Rs*n3G9nWAYgvN_?47{`Qg%8)$u7L&yUCg=`X~0xo?Nm zOT?BaawiXVZT^N9@PB8m9mlRme!pMhW#CUp&O)q1Ff49V5&%z22#hJ2F`M#8APaP0 z$_Rp4aJOUiQWa7(@mp|%WL)nG$d&Zv_rF<$bdOHX?n0#JYw}R-L?73ZR{Dh~d)_hC zut16KfP{BGRQ-I6p%4Q2bsb~&j&!tu<3}y`>iw3ht$>i661@OYn_Xr&XV#5d@S|oP zA@W{))lxW_UJQXd+s5{jYwPj)u*;o$QivH&LtwNF#bMPtindqcy_Sg_0jNOW`lS26z`VMFkJaH+Sv!=ug__rdCdmKpW)`?T6Ob{o>w!vsy+D z-B>}mgAw_|pUbN&6M&;nPF~<=LStpG+Z5n5r71uf?m?gQ-F4dx9x_V$5%CbECK$Gw zzJ2<^i95T446#0C`xOGneN913e!;7o!R%C)^uMCe0=Tn<*P?H{k7Z&~3QPz=NJW=T zj3CEU61-h1U6W|>zbw|;d_CCnt>k5|J0cEO>N_La+8&pSKU3E{M-On-Vw%ehQ{LlX zxIB8%LF!fTxKT!H6<|d62Qh9ehYjV*#xl%&Z~JpAI7ZChyU6I`b9k!^*geM*&r!)0 z`P_*C_$(P{7dfN3zXX2lZVtYo4StL|JW2|=e>3xO1G$K#=;n=dYTEcI0n01mkFdT* zZlxjCcP7Y5aQ>oPVpawo8YKRl#hc>oIaxO{*fKmVk?3H*sQ8bIy$$PNS zm^QUJj;!T<|8X&Tmhjigq?%e(ppMY%uLMndna;mU(!hA{kXVc%0H6AUgIMB;Y2q3as&sY398#kE0 zW83CIlm!|%OO&SzQ41d zS$iN9BrRi!79O=xyI?ngbQV~+RpO` zgt2WYwEdm=V<3qZ)gKkzTAP9Zf$LsE<)l0?cLpV{+UkiYYIQGnS~Bad;H{xUx0IA93P!Z$Ub zRs}&&XlPF1+UESgi+B-d`JNY2Bfq~xE9@Kpnx?;#;mg;m75vQ*?*d4Tztw|nTLS^Y zH-`iqEf>b-r);F3Q~_D`cZH$BGWu)siXg~pRDs3)1|az7kgqJm2#$NR_{p2Y23-4BY)ULyBEa^$KdzDc9uq0^ACB~H-gaD=Y4z@9VVD}V$kHmZY*Zd--RR|Y0w6WlPWsSq`9?!a)pOu312EGz zk4m+W%p>D^0mr(5WfHSjGm4$@-XbLhSU&;M=<@H`iuaG1?)qq49eVAA5|f{k5V){} z8uBYG8s*=a?&=i4q?=aPx<^%phdi8kO`X$JJFg~83BLUMcYF-+MJbGo^^{rW9Z@->vG69q4q3;`%j1PYG2lz1;eHLUAMDldZP&8yIZ=zAT!_W^5Gh_b#n%EiU zZ%Fin+oCFPL;K`A8?8xGtUp%fnKU^o)jCC>R2*P%Cfi#_LmHjMEJxhmc}|a?*)R;# zbyHfgLFFpb00`ZaHUnRQmT#aiiK}x0gu+pd23%n_RUjE4QhiC3{(j_k)DA`~jo|p# z#u5J(u73}=8;tpFvdM1RcA}^T|4=?G_T`x+6LdEhUm=K9erRBQI z%4?gf+wXzRB%6mX!*t}t3Kv1nsQ~!hZbTr0bFyUkaDfV!snDh2##9g(Hhul2EW747 zgi;TxQ%{3b>Mc4N=|y#vIG(4HW=>NnpTpmFun$Rj02m`#o`ex0ONfET z4F{r7@emkC;R~!#dbkG?-M#lhIS+y-buu?tP{T}iowTIQI|Q3D*0|PFM=K&Z8(ngl zIFhy237n_38l?NRLR4+dQiB2V$&rEkfgtk?a6l=H7ExIM41_<)P%KaggZNGFqMZAL zMY&tS8=|yPYSZZFA&!dSI@Tu^@(_*Fml5a%4cZC)7jK+63+eEuZ3PCX_~(AjQOo`= zNPnlQ)GVKn42^BzfT?X|&6O%hoWj^?UbjQVlhMl_0`x{xa=q49T>Mx-$^2R5#O^pn z>2!Sz?&CdJ65j%GFWASd4pIV3tzxpdURHySx^q=6dVRBZ3a7`JP?PSBjkcQPh@?pe)x&( zA66UTKY_1wx3-Ur8yZU 
zi(!nn?u&oDM9#cLFP7RGZ@liCG@JKro%!fz2GqHc@fk04klM@5*ths6nRZJ%lI|p) ztyuO1VIcggf?H~xX6i7k&p4~V9`G>zjntUEflyoQ^SD~$lBIr*#v)di`!hHHzZ~Wd zJ-QNEBRBq)fz4l2#_xXm8YV8KB%v!-2Is(P`1=|D+zIhS-F?ZUgd{4ZvFP};cKr74 zvi0T|HHv$hL!f3guj8b`g!f?>1v>B0gS~UEbJ?|HOB?fc^jFhtGDY1pfHBHP3X70`g0Pl;1%{(WPrw) zLA={hi)#y_&B|CHDe{&@tUa4*`Gx7EV=fZARJ1+2VgS0L3UZC@{Wc`R>bF^Y|J_=) z6@zu_xnjZE0yN`sSuL5S5%*$tR?_Sn;IN zk+q_-5?}{FkQtG0br0boxa+}qf_r@ocNJU^!H6bY#l--XDfxMU;d>>l#G-kxw=U|n z4oX{wIsAKre7G+PF-;OsE5di0T5MG_-(T zhUl%sTLJ_I(vT32H{#nS1y2{d~Bk*>z;1fMDT#15#7$-u6_Yo!o9QuS!|5#-{ zC0)T!;?6@2clqJa$)sMARqIYV;r+ zk0)L=B>56L%h)=EE^|VE0=oK*K#|t8- zuPFs$^fLQzLGuZ2ZmXe@id)*N@}ZDUnL1)Z8A52hime?+&Bx7u|5)K3ImXEMUQge< zM`(Zo{DDFnt^k6F1jF&@18xC^>12aHE)&2k zs@Nwb?4XI^>w*cbU-d#dTM%R#VlaWL2MW8>deH&l@xZNi1uJB>M`h5y{I|JcKhaAgcz;0;FDw2<~EhliI5igwCTS&^FLFZSoB$eD>H zD10LcRu|WoR}}rm2%pHJGsgh+eOu9q0~qG^b(v)v%8_%bfYg<>q0IYcTAhF-kNC49 zGRJPK;g!YDNi0#B-0xu-ox&gG{wQ(DTXtXWgzKH6KjnvR?85x$A$ZN+G0#8>XkFb9 z9zWb_5-`)TxAZ%jIz@ik!2)usZWY?tyjjOd<;04s^5^fjU8zy`7I$70NYN82zW6h| z$X=NbEUMsfM*!<{`)e40n^{H-)`KJX!(mZdv-cC!9L+JvSVnSO(VKcNP;t?UGtk!b zSPgVYsnD9ejE;FGyPg{6YW6R5Q$rGiy%J(H)2LXP4eT;Slga?wulT3;iy&;Ia=@Rj z!U(jtPyK}8ZWprMhYw6rMgQS66{Y=o_anEEOn1Vj*{8icX-1vaY{+vNoJDFj0{pO( zMG_NH%h3QMU|oF!Z9ocohL5ayn*Z36RiYk>2PU&{vAU1j? zkRdJ8tizF;3llfJ+zh|bK4_O(7pI-9w^Y4gTB0F9sU?J)5ad=AE{p>o;579Jw#@~5OWbag~+3Mnyph?f@wbwu8 z=fB{(_w#nycZtQsdzOuJ=!+1W3GvhPtLJ9m8OpCA&1MCEcLm9=MUSexJUgvMnqDuz zd3!`HT>912mxR#8IDT6FH+LT`QmrCDq@~pdJ?clm$SLSgUD~0uNXRqN&U+KZqw7Df zzDBzgap!mUAGRk7ciu7Jh?&{>=jdQn1ag0rfaz2*?e8k)dfhWih%4+tNn18&)E9RC<4z zeXoG((fW36d;|?kq_y=zW+bjMr=HBC9G6~Oz67sXY9iWf{^(T=lY^M^#K>_LyRTd# zP2auGUqc^`u^ubR5w4Vs@kxf)dChil)2=KRi>a|4o@pNTPdUTmaKG~`#_vwS6!#k6 z{+4VvCc;c#xdy8hCDR;Cl~`TpA&O_}1i*3^LT54QK|MZcr> z_WFbw0$>}L+Ody2Uo6A7WL7!Jjsi|{&4b%5B5BgX4~e|uY}|YIqYsLi98Q<{`IYRM zg6GJnsy+;=)vhXW#}ZcT6Xz)uFQxpe`U{DB-KsDH#Ubr*#odC)p9`{S*v9t${JC%W zNwRP4qvDI=x+u!)g-*90R-vYQbpgwWYEHiCSSi3znGDt6hfK_&?&t8e#l%}MMpBFl zxE>$Q97^qR@(KeM*(xar8JyGv7=1lKpu)}4U@!(Ggn@EP+h#cPr~OUH-`QqXhlhNd zjl-d^u9-i0$Gp!aVs!#8LeIRnr-PZYrSHxBwm7LpU-rGj%`%3{jJ$YGlC;!ih7QtL z?Zt!uX4Po`%PTiH$H>#58o08=3zvG`f%ntyD#+pAjuhI>e65GIil-1!j zY|&2)#*BgVwZTom3H=~rSH4u71~5Evh9-a_APuJ-&g8=GsZ%XZ`qc>;Jya=i6~{(4 zze`0_$3fz?k)M$&6Q&2k9O@)|ms0J}WX+PQI!AD_7a~rK?MmT=*{6>HgTC8@7F?wW zQvP*i_&d*0XyEkG>uvdgHGS``HxH~dcZ(_r(SdxGqHQ%PTNR$W9pbwF`p%+Ykchrg zd;ZKP$e_{BKpcRu)<0Yc9BtI9zz>QDE10>pjI*RY^gW>ul4rjnPF^nE9*z_fjWPsx z;rz(NO!21+*w8E;HQ$iEs5?KQdY&WrS6@)|)f2@QGGUNb`pZ9QAe|~5VNk^MzNK=| z;9mAK2uc9Z4dpSjUqcHr9b7A0l!Z0R|#ihlchp@I~KLoS?6Doh)_ zu=K%3UGOn9lpxZdn;Jp5l_rCG^PfI$I}&ztJSpaMC0Dy0lkx;${plYda`3~ne*P2} z9ns|~NVrt6b{V?dJkGZr?$|N@3Us`o=$|_;^#S3=1iixlG*FRl!;~WTtHWQYrv4vi zfe1%Iyo&Usa1;vcWijV9f7lG3%s-7n>1JhqP#>q+%Q)cm8&5xe%t7J#7D4;Pq!ZrW z*g^ioamw?yQzmW9rs}H{8t5HMq^f8a;yr5&UFlvWAEjU8sr=MHK{6`(@8X=pB5QW2 z)rThuRkfKID&7*$00)V;uz|kjA&u<%qJ(-ftQI~Y0{FUqmAQ!dX>BIlbU4uR1a+&@ zkmj#sFi6@RVdl;od8!Nb$k?GwV+%UZN9AD$I^SFxGhyZiYBo6^FlHMmi!Ic%74vOR zTbAhK$tdDL$9G>b!@nzjgEd46*Yv8FuSvFht22=+*rv|+4$3b zZ!3S9Pw}ln%eG1#?EZ^BG{yxDUxw|9&~c^5s(?Zdx-((jv z13BIiNg7v<)1Ffv6D%?fSr_TBhX^49!*M=iw(6`RQc?jsR0}$}pNjkz<6%^oMiYn`-l$ug_5e zS1DRhObQInw-Hk}ce)nOJZ9INf!2B`WzZ4KR@X3E!~FpiZ)K(=-8Jv@E0_O7vHoC^ z*mjWnD^9@x&n<51a}BtoDA5<;<}xSCC+OaWNZ$ME3m&cIdTfwC4Zm$M?e4xF(O$|$ zrSzuPFiN2WDjj&+{!K)`jnAnWe@$`zFB!7C_VUHc>G-^C$sIK&2Yo??dG8%0cY(-P z1rmXM{)O0gYP&rAn2vYb`0|l9nE3ECc_<5>4C^-IkP5A?DipVEh9TOz&DpiYx%6@C z#Dno^dc`iX8XU-yP(<05{clKW%B~$F$=^>896~*gwp&*&IxfA9fhpjF$7_{qs|GRM zLX+R8N{JxU6-9q%_r?JeOsI^WN_t7?pj&xEkHMow{;zu80jt}tvI 
zFD>(I?F<}NeZm5#`PrYw0M)P3Kz3*VPJFh2r$Th$n@AOsr`1dhA9WkD|k=MnY0PQDYtoFoJo3AVzoQ(6}uJ5 zwBXm2)hE`7bwu6b&XTa}cPj9p2ZnQpcF_$!1-P{a=mYqW?0lIKJ;w@^$6in|X0*YF`$DQZHSS134zF#>yPW_`4AM znjWs@7CMvwH&w=voOp3Nmp*fLCy%HIhrP5`8tIG_zpnAcnl=|XlAwc5huL$3P(55h z>c_yBe?U^0$VIy65!`OulJGuDnbnWNi(Y(X%(q+=wc|?Q2Wu_JnDJ&$*`0Aw!ZUIi zLNC5ADY4@dQNnc>jc?!5JbOc?nNQyEX>`M5$mfqT$&v=S?+6QQU0tZYtev?)e4p?- zY{z1l6g8L;7w5*j(|auG#MUb~C2FLD6F18@z+LutDU_~ID;*L^^u`B!#;k#f{-zo9?Ko4_oPY}^K;S}Z+?xf&NYM^|v z*pkvo9N^|^q7*<0z0x+Hj+W+}ccPQ$H(-$H-?fpVpC<>uExt9k+(1qEU9M}vo%HvX0RkxaW5 z=KK>pm4^BzfJRm1U%B1g>RZ@jDfLn$`jQ>x1y$v|mymsRDCL?c!YkXHKGa-HgE^c< z&YfRD-oQYl9&jEJOV>1l30cc7hM{sP6OEbF4?M=-nqywL<U9Y?sIr@s$(G5wcSm@dzPD$+RR=zaQD*X%5`4WL^3uN+b)z#*3hP*#P%bC@!UE zZ>`)nYW}1sbTh`W{0WJAY;H1vzX&xGt4PFK9HgIS)leN-3# diff --git a/lite/demo/java/android/PaddlePredictor/app/src/main/res/values/colors.xml b/lite/demo/java/android/PaddlePredictor/app/src/main/res/values/colors.xml deleted file mode 100644 index 69b22338c6..0000000000 --- a/lite/demo/java/android/PaddlePredictor/app/src/main/res/values/colors.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - #008577 - #00574B - #D81B60 - diff --git a/lite/demo/java/android/PaddlePredictor/app/src/main/res/values/strings.xml b/lite/demo/java/android/PaddlePredictor/app/src/main/res/values/strings.xml deleted file mode 100644 index 168adfb0a0..0000000000 --- a/lite/demo/java/android/PaddlePredictor/app/src/main/res/values/strings.xml +++ /dev/null @@ -1,3 +0,0 @@ - - PaddlePredictor - diff --git a/lite/demo/java/android/PaddlePredictor/app/src/main/res/values/styles.xml b/lite/demo/java/android/PaddlePredictor/app/src/main/res/values/styles.xml deleted file mode 100644 index 5885930df6..0000000000 --- a/lite/demo/java/android/PaddlePredictor/app/src/main/res/values/styles.xml +++ /dev/null @@ -1,11 +0,0 @@ - - - - - - diff --git a/lite/demo/java/android/PaddlePredictor/app/src/test/java/com/baidu/paddle/lite/ExampleUnitTest.java b/lite/demo/java/android/PaddlePredictor/app/src/test/java/com/baidu/paddle/lite/ExampleUnitTest.java deleted file mode 100644 index 99dc6d27b3..0000000000 --- a/lite/demo/java/android/PaddlePredictor/app/src/test/java/com/baidu/paddle/lite/ExampleUnitTest.java +++ /dev/null @@ -1,17 +0,0 @@ -package com.baidu.paddle.lite; - -import org.junit.Test; - -import static org.junit.Assert.*; - -/** - * Example local unit test, which will execute on the development machine (host). - * - * @see Testing documentation - */ -public class ExampleUnitTest { - @Test - public void addition_isCorrect() { - assertEquals(4, 2 + 2); - } -} \ No newline at end of file diff --git a/lite/demo/java/android/PaddlePredictor/build.gradle b/lite/demo/java/android/PaddlePredictor/build.gradle deleted file mode 100644 index 02199bb823..0000000000 --- a/lite/demo/java/android/PaddlePredictor/build.gradle +++ /dev/null @@ -1,27 +0,0 @@ -// Top-level build file where you can add configuration options common to all sub-projects/modules. 
-
-buildscript {
-    repositories {
-        google()
-        jcenter()
-
-    }
-    dependencies {
-        classpath 'com.android.tools.build:gradle:3.4.1'
-
-        // NOTE: Do not place your application dependencies here; they belong
-        // in the individual module build.gradle files
-    }
-}
-
-allprojects {
-    repositories {
-        google()
-        jcenter()
-
-    }
-}
-
-task clean(type: Delete) {
-    delete rootProject.buildDir
-}
diff --git a/lite/demo/java/android/PaddlePredictor/gradle.properties b/lite/demo/java/android/PaddlePredictor/gradle.properties
deleted file mode 100644
index 743d692ce1..0000000000
--- a/lite/demo/java/android/PaddlePredictor/gradle.properties
+++ /dev/null
@@ -1,13 +0,0 @@
-# Project-wide Gradle settings.
-# IDE (e.g. Android Studio) users:
-# Gradle settings configured through the IDE *will override*
-# any settings specified in this file.
-# For more details on how to configure your build environment visit
-# http://www.gradle.org/docs/current/userguide/build_environment.html
-# Specifies the JVM arguments used for the daemon process.
-# The setting is particularly useful for tweaking memory settings.
-org.gradle.jvmargs=-Xmx1536m
-# When configured, Gradle will run in incubating parallel mode.
-# This option should only be used with decoupled projects. More details, visit
-# http://www.gradle.org/docs/current/userguide/multi_project_builds.html#sec:decoupled_projects
-# org.gradle.parallel=true
diff --git a/lite/demo/java/android/PaddlePredictor/gradle/wrapper/gradle-wrapper.jar b/lite/demo/java/android/PaddlePredictor/gradle/wrapper/gradle-wrapper.jar
deleted file mode 100644
index f6b961fd5a86aa5fbfe90f707c3138408be7c718..0000000000000000000000000000000000000000
Binary files a/lite/demo/java/android/PaddlePredictor/gradle/wrapper/gradle-wrapper.jar and /dev/null differ
zqjFRDaQz!J-YGitV4}$*$hg`vv%N)@#UdzHFI2E<&_@0Uw@h_ZHf}7)G;_NUD3@18 zH5;EtugNT0*RXVK*by>WS>jaDDfe!A61Da=VpIK?mcp^W?!1S2oah^wowRnrYjl~`lgP-mv$?yb6{{S55CCu{R z$9;`dyf0Y>uM1=XSl_$01Lc1Iy68IosWN8Q9Op=~I(F<0+_kKfgC*JggjxNgK6 z-3gQm6;sm?J&;bYe&(dx4BEjvq}b`OT^RqF$J4enP1YkeBK#>l1@-K`ajbn05`0J?0daOtnzh@l3^=BkedW1EahZlRp;`j*CaT;-21&f2wU z+Nh-gc4I36Cw+;3UAc<%ySb`#+c@5y ze~en&bYV|kn?Cn|@fqmGxgfz}U!98$=drjAkMi`43I4R%&H0GKEgx-=7PF}y`+j>r zg&JF`jomnu2G{%QV~Gf_-1gx<3Ky=Md9Q3VnK=;;u0lyTBCuf^aUi?+1+`4lLE6ZK zT#(Bf`5rmr(tgTbIt?yA@y`(Ar=f>-aZ}T~>G32EM%XyFvhn&@PWCm#-<&ApLDCXT zD#(9m|V(OOo7PmE@`vD4$S5;+9IQm19dd zvMEU`)E1_F+0o0-z>YCWqg0u8ciIknU#{q02{~YX)gc_u;8;i233D66pf(IkTDxeN zL=4z2)?S$TV9=ORVr&AkZMl<4tTh(v;Ix1{`pPVqI3n2ci&4Dg+W|N8TBUfZ*WeLF zqCH_1Q0W&f9T$lx3CFJ$o@Lz$99 zW!G&@zFHxTaP!o#z^~xgF|(vrHz8R_r9eo;TX9}2ZyjslrtH=%6O)?1?cL&BT(Amp zTGFU1%%#xl&6sH-UIJk_PGk_McFn7=%yd6tAjm|lnmr8bE2le3I~L{0(ffo}TQjyo zHZZI{-}{E4ohYTlZaS$blB!h$Jq^Rf#(ch}@S+Ww&$b);8+>g84IJcLU%B-W?+IY& zslcZIR>+U4v3O9RFEW;8NpCM0w1ROG84=WpKxQ^R`{=0MZCubg3st z48AyJNEvyxn-jCPTlTwp4EKvyEwD3e%kpdY?^BH0!3n6Eb57_L%J1=a*3>|k68A}v zaW`*4YitylfD}ua8V)vb79)N_Ixw_mpp}yJGbNu+5YYOP9K-7nf*jA1#<^rb4#AcS zKg%zCI)7cotx}L&J8Bqo8O1b0q;B1J#B5N5Z$Zq=wX~nQFgUfAE{@u0+EnmK{1hg> zC{vMfFLD;L8b4L+B51&LCm|scVLPe6h02rws@kGv@R+#IqE8>Xn8i|vRq_Z`V;x6F zNeot$1Zsu`lLS92QlLWF54za6vOEKGYQMdX($0JN*cjG7HP&qZ#3+bEN$8O_PfeAb z0R5;=zXac2IZ?fxu59?Nka;1lKm|;0)6|#RxkD05P5qz;*AL@ig!+f=lW5^Jbag%2 z%9@iM0ph$WFlxS!`p31t92z~TB}P-*CS+1Oo_g;7`6k(Jyj8m8U|Q3Sh7o-Icp4kV zK}%qri5>?%IPfamXIZ8pXbm-#{ytiam<{a5A+3dVP^xz!Pvirsq7Btv?*d7eYgx7q zWFxrzb3-%^lDgMc=Vl7^={=VDEKabTG?VWqOngE`Kt7hs236QKidsoeeUQ_^FzsXjprCDd@pW25rNx#6x&L6ZEpoX9Ffzv@olnH3rGOSW( zG-D|cV0Q~qJ>-L}NIyT?T-+x+wU%;+_GY{>t(l9dI%Ximm+Kmwhee;FK$%{dnF;C% zFjM2&$W68Sz#d*wtfX?*WIOXwT;P6NUw}IHdk|)fw*YnGa0rHx#paG!m=Y6GkS4VX zX`T$4eW9k1W!=q8!(#8A9h67fw))k_G)Q9~Q1e3f`aV@kbcSv7!priDUN}gX(iXTy zr$|kU0Vn%*ylmyDCO&G0Z3g>%JeEPFAW!5*H2Ydl>39w3W+gEUjL&vrRs(xGP{(ze zy7EMWF14@Qh>X>st8_029||TP0>7SG9on_xxeR2Iam3G~Em$}aGsNt$iES9zFa<3W zxtOF*!G@=PhfHO!=9pVPXMUVi30WmkPoy$02w}&6A7mF)G6-`~EVq5CwD2`9Zu`kd)52``#V zNSb`9dG~8(dooi1*-aSMf!fun7Sc`-C$-E(3BoSC$2kKrVcI!&yC*+ff2+C-@!AT_ zsvlAIV+%bRDfd{R*TMF><1&_a%@yZ0G0lg2K;F>7b+7A6pv3-S7qWIgx+Z?dt8}|S z>Qbb6x(+^aoV7FQ!Ph8|RUA6vXWQH*1$GJC+wXLXizNIc9p2yLzw9 z0=MdQ!{NnOwIICJc8!+Jp!zG}**r#E!<}&Te&}|B4q;U57$+pQI^}{qj669zMMe_I z&z0uUCqG%YwtUc8HVN7?0GHpu=bL7&{C>hcd5d(iFV{I5c~jpX&!(a{yS*4MEoYXh z*X4|Y@RVfn;piRm-C%b@{0R;aXrjBtvx^HO;6(>i*RnoG0Rtcd25BT6edxTNOgUAOjn zJ2)l{ipj8IP$KID2}*#F=M%^n&=bA0tY98@+2I+7~A&T-tw%W#3GV>GTmkHaqftl)#+E zMU*P(Rjo>8%P@_@#UNq(_L{}j(&-@1iY0TRizhiATJrnvwSH0v>lYfCI2ex^><3$q znzZgpW0JlQx?JB#0^^s-Js1}}wKh6f>(e%NrMwS`Q(FhazkZb|uyB@d%_9)_xb$6T zS*#-Bn)9gmobhAtvBmL+9H-+0_0US?g6^TOvE8f3v=z3o%NcPjOaf{5EMRnn(_z8- z$|m0D$FTU zDy;21v-#0i)9%_bZ7eo6B9@Q@&XprR&oKl4m>zIj-fiRy4Dqy@VVVs?rscG| zmzaDQ%>AQTi<^vYCmv#KOTd@l7#2VIpsj?nm_WfRZzJako`^uU%Nt3e;cU*y*|$7W zLm%fX#i_*HoUXu!NI$ey>BA<5HQB=|nRAwK!$L#n-Qz;~`zACig0PhAq#^5QS<8L2 zS3A+8%vbVMa7LOtTEM?55apt(DcWh#L}R^P2AY*c8B}Cx=6OFAdMPj1f>k3#^#+Hk z6uW1WJW&RlBRh*1DLb7mJ+KO>!t^t8hX1#_Wk`gjDio9)9IGbyCAGI4DJ~orK+YRv znjxRMtshZQHc$#Y-<-JOV6g^Cr@odj&Xw5B(FmI)*qJ9NHmIz_r{t)TxyB`L-%q5l ztzHgD;S6cw?7Atg*6E1!c6*gPRCb%t7D%z<(xm+K{%EJNiI2N0l8ud0Ch@_av_RW? zIr!nO4dL5466WslE6MsfMss7<)-S!e)2@r2o=7_W)OO`~CwklRWzHTfpB)_HYwgz=BzLhgZ9S<{nLBOwOIgJU=94uj6r!m>Xyn9>&xP+=5!zG_*yEoRgM0`aYts z^)&8(>z5C-QQ*o_s(8E4*?AX#S^0)aqB)OTyX>4BMy8h(cHjA8ji1PRlox@jB*1n? 
zDIfyDjzeg91Ao(;Q;KE@zei$}>EnrF6I}q&Xd=~&$WdDsyH0H7fJX|E+O~%LS*7^Q zYzZ4`pBdY{b7u72gZm6^5~O-57HwzwAz{)NvVaowo`X02tL3PpgLjwA`^i9F^vSpN zAqH3mRjG8VeJNHZ(1{%!XqC+)Z%D}58Qel{_weSEHoygT9pN@i zi=G;!Vj6XQk2tuJC>lza%ywz|`f7TIz*EN2Gdt!s199Dr4Tfd_%~fu8gXo~|ogt5Q zlEy_CXEe^BgsYM^o@L?s33WM14}7^T(kqohOX_iN@U?u;$l|rAvn{rwy>!yfZw13U zB@X9)qt&4;(C6dP?yRsoTMI!j-f1KC!<%~i1}u7yLXYn)(#a;Z6~r>hp~kfP));mi zcG%kdaB9H)z9M=H!f>kM->fTjRVOELNwh1amgKQT=I8J66kI)u_?0@$$~5f`u%;zl zC?pkr^p2Fe=J~WK%4ItSzKA+QHqJ@~m|Cduv=Q&-P8I5rQ-#G@bYH}YJr zUS(~(w|vKyU(T(*py}jTUp%I%{2!W!K(i$uvotcPjVddW z8_5HKY!oBCwGZcs-q`4Yt`Zk~>K?mcxg51wkZlX5e#B08I75F7#dgn5yf&Hrp`*%$ zQ;_Qg>TYRzBe$x=T(@WI9SC!ReSas9vDm(yslQjBJZde5z8GDU``r|N(MHcxNopGr z_}u39W_zwWDL*XYYt>#Xo!9kL#97|EAGyGBcRXtLTd59x%m=3i zL^9joWYA)HfL15l9%H?q`$mY27!<9$7GH(kxb%MV>`}hR4a?+*LH6aR{dzrX@?6X4 z3e`9L;cjqYb`cJmophbm(OX0b)!AFG?5`c#zLagzMW~o)?-!@e80lvk!p#&CD8u5_r&wp4O0zQ>y!k5U$h_K;rWGk=U)zX!#@Q%|9g*A zWx)qS1?fq6X<$mQTB$#3g;;5tHOYuAh;YKSBz%il3Ui6fPRv#v62SsrCdMRTav)Sg zTq1WOu&@v$Ey;@^+_!)cf|w_X<@RC>!=~+A1-65O0bOFYiH-)abINwZvFB;hJjL_$ z(9iScmUdMp2O$WW!520Hd0Q^Yj?DK%YgJD^ez$Z^?@9@Ab-=KgW@n8nC&88)TDC+E zlJM)L3r+ZJfZW_T$;Imq*#2<(j+FIk8ls7)WJ6CjUu#r5PoXxQs4b)mZza<8=v{o)VlLRM<9yw^0En#tXAj`Sylxvki{<1DPe^ zhjHwx^;c8tb?Vr$6ZB;$Ff$+3(*oinbwpN-#F)bTsXq@Sm?43MC#jQ~`F|twI=7oC zH4TJtu#;ngRA|Y~w5N=UfMZi?s0%ZmKUFTAye&6Y*y-%c1oD3yQ%IF2q2385Zl+=> zfz=o`Bedy|U;oxbyb^rB9ixG{Gb-{h$U0hVe`J;{ql!s_OJ_>>eoQn(G6h7+b^P48 zG<=Wg2;xGD-+d@UMZ!c;0>#3nws$9kIDkK13IfloGT@s14AY>&>>^#>`PT7GV$2Hp zN<{bN*ztlZu_%W=&3+=#3bE(mka6VoHEs~0BjZ$+=0`a@R$iaW)6>wp2w)=v2@|2d z%?34!+iOc5S@;AAC4hELWLH56RGxo4jw8MDMU0Wk2k_G}=Vo(>eRFo(g3@HjG|`H3 zm8b*dK=moM*oB<)*A$M9!!5o~4U``e)wxavm@O_R(`P|u%9^LGi(_%IF<6o;NLp*0 zKsfZ0#24GT8(G`i4UvoMh$^;kOhl?`0yNiyrC#HJH=tqOH^T_d<2Z+ zeN>Y9Zn!X4*DMCK^o75Zk2621bdmV7Rx@AX^alBG4%~;G_vUoxhfhFRlR&+3WwF^T zaL)8xPq|wCZoNT^>3J0K?e{J-kl+hu2rZI>CUv#-z&u@`hjeb+bBZ>bcciQVZ{SbW zez04s9oFEgc8Z+Kp{XFX`MVf-s&w9*dx7wLen(_@y34}Qz@&`$2+osqfxz4&d}{Ql z*g1ag00Gu+$C`0avds{Q65BfGsu9`_`dML*rX~hyWIe$T>CsPRoLIr%MTk3pJ^2zH1qub1MBzPG}PO;Wmav9w%F7?%l=xIf#LlP`! z_Nw;xBQY9anH5-c8A4mME}?{iewjz(Sq-29r{fV;Fc>fv%0!W@(+{={Xl-sJ6aMoc z)9Q+$bchoTGTyWU_oI19!)bD=IG&OImfy;VxNXoIO2hYEfO~MkE#IXTK(~?Z&!ae! 
zl8z{D&2PC$Q*OBC(rS~-*-GHNJ6AC$@eve>LB@Iq;jbBZj`wk4|LGogE||Ie=M5g= z9d`uYQ1^Sr_q2wmZE>w2WG)!F%^KiqyaDtIAct?}D~JP4shTJy5Bg+-(EA8aXaxbd~BKMtTf2iQ69jD1o* zZF9*S3!v-TdqwK$%&?91Sh2=e63;X0Lci@n7y3XOu2ofyL9^-I767eHESAq{m+@*r zbVDx!FQ|AjT;!bYsXv8ilQjy~Chiu&HNhFXt3R_6kMC8~ChEFqG@MWu#1Q1#=~#ix zrkHpJre_?#r=N0wv`-7cHHqU`phJX2M_^{H0~{VP79Dv{6YP)oA1&TSfKPEPZn2)G z9o{U1huZBLL;Tp_0OYw@+9z(jkrwIGdUrOhKJUbwy?WBt zlIK)*K0lQCY0qZ!$%1?3A#-S70F#YyUnmJF*`xx?aH5;gE5pe-15w)EB#nuf6B*c~ z8Z25NtY%6Wlb)bUA$w%HKs5$!Z*W?YKV-lE0@w^{4vw;J>=rn?u!rv$&eM+rpU6rc=j9>N2Op+C{D^mospMCjF2ZGhe4eADA#skp2EA26%p3Ex9wHW8l&Y@HX z$Qv)mHM}4*@M*#*ll5^hE9M^=q~eyWEai*P;4z<9ZYy!SlNE5nlc7gm;M&Q zKhKE4d*%A>^m0R?{N}y|i6i^k>^n4(wzKvlQeHq{l&JuFD~sTsdhs`(?lFK@Q{pU~ zb!M3c@*3IwN1RUOVjY5>uT+s-2QLWY z4T2>fiSn>>Fob+%B868-v9D@AfWr#M8eM6w#eAlhc#zk6jkLxGBGk`E3$!A@*am!R zy>29&ptYK6>cvP`b!syNp)Q$0UOW|-O@)8!?94GOYF_}+zlW%fCEl|Tep_zx05g6q z>tp47e-&R*hSNe{6{H!mL?+j$c^TXT{C&@T-xIaesNCl05 z9SLb@q&mSb)I{VXMaiWa3PWj=Ed!>*GwUe;^|uk=Pz$njNnfFY^MM>E?zqhf6^{}0 zx&~~dA5#}1ig~7HvOQ#;d9JZBeEQ+}-~v$at`m!(ai z$w(H&mWCC~;PQ1$%iuz3`>dWeb3_p}X>L2LK%2l59Tyc}4m0>9A!8rhoU3m>i2+hl zx?*qs*c^j}+WPs>&v1%1Ko8_ivAGIn@QK7A`hDz-Emkcgv2@wTbYhkiwX2l=xz*XG zaiNg+j4F-I>9v+LjosI-QECrtKjp&0T@xIMKVr+&)gyb4@b3y?2CA?=ooN zT#;rU86WLh(e@#mF*rk(NV-qSIZyr z$6!ZUmzD)%yO-ot`rw3rp6?*_l*@Z*IB0xn4|BGPWHNc-1ZUnNSMWmDh=EzWJRP`) zl%d%J613oXzh5;VY^XWJi{lB`f#u+ThvtP7 zq(HK<4>tw(=yzSBWtYO}XI`S1pMBe3!jFxBHIuwJ(@%zdQFi1Q_hU2eDuHqXte7Ki zOV55H2D6u#4oTfr7|u*3p75KF&jaLEDpxk!4*bhPc%mpfj)Us3XIG3 zIKMX^s^1wt8YK7Ky^UOG=w!o5e7W-<&c|fw2{;Q11vm@J{)@N3-p1U>!0~sKWHaL= zWV(0}1IIyt1p%=_-Fe5Kfzc71wg}`RDDntVZv;4!=&XXF-$48jS0Sc;eDy@Sg;+{A zFStc{dXT}kcIjMXb4F7MbX~2%i;UrBxm%qmLKb|2=?uPr00-$MEUIGR5+JG2l2Nq` zkM{{1RO_R)+8oQ6x&-^kCj)W8Z}TJjS*Wm4>hf+4#VJP)OBaDF%3pms7DclusBUw} z{ND#!*I6h85g6DzNvdAmnwWY{&+!KZM4DGzeHI?MR@+~|su0{y-5-nICz_MIT_#FE zm<5f3zlaKq!XyvY3H`9s&T};z!cK}G%;~!rpzk9-6L}4Rg7vXtKFsl}@sT#U#7)x- z7UWue5sa$R>N&b{J61&gvKcKlozH*;OjoDR+elkh|4bJ!_3AZNMOu?n9&|L>OTD78 z^i->ah_Mqc|Ev)KNDzfu1P3grBIM#%`QZqj5W{qu(HocQhjyS;UINoP`{J+DvV?|1 z_sw6Yr3z6%e7JKVDY<$P=M)dbk@~Yw9|2!Cw!io3%j92wTD!c^e9Vj+7VqXo3>u#= zv#M{HHJ=e$X5vQ>>ML?E8#UlmvJgTnb73{PSPTf*0)mcj6C z{KsfUbDK|F$E(k;ER%8HMdDi`=BfpZzP3cl5yJHu;v^o2FkHNk;cXc17tL8T!CsYI zfeZ6sw@;8ia|mY_AXjCS?kUfxdjDB28)~Tz1dGE|{VfBS9`0m2!m1yG?hR})er^pl4c@9Aq+|}ZlDaHL)K$O| z%9Jp-imI-Id0|(d5{v~w6mx)tUKfbuVD`xNt04Mry%M+jXzE>4(TBsx#&=@wT2Vh) z1yeEY&~17>0%P(eHP0HB^|7C+WJxQBTG$uyOWY@iDloRIb-Cf!p<{WQHR!422#F34 zG`v|#CJ^G}y9U*7jgTlD{D&y$Iv{6&PYG>{Ixg$pGk?lWrE#PJ8KunQC@}^6OP!|< zS;}p3to{S|uZz%kKe|;A0bL0XxPB&Q{J(9PyX`+Kr`k~r2}yP^ND{8!v7Q1&vtk& z2Y}l@J@{|2`oA%sxvM9i0V+8IXrZ4;tey)d;LZI70Kbim<4=WoTPZy=Yd|34v#$Kh zx|#YJ8s`J>W&jt#GcMpx84w2Z3ur-rK7gf-p5cE)=w1R2*|0mj12hvapuUWM0b~dG zMg9p8FmAZI@i{q~0@QuY44&mMUNXd7z>U58shA3o`p5eVLpq>+{(<3->DWuSFVZwC zxd50Uz(w~LxC4}bgag#q#NNokK@yNc+Q|Ap!u>Ddy+df>v;j@I12CDNN9do+0^n8p zMQs7X#+FVF0C5muGfN{r0|Nkql%BQT|K(DDNdR2pzM=_ea5+GO|J67`05AV92t@4l z0Qno0078PIHdaQGHZ~Scw!dzgqjK~3B7kf>BcP__&lLyU(cu3B^uLo%{j|Mb0NR)tkeT7Hcwp4O# z)yzu>cvG(d9~0a^)eZ;;%3ksk@F&1eEBje~ zW+-_s)&RgiweQc!otF>4%vbXKaOU41{!hw?|2`Ld3I8$&#WOsq>EG)1ANb!{N4z9@ zsU!bPG-~-bqCeIDzo^Q;gnucB{tRzm{ZH^Orphm2U+REA!*<*J6YQV83@&xoDl%#wnl5qcBqCcAF-vX5{30}(oJrnSH z{RY85hylK2dMOh2%oO1J8%)0?8TOL%rS8)+CsDv}aQ>4D)Jv+DLK)9gI^n-T^$)Tc zFPUD75qJm!Y-KBqj;JP4dV4 z`X{lGmn<)1IGz330}s}Jrjtf{(lnuuNHe5(ezA(pYa=1|Ff-LhPFK8 zyJh_b{yzu0yll6ZkpRzRjezyYivjyjW7QwO;@6X`m;2Apn2EK2!~7S}-*=;5*7K$B z`x(=!^?zgj(-`&ApZJXI09aDLXaT@<;CH=?fBOY5d|b~wBA@@p^K#nxr`)?i?SqTupI_PJ(A3cx`z~9mX_*)>L F{|7XC?P&l2 diff --git 
a/lite/demo/java/android/PaddlePredictor/gradle/wrapper/gradle-wrapper.properties b/lite/demo/java/android/PaddlePredictor/gradle/wrapper/gradle-wrapper.properties deleted file mode 100644 index 2d135d7b25..0000000000 --- a/lite/demo/java/android/PaddlePredictor/gradle/wrapper/gradle-wrapper.properties +++ /dev/null @@ -1,6 +0,0 @@ -#Wed Jun 26 10:57:21 CST 2019 -distributionBase=GRADLE_USER_HOME -distributionPath=wrapper/dists -zipStoreBase=GRADLE_USER_HOME -zipStorePath=wrapper/dists -distributionUrl=https\://services.gradle.org/distributions/gradle-5.1.1-all.zip diff --git a/lite/demo/java/android/PaddlePredictor/gradlew b/lite/demo/java/android/PaddlePredictor/gradlew deleted file mode 100755 index cccdd3d517..0000000000 --- a/lite/demo/java/android/PaddlePredictor/gradlew +++ /dev/null @@ -1,172 +0,0 @@ -#!/usr/bin/env sh - -############################################################################## -## -## Gradle start up script for UN*X -## -############################################################################## - -# Attempt to set APP_HOME -# Resolve links: $0 may be a link -PRG="$0" -# Need this for relative symlinks. -while [ -h "$PRG" ] ; do - ls=`ls -ld "$PRG"` - link=`expr "$ls" : '.*-> \(.*\)$'` - if expr "$link" : '/.*' > /dev/null; then - PRG="$link" - else - PRG=`dirname "$PRG"`"/$link" - fi -done -SAVED="`pwd`" -cd "`dirname \"$PRG\"`/" >/dev/null -APP_HOME="`pwd -P`" -cd "$SAVED" >/dev/null - -APP_NAME="Gradle" -APP_BASE_NAME=`basename "$0"` - -# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. -DEFAULT_JVM_OPTS="" - -# Use the maximum available, or set MAX_FD != -1 to use that value. -MAX_FD="maximum" - -warn () { - echo "$*" -} - -die () { - echo - echo "$*" - echo - exit 1 -} - -# OS specific support (must be 'true' or 'false'). -cygwin=false -msys=false -darwin=false -nonstop=false -case "`uname`" in - CYGWIN* ) - cygwin=true - ;; - Darwin* ) - darwin=true - ;; - MINGW* ) - msys=true - ;; - NONSTOP* ) - nonstop=true - ;; -esac - -CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar - -# Determine the Java command to use to start the JVM. -if [ -n "$JAVA_HOME" ] ; then - if [ -x "$JAVA_HOME/jre/sh/java" ] ; then - # IBM's JDK on AIX uses strange locations for the executables - JAVACMD="$JAVA_HOME/jre/sh/java" - else - JAVACMD="$JAVA_HOME/bin/java" - fi - if [ ! -x "$JAVACMD" ] ; then - die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME - -Please set the JAVA_HOME variable in your environment to match the -location of your Java installation." - fi -else - JAVACMD="java" - which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. - -Please set the JAVA_HOME variable in your environment to match the -location of your Java installation." -fi - -# Increase the maximum file descriptors if we can. -if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then - MAX_FD_LIMIT=`ulimit -H -n` - if [ $? -eq 0 ] ; then - if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then - MAX_FD="$MAX_FD_LIMIT" - fi - ulimit -n $MAX_FD - if [ $? 
-ne 0 ] ; then - warn "Could not set maximum file descriptor limit: $MAX_FD" - fi - else - warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" - fi -fi - -# For Darwin, add options to specify how the application appears in the dock -if $darwin; then - GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" -fi - -# For Cygwin, switch paths to Windows format before running java -if $cygwin ; then - APP_HOME=`cygpath --path --mixed "$APP_HOME"` - CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` - JAVACMD=`cygpath --unix "$JAVACMD"` - - # We build the pattern for arguments to be converted via cygpath - ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` - SEP="" - for dir in $ROOTDIRSRAW ; do - ROOTDIRS="$ROOTDIRS$SEP$dir" - SEP="|" - done - OURCYGPATTERN="(^($ROOTDIRS))" - # Add a user-defined pattern to the cygpath arguments - if [ "$GRADLE_CYGPATTERN" != "" ] ; then - OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" - fi - # Now convert the arguments - kludge to limit ourselves to /bin/sh - i=0 - for arg in "$@" ; do - CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` - CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option - - if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition - eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` - else - eval `echo args$i`="\"$arg\"" - fi - i=$((i+1)) - done - case $i in - (0) set -- ;; - (1) set -- "$args0" ;; - (2) set -- "$args0" "$args1" ;; - (3) set -- "$args0" "$args1" "$args2" ;; - (4) set -- "$args0" "$args1" "$args2" "$args3" ;; - (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; - (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; - (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; - (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; - (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; - esac -fi - -# Escape application args -save () { - for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done - echo " " -} -APP_ARGS=$(save "$@") - -# Collect all arguments for the java command, following the shell quoting and substitution rules -eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS" - -# by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong -if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then - cd "$(dirname "$0")" -fi - -exec "$JAVACMD" "$@" diff --git a/lite/demo/java/android/PaddlePredictor/gradlew.bat b/lite/demo/java/android/PaddlePredictor/gradlew.bat deleted file mode 100644 index f9553162f1..0000000000 --- a/lite/demo/java/android/PaddlePredictor/gradlew.bat +++ /dev/null @@ -1,84 +0,0 @@ -@if "%DEBUG%" == "" @echo off -@rem ########################################################################## -@rem -@rem Gradle startup script for Windows -@rem -@rem ########################################################################## - -@rem Set local scope for the variables with windows NT shell -if "%OS%"=="Windows_NT" setlocal - -set DIRNAME=%~dp0 -if "%DIRNAME%" == "" set DIRNAME=. -set APP_BASE_NAME=%~n0 -set APP_HOME=%DIRNAME% - -@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 
-set DEFAULT_JVM_OPTS= - -@rem Find java.exe -if defined JAVA_HOME goto findJavaFromJavaHome - -set JAVA_EXE=java.exe -%JAVA_EXE% -version >NUL 2>&1 -if "%ERRORLEVEL%" == "0" goto init - -echo. -echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. -echo. -echo Please set the JAVA_HOME variable in your environment to match the -echo location of your Java installation. - -goto fail - -:findJavaFromJavaHome -set JAVA_HOME=%JAVA_HOME:"=% -set JAVA_EXE=%JAVA_HOME%/bin/java.exe - -if exist "%JAVA_EXE%" goto init - -echo. -echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% -echo. -echo Please set the JAVA_HOME variable in your environment to match the -echo location of your Java installation. - -goto fail - -:init -@rem Get command-line arguments, handling Windows variants - -if not "%OS%" == "Windows_NT" goto win9xME_args - -:win9xME_args -@rem Slurp the command line arguments. -set CMD_LINE_ARGS= -set _SKIP=2 - -:win9xME_args_slurp -if "x%~1" == "x" goto execute - -set CMD_LINE_ARGS=%* - -:execute -@rem Setup the command line - -set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar - -@rem Execute Gradle -"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% - -:end -@rem End local scope for the variables with windows NT shell -if "%ERRORLEVEL%"=="0" goto mainEnd - -:fail -rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of -rem the _cmd.exe /c_ return code! -if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 -exit /b 1 - -:mainEnd -if "%OS%"=="Windows_NT" endlocal - -:omega diff --git a/lite/demo/java/android/PaddlePredictor/settings.gradle b/lite/demo/java/android/PaddlePredictor/settings.gradle deleted file mode 100644 index e7b4def49c..0000000000 --- a/lite/demo/java/android/PaddlePredictor/settings.gradle +++ /dev/null @@ -1 +0,0 @@ -include ':app' diff --git a/lite/demo/java/android/prepare_demo.bash b/lite/demo/java/android/prepare_demo.bash deleted file mode 100644 index e0dbdaf75f..0000000000 --- a/lite/demo/java/android/prepare_demo.bash +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash - -# Script to download model files and copy .Jar and JNI lib for Android demo -# $1 will be the arch name - -if [ x$1 != x ]; then - cp ../../../java/so/libpaddle_lite_jni.so PaddlePredictor/app/src/main/jniLibs/$1/ -else - echo "Warning: didn't copy JNI .so lib because arch name is empty" -fi - -MODELS=(inception_v4_simple_opt.nb lite_naive_model_opt.nb mobilenet_v1_opt.nb mobilenet_v2_relu_opt.nb resnet50_opt.nb) -MODELS_DIR=PaddlePredictor/app/src/main/assets/ - -for m in "${MODELS[@]}" -do - wget --no-check-certificate -q http://paddle-inference-dist.bj.bcebos.com/${m}.tar.gz \ - -O ${MODELS_DIR}${m}.tar.gz - tar xzf ${MODELS_DIR}${m}.tar.gz -C ${MODELS_DIR} - rm -rf ${MODELS_DIR}${m}.tar.gz -done - -cp ../../../java/jar/PaddlePredictor.jar PaddlePredictor/app/libs/ diff --git a/lite/fluid/CMakeLists.txt b/lite/fluid/CMakeLists.txt deleted file mode 100644 index 308dcb2c30..0000000000 --- a/lite/fluid/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -if (LITE_WITH_X86) -lite_cc_library(fluid_data_type SRCS data_type.cc DEPS framework_proto eigen3) -# lite_cc_library(selected_rows SRCS selected_rows.cc) -endif() diff --git a/lite/fluid/data_type.cc b/lite/fluid/data_type.cc deleted file mode 100644 index aa8971499f..0000000000 --- a/lite/fluid/data_type.cc +++ /dev/null @@ -1,101 +0,0 @@ -// Copyright (c) 2018 
PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/fluid/data_type.h" -#include -#include -#include - -using float16 = paddle::lite::fluid::float16; - -namespace paddle { -namespace lite { -namespace fluid { - -struct DataTypeMap { - std::unordered_map - cpp_to_proto_; - std::unordered_map proto_to_cpp_; - std::unordered_map proto_to_str_; - std::unordered_map proto_to_size_; -}; - -static DataTypeMap* InitDataTypeMap(); -// C++11 removes the need for manual locking. Concurrent execution shall wait if -// a static local variable is already being initialized. -// https://stackoverflow.com/questions/11711920/how-to-implement-multithread-safe-singleton-in-c11-without-using-mutex -static DataTypeMap& gDataTypeMap() { - static DataTypeMap* g_data_type_map_ = InitDataTypeMap(); - return *g_data_type_map_; -} - -template -static inline void RegisterType(DataTypeMap* map, - framework::proto::VarType::Type proto_type, - const std::string& name) { - map->proto_to_cpp_.emplace(static_cast(proto_type), typeid(T)); - map->cpp_to_proto_.emplace(typeid(T), proto_type); - map->proto_to_str_.emplace(static_cast(proto_type), name); - map->proto_to_size_.emplace(static_cast(proto_type), sizeof(T)); -} - -static DataTypeMap* InitDataTypeMap() { - auto retv = new DataTypeMap(); - -#define RegType(cc_type, proto_type) \ - RegisterType(retv, proto_type, #cc_type) - - _ForEachDataType_(RegType); - -#undef RegType - return retv; -} - -framework::proto::VarType::Type ToDataType(std::type_index type) { - auto it = gDataTypeMap().cpp_to_proto_.find(type); - if (it != gDataTypeMap().cpp_to_proto_.end()) { - return it->second; - } - PADDLE_THROW("Not support %s as tensor type", type.name()); -} - -std::type_index ToTypeIndex(framework::proto::VarType::Type type) { - auto it = gDataTypeMap().proto_to_cpp_.find(static_cast(type)); - if (it != gDataTypeMap().proto_to_cpp_.end()) { - return it->second; - } - PADDLE_THROW("Not support framework::proto::VarType::Type(%d) as tensor type", - static_cast(type)); -} - -std::string DataTypeToString(const framework::proto::VarType::Type type) { - auto it = gDataTypeMap().proto_to_str_.find(static_cast(type)); - if (it != gDataTypeMap().proto_to_str_.end()) { - return it->second; - } - PADDLE_THROW("Not support framework::proto::VarType::Type(%d) as tensor type", - static_cast(type)); -} - -size_t SizeOfType(framework::proto::VarType::Type type) { - auto it = gDataTypeMap().proto_to_size_.find(static_cast(type)); - if (it != gDataTypeMap().proto_to_size_.end()) { - return it->second; - } - PADDLE_THROW("Not support %s as tensor type", DataTypeToString(type)); -} - -} // namespace fluid -} // namespace lite -} // namespace paddle diff --git a/lite/fluid/data_type.h b/lite/fluid/data_type.h deleted file mode 100644 index a8b11ec465..0000000000 --- a/lite/fluid/data_type.h +++ /dev/null @@ -1,88 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include "lite/core/framework.pb.h" -#include "lite/fluid/float16.h" -#include "lite/utils/paddle_enforce.h" - -namespace paddle { -namespace lite { -namespace fluid { - -template -struct DataTypeTrait {}; - -// Stub handle for void -template <> -struct DataTypeTrait { - constexpr static auto DataType = framework::proto::VarType::RAW; -}; - -#define _ForEachDataTypeHelper_(callback, cpp_type, proto_type) \ - callback(cpp_type, ::paddle::framework::proto::VarType::proto_type); - -#define _ForEachDataType_(callback) \ - _ForEachDataTypeHelper_(callback, float, FP32); \ - _ForEachDataTypeHelper_(callback, ::paddle::lite::fluid::float16, FP16); \ - _ForEachDataTypeHelper_(callback, double, FP64); \ - _ForEachDataTypeHelper_(callback, int, INT32); \ - _ForEachDataTypeHelper_(callback, int64_t, INT64); \ - _ForEachDataTypeHelper_(callback, bool, BOOL); \ - _ForEachDataTypeHelper_(callback, uint8_t, UINT8); \ - _ForEachDataTypeHelper_(callback, int16_t, INT16); \ - _ForEachDataTypeHelper_(callback, int8_t, INT8) - -#define DefineDataTypeTrait(cpp_type, proto_type) \ - template <> \ - struct DataTypeTrait { \ - constexpr static auto DataType = proto_type; \ - } - -_ForEachDataType_(DefineDataTypeTrait); - -#undef DefineDataTypeTrait - -extern framework::proto::VarType::Type ToDataType(std::type_index type); -extern std::type_index ToTypeIndex(framework::proto::VarType::Type type); - -template -inline void VisitDataType(framework::proto::VarType::Type type, - Visitor visitor) { -#define VisitDataTypeCallback(cpp_type, proto_type) \ - do { \ - if (type == proto_type) { \ - visitor.template apply(); \ - return; \ - } \ - } while (0) - - _ForEachDataType_(VisitDataTypeCallback); -#undef VisitDataTypeCallback - PADDLE_THROW("Not supported %d", type); -} - -extern std::string DataTypeToString(const framework::proto::VarType::Type type); -extern size_t SizeOfType(framework::proto::VarType::Type type); -inline std::ostream& operator<<(std::ostream& out, - const framework::proto::VarType::Type& type) { - out << DataTypeToString(type); - return out; -} - -} // namespace fluid -} // namespace lite -} // namespace paddle diff --git a/lite/fluid/data_type_test.cc b/lite/fluid/data_type_test.cc deleted file mode 100644 index 2a380201f2..0000000000 --- a/lite/fluid/data_type_test.cc +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-#include "paddle/fluid/framework/data_type.h"
-
-#include <string>
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/tensor.h"
-
-TEST(DataType, float16) {
-  using paddle::framework::Tensor;
-  using paddle::platform::CPUPlace;
-  using paddle::platform::float16;
-  namespace f = paddle::framework;
-  f::proto::VarType::Type dtype = f::proto::VarType::FP16;
-
-  Tensor tensor;
-  CPUPlace cpu;
-  tensor.mutable_data(cpu, dtype);
-
-  // test fp16 tensor
-  EXPECT_EQ(tensor.type(), f::ToDataType(typeid(float16)));
-
-  // test fp16 size
-  EXPECT_EQ(f::SizeOfType(dtype), 2u);
-
-  // test debug info
-  std::string type = "::paddle::platform::float16";
-  EXPECT_STREQ(f::DataTypeToString(dtype).c_str(), type.c_str());
-}
diff --git a/lite/fluid/eigen.h b/lite/fluid/eigen.h
deleted file mode 100644
index f5d5e4b5e5..0000000000
--- a/lite/fluid/eigen.h
+++ /dev/null
@@ -1,141 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "lite/core/tensor.h"
-#include "lite/fluid/float16.h"
-#include "lite/utils/paddle_enforce.h"
-#include "unsupported/Eigen/CXX11/Tensor"
-
-namespace paddle {
-namespace lite {
-namespace fluid {
-
-// EigenDim converts paddle::platform::DDim into Eigen::DSizes.
-template <int D>
-struct EigenDim {
-  using Type = Eigen::DSizes<Eigen::DenseIndex, D>;
-
-  static Type From(const lite::DDim& dims) {
-    PADDLE_ENFORCE(dims.size() == D, "D must match DDim::size");
-    Type ret;
-    for (int64_t d = 0; d < dims.size(); d++) {
-      ret[d] = dims[d];
-    }
-    return ret;
-  }
-};
-
-// Interpret paddle::platform::Tensor as EigenTensor and EigenConstTensor.
-template <typename T, size_t D, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-struct EigenTensor {
-  // TODO(qijun) Now the default type is unaligned; we will benchmark the
-  // speed of the aligned and unaligned versions in the future.
- using Type = Eigen::TensorMap>; - - using ConstType = - Eigen::TensorMap>; - - static Type From(Tensor& tensor, lite::DDim dims) { // NOLINT - return Type(const_cast(tensor.data()), - EigenDim::From(dims)); // NOLINT - } - - static Type From(Tensor& tensor) { // NOLINT - return From(tensor, tensor.dims()); - } // NOLINT - - static ConstType From(const Tensor& tensor, lite::DDim dims) { - return ConstType(tensor.data(), EigenDim::From(dims)); - } - - static ConstType From(const Tensor& tensor) { - return From(tensor, tensor.dims()); - } -}; - -template -struct EigenMatrix : public EigenTensor { - static typename EigenMatrix::Type Reshape(Tensor& tensor, // NOLINT - int num_col_dims) { - int rank = tensor.dims().size(); - PADDLE_ENFORCE(num_col_dims > 0 && num_col_dims < rank, - "`num_col_dims` must be between (0, rank_of_tensor)."); - return EigenMatrix::From(tensor, tensor.dims().Flatten2D(num_col_dims)); - } - - static typename EigenMatrix::ConstType Reshape(const Tensor& tensor, - int num_col_dims) { - int rank = tensor.dims().size(); - PADDLE_ENFORCE(num_col_dims > 0 && num_col_dims < rank, - "`num_col_dims` must be between (0, rank_of_tensor)."); - return EigenMatrix::From(tensor, tensor.dims().Flatten2D(num_col_dims)); - } -}; - -template -struct EigenVector : public EigenTensor { - // Flatten reshapes a Tensor into an EigenVector. - static typename EigenVector::Type Flatten(Tensor& tensor) { // NOLINT - return EigenVector::From( - tensor, lite::DDim(std::vector({tensor.dims().production()}))); - } - - static typename EigenVector::ConstType Flatten( - const Tensor& tensor) { // NOLINT - return EigenVector::From( - tensor, lite::DDim(std::vector({tensor.dims().production()}))); - } -}; - -template -struct EigenScalar { - // Scalar tensor (implemented as a rank-0 tensor) of scalar type T. - using Type = Eigen::TensorMap< - Eigen::TensorFixedSize, MajorType, IndexType>>; - using ConstType = Eigen::TensorMap< - Eigen::TensorFixedSize, MajorType, IndexType>>; - - static Type From(Tensor& tensor) { return Type(tensor.data()); } // NOLINT - - static ConstType From(const Tensor& tensor) { - return ConstType(tensor.data()); - } -}; - -template -struct EigenDevice; - -template <> -struct EigenDevice { - using Type = ::Eigen::DefaultDevice; -}; - -template -using EigenDeviceType = typename EigenDevice::Type; - -} // namespace fluid -} // namespace lite -} // namespace paddle diff --git a/lite/fluid/float16.h b/lite/fluid/float16.h deleted file mode 100644 index d1ef6f7dc5..0000000000 --- a/lite/fluid/float16.h +++ /dev/null @@ -1,1100 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include - -#ifdef PADDLE_WITH_CUDA -#include -#endif // PADDLE_WITH_CUDA - -#ifdef __GNUC__ -#define PADDLE_GNUC_VER (__GNUC__ * 10 + __GNUC_MINOR__) -#else -#define PADDLE_GNUC_VER 0 -#endif // __GNUC__ - -#ifdef __clang__ -#define PADDLE_CLANG_VER (__clang_major__ * 10 + __clang_minor__) -#else -#define PADDLE_CLANG_VER 0 -#endif // __clang__ - -#if defined(__CUDACC__) && CUDA_VERSION >= 7050 -#define PADDLE_CUDA_FP16 -#include -#endif - -#if defined(__arm__) || defined(__aarch64__) -#define PADDLE_ARM -#endif - -#if defined(__ARM_NEON) || defined(__ARM_NEON__) -#define PADDLE_NEON -#include -#endif - -#if defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \ - (PADDLE_GNUC_VER >= 62 || PADDLE_CLANG_VER >= 37) -#define PADDLE_WITH_NATIVE_FP16 -#endif - -#ifndef PADDLE_ARM -#include -#endif // PADDLE_ARM - -#if !defined(_WIN32) -#define PADDLE_ALIGN(x) __attribute__((aligned(x))) -#else -#define PADDLE_ALIGN(x) __declspec(align(x)) -#endif - -namespace paddle { -namespace lite { -namespace fluid { - -// Forward declare float16 for eigen.h -struct float16; - -} // namespace fluid -} // namespace lite -} // namespace paddle - -#include "lite/utils/macros.h" -#include "unsupported/Eigen/CXX11/Tensor" - -namespace paddle { -namespace lite { -namespace fluid { - -// Use PADDLE_ALIGNED(2) to ensure that each float16 will be allocated -// and aligned at least on a 2-byte boundary, which leads to efficient -// memory access of float16 struct and also makes float16 compatible -// with CUDA half, ARM float16_t, and Eigen::half data types. -struct PADDLE_ALIGN(2) float16 { - public: - uint16_t x; - - // The following defaulted special class member functions - // are added to make float16 pass the std::is_trivial test - float16() = default; - float16(const float16& o) = default; - float16& operator=(const float16& o) = default; - float16(float16&& o) = default; - float16& operator=(float16&& o) = default; - ~float16() = default; - -// Constructors -#ifdef PADDLE_CUDA_FP16 - HOSTDEVICE inline explicit float16(const half& h) { -#if CUDA_VERSION >= 9000 - x = reinterpret_cast<__half_raw*>(const_cast(&h))->x; -#else - x = h.x; -#endif // CUDA_VERSION >= 9000 - } -#endif // PADDLE_CUDA_FP16 - - HOSTDEVICE inline explicit float16(const Eigen::half& h) : x(h.x) {} - -#ifdef PADDLE_WITH_NATIVE_FP16 - // __fp16 is a native half precision data type for arm cpu, - // float16_t is an alias for __fp16 - HOSTDEVICE inline explicit float16(const float16_t& h) { - x = *reinterpret_cast(&h); - } -#endif - - HOSTDEVICE inline explicit float16(float val) { -#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 - half tmp = __float2half(val); - x = *reinterpret_cast(&tmp); - -#elif defined(PADDLE_WITH_NATIVE_FP16) - float32x4_t tmp = vld1q_dup_f32(&val); - float16_t res = vget_lane_f16(vcvt_f16_f32(tmp), 0); - x = *reinterpret_cast(&res); - -#elif defined(__F16C__) - x = _cvtss_sh(val, 0); - -#else - // Conversion routine adapted from - // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion - Bits v, s; - v.f = val; - uint32_t sign = v.si & sigN; - v.si ^= sign; - sign >>= shiftSign; // logical shift - s.si = mulN; - s.si = s.f * v.f; // correct subnormals - v.si ^= (s.si ^ v.si) & -(minN > v.si); - v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN)); - v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN)); - v.ui >>= shift; // logical shift - v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC); - v.si ^= ((v.si - minD) ^ 
 v.si) & -(v.si > subC);
-    x = v.ui | sign;
-
-#endif
-  }
-
-  HOSTDEVICE inline explicit float16(bool b) : x(b ? 0x3c00 : 0) {}
-
-  template <class T>
-  HOSTDEVICE inline explicit float16(const T& val)
-      : x(float16(static_cast<float>(val)).x) {}
-
-// Assignment operators
-#ifdef PADDLE_CUDA_FP16
-  HOSTDEVICE inline float16& operator=(const half& rhs) {
-#if CUDA_VERSION >= 9000
-    x = reinterpret_cast<__half_raw*>(const_cast<half*>(&rhs))->x;
-#else
-    x = rhs.x;
-#endif
-    return *this;
-  }
-#endif
-
-  HOSTDEVICE inline float16& operator=(const Eigen::half& rhs) {
-    x = rhs.x;
-    return *this;
-  }
-
-#ifdef PADDLE_WITH_NATIVE_FP16
-  HOSTDEVICE inline float16& operator=(const float16_t& rhs) {
-    x = *reinterpret_cast<const uint16_t*>(&rhs);
-    return *this;
-  }
-#endif
-
-  HOSTDEVICE inline float16& operator=(bool b) {
-    x = b ? 0x3c00 : 0;
-    return *this;
-  }
-
-  HOSTDEVICE inline float16& operator=(int8_t val) {
-    x = float16(val).x;
-    return *this;
-  }
-
-  HOSTDEVICE inline float16& operator=(uint8_t val) {
-    x = float16(val).x;
-    return *this;
-  }
-
-  HOSTDEVICE inline float16& operator=(int16_t val) {
-    x = float16(val).x;
-    return *this;
-  }
-
-  HOSTDEVICE inline float16& operator=(uint16_t val) {
-    x = float16(val).x;
-    return *this;
-  }
-
-  HOSTDEVICE inline float16& operator=(int32_t val) {
-    x = float16(val).x;
-    return *this;
-  }
-
-  HOSTDEVICE inline float16& operator=(uint32_t val) {
-    x = float16(val).x;
-    return *this;
-  }
-
-  HOSTDEVICE inline float16& operator=(int64_t val) {
-    x = float16(val).x;
-    return *this;
-  }
-
-  HOSTDEVICE inline float16& operator=(uint64_t val) {
-    x = float16(val).x;
-    return *this;
-  }
-
-  HOSTDEVICE inline float16& operator=(float val) {
-    x = float16(val).x;
-    return *this;
-  }
-
-  HOSTDEVICE inline float16& operator=(double val) {
-    x = float16(val).x;
-    return *this;
-  }
-
-// Conversion operators
-#ifdef PADDLE_CUDA_FP16
-  HOSTDEVICE inline explicit operator half() const {
-#if CUDA_VERSION >= 9000
-    __half_raw h;
-    h.x = x;
-    return half(h);
-#else
-    half h;
-    h.x = x;
-    return h;
-#endif  // CUDA_VERSION >= 9000
-  }
-#endif  // PADDLE_CUDA_FP16
-
-  HOSTDEVICE inline explicit operator Eigen::half() const {
-    Eigen::half h;
-    h.x = x;
-    return h;
-  }
-
-#ifdef PADDLE_WITH_NATIVE_FP16
-  HOSTDEVICE inline explicit operator float16_t() const {
-    return *reinterpret_cast<const float16_t*>(this);
-  }
-#endif
-
-  HOSTDEVICE inline explicit operator float() const {
-#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
-    half tmp = *reinterpret_cast<const half*>(this);
-    return __half2float(tmp);
-
-#elif defined(PADDLE_WITH_NATIVE_FP16)
-    float16x4_t res = vld1_dup_f16(reinterpret_cast<const float16_t*>(this));
-    return vgetq_lane_f32(vcvt_f32_f16(res), 0);
-
-#elif defined(__F16C__)
-    return _cvtsh_ss(this->x);
-
-#else
-    // Conversion routine adapted from
-    // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion
-    Bits v;
-    v.ui = this->x;
-    int32_t sign = v.si & sigC;
-    v.si ^= sign;
-    sign <<= shiftSign;
-    v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC);
-    v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC);
-    Bits s;
-    s.si = mulC;
-    s.f *= v.si;
-    int32_t mask = -(norC > v.si);
-    v.si <<= shift;
-    v.si ^= (s.si ^ v.si) & mask;
-    v.si |= sign;
-    return v.f;
-
-#endif
-  }
-
-  HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; }
-
-  HOSTDEVICE inline explicit operator int8_t() const {
-    return static_cast<int8_t>(static_cast<float>(*this));
-  }
-
-  HOSTDEVICE inline explicit operator uint8_t() const {
-    return static_cast<uint8_t>(static_cast<float>(*this));
-  }
-
-  HOSTDEVICE inline
explicit operator int16_t() const { - return static_cast(static_cast(*this)); - } - - HOSTDEVICE inline explicit operator uint16_t() const { - return static_cast(static_cast(*this)); - } - - HOSTDEVICE inline explicit operator int32_t() const { - return static_cast(static_cast(*this)); - } - - HOSTDEVICE inline explicit operator uint32_t() const { - return static_cast(static_cast(*this)); - } - - HOSTDEVICE inline explicit operator int64_t() const { - return static_cast(static_cast(*this)); - } - - HOSTDEVICE inline explicit operator uint64_t() const { - return static_cast(static_cast(*this)); - } - - HOSTDEVICE inline explicit operator double() const { - return static_cast(static_cast(*this)); - } - - private: - union Bits { - float f; - int32_t si; - uint32_t ui; - }; - - static const int shift = 13; - static const int shiftSign = 16; - - static const int32_t infN = 0x7F800000; - static const int32_t maxN = 0x477FE000; // max flt16 as flt32 - static const int32_t minN = 0x38800000; // min flt16 normal as flt32 - static const int32_t sigN = 0x80000000; // sign bit - - static constexpr int32_t infC = infN >> shift; - static constexpr int32_t nanN = (infC + 1) - << shift; // minimum flt16 nan as float32 - static constexpr int32_t maxC = maxN >> shift; - static constexpr int32_t minC = minN >> shift; - static constexpr int32_t sigC = sigN >> shiftSign; - - static const int32_t mulN = 0x52000000; // (1 << 23) / minN - static const int32_t mulC = 0x33800000; // minN / (1 << (23 - shift)) - static const int32_t subC = 0x003FF; // max flt32 subnormal downshifted - static const int32_t norC = 0x00400; // min flt32 normal downshifted - - static constexpr int32_t maxD = infC - maxC - 1; - static constexpr int32_t minD = minC - subC - 1; -}; - -// Arithmetic operators on GPU -// CUDA 9.0 provides built-in arithmetic operators for half while -// CUDA 7.5 and 8.0 do not. The arithmetic operators defined here are -// for users to write similar CUDA code in CUDA 7.5 and 8.0 as in -// CUDA 9.0 regarding the half data type. 
-#if defined(PADDLE_CUDA_FP16) && CUDA_VERSION < 9000 - -DEVICE inline half operator+(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hadd(a, b); -#else - float res = static_cast(float16(a)) + static_cast(float16(b)); - return half(float16(res)); -#endif -} - -DEVICE inline half operator-(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hsub(a, b); -#else - float res = static_cast(float16(a)) - static_cast(float16(b)); - return half(float16(res)); -#endif -} - -DEVICE inline half operator*(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hmul(a, b); -#else - float res = static_cast(float16(a)) * static_cast(float16(b)); - return half(float16(res)); -#endif -} - -DEVICE inline half operator/(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 - float num = __half2float(a); - float denom = __half2float(b); - return __float2half(num / denom); -#else - float res = static_cast(float16(a)) / static_cast(float16(b)); - return half(float16(res)); -#endif -} - -DEVICE inline half operator-(const half& a) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hneg(a); -#else - float res = -static_cast(float16(a)); - return half(float16(res)); -#endif -} - -DEVICE inline half& operator+=(half& a, const half& b) { // NOLINT - a = a + b; - return a; -} - -DEVICE inline half& operator-=(half& a, const half& b) { // NOLINT - a = a - b; - return a; -} - -DEVICE inline half& operator*=(half& a, const half& b) { // NOLINT - a = a * b; - return a; -} - -DEVICE inline half& operator/=(half& a, const half& b) { // NOLINT - a = a / b; - return a; -} - -DEVICE inline bool operator==(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __heq(a, b); -#else - return static_cast(float16(a)) == static_cast(float16(b)); -#endif -} - -DEVICE inline bool operator!=(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hne(a, b); -#else - return static_cast(float16(a)) != static_cast(float16(b)); -#endif -} - -DEVICE inline bool operator<(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hlt(a, b); -#else - return static_cast(float16(a)) < static_cast(float16(b)); -#endif -} - -DEVICE inline bool operator<=(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hle(a, b); -#else - return static_cast(float16(a)) <= static_cast(float16(b)); -#endif -} - -DEVICE inline bool operator>(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hgt(a, b); -#else - return static_cast(float16(a)) > static_cast(float16(b)); -#endif -} - -DEVICE inline bool operator>=(const half& a, const half& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hge(a, b); -#else - return static_cast(float16(a)) >= static_cast(float16(b)); -#endif -} - -#endif // PADDLE_CUDA_FP16 - -// Arithmetic operators for float16 on GPU -#if defined(PADDLE_CUDA_FP16) -HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return float16(__hadd(half(a), half(b))); -#else - return float16(static_cast(a) + static_cast(b)); -#endif -} - -HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return float16(__hsub(half(a), 
half(b))); -#else - return float16(static_cast(a) - static_cast(b)); -#endif -} - -HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return float16(__hmul(half(a), half(b))); -#else - return float16(static_cast(a) * static_cast(b)); -#endif -} - -HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 - // TODO(kexinzhao): check which cuda version starts to support __hdiv - float num = __half2float(half(a)); - float denom = __half2float(half(b)); - return float16(num / denom); -#else - return float16(static_cast(a) / static_cast(b)); -#endif -} - -HOSTDEVICE inline float16 operator-(const float16& a) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return float16(__hneg(half(a))); -#else - float16 res; - res.x = a.x ^ 0x8000; - return res; -#endif -} - -HOSTDEVICE inline float16& operator+=(float16& a, const float16& b) { // NOLINT - a = a + b; - return a; -} - -HOSTDEVICE inline float16& operator-=(float16& a, const float16& b) { // NOLINT - a = a - b; - return a; -} - -HOSTDEVICE inline float16& operator*=(float16& a, const float16& b) { // NOLINT - a = a * b; - return a; -} - -HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) { // NOLINT - a = a / b; - return a; -} - -HOSTDEVICE inline bool operator==(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __heq(half(a), half(b)); -#else - return static_cast(a) == static_cast(b); -#endif -} - -HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hne(half(a), half(b)); -#else - return static_cast(a) != static_cast(b); -#endif -} - -HOSTDEVICE inline bool operator<(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hlt(half(a), half(b)); -#else - return static_cast(a) < static_cast(b); -#endif -} - -HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hle(half(a), half(b)); -#else - return static_cast(a) <= static_cast(b); -#endif -} - -HOSTDEVICE inline bool operator>(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hgt(half(a), half(b)); -#else - return static_cast(a) > static_cast(b); -#endif -} - -HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hge(half(a), half(b)); -#else - return static_cast(a) >= static_cast(b); -#endif -} - -// Arithmetic operators for float16 on ARMv8.2-A CPU -#elif defined(PADDLE_WITH_NATIVE_FP16) -inline float16 operator+(const float16& a, const float16& b) { - float16 res; - asm volatile( - "ld1 {v0.h}[0], [%[a_ptr]]\n" - "ld1 {v1.h}[0], [%[b_ptr]]\n" - "fadd h0, h0, h1\n" - "st1 {v0.h}[0], [%[res_ptr]]\n" - : // outputs - : // inputs - [a_ptr] "r"(&(a.x)), - [b_ptr] "r"(&(b.x)), - [res_ptr] "r"(&(res.x)) - : // clobbers - "memory", "v0", "v1"); - return res; -} - -inline float16 operator-(const float16& a, const float16& b) { - float16 res; - asm volatile( - "ld1 {v0.h}[0], [%[a_ptr]]\n" - "ld1 {v1.h}[0], [%[b_ptr]]\n" - "fsub h0, h0, h1\n" - "st1 {v0.h}[0], [%[res_ptr]]\n" - : // outputs - : // inputs - [a_ptr] "r"(&(a.x)), - [b_ptr] "r"(&(b.x)), - [res_ptr] "r"(&(res.x)) - : // clobbers - "memory", "v0", "v1"); - return res; -} - -inline float16 
operator*(const float16& a, const float16& b) { - float16 res; - asm volatile( - "ld1 {v0.h}[0], [%[a_ptr]]\n" - "ld1 {v1.h}[0], [%[b_ptr]]\n" - "fmul h0, h0, h1\n" - "st1 {v0.h}[0], [%[res_ptr]]\n" - : // outputs - : // inputs - [a_ptr] "r"(&(a.x)), - [b_ptr] "r"(&(b.x)), - [res_ptr] "r"(&(res.x)) - : // clobbers - "memory", "v0", "v1"); - return res; -} - -inline float16 operator/(const float16& a, const float16& b) { - float16 res; - asm volatile( - "ld1 {v0.h}[0], [%[a_ptr]]\n" - "ld1 {v1.h}[0], [%[b_ptr]]\n" - "fdiv h0, h0, h1\n" - "st1 {v0.h}[0], [%[res_ptr]]\n" - : // outputs - : // inputs - [a_ptr] "r"(&(a.x)), - [b_ptr] "r"(&(b.x)), - [res_ptr] "r"(&(res.x)) - : // clobbers - "memory", "v0", "v1"); - return res; -} - -inline float16 operator-(const float16& a) { - float16 res; - asm volatile( - "ld1 {v0.h}[0], [%[a_ptr]]\n" - "fneg h0, h0\n" - "st1 {v0.h}[0], [%[res_ptr]]\n" - : // outputs - : // inputs - [a_ptr] "r"(&(a.x)), - [res_ptr] "r"(&(res.x)) - : // clobbers - "memory", "v0"); - return res; -} - -inline float16& operator+=(float16& a, const float16& b) { // NOLINT - a = a + b; - return a; -} - -inline float16& operator-=(float16& a, const float16& b) { // NOLINT - a = a - b; - return a; -} - -inline float16& operator*=(float16& a, const float16& b) { // NOLINT - a = a * b; - return a; -} - -inline float16& operator/=(float16& a, const float16& b) { // NOLINT - a = a / b; - return a; -} - -inline bool operator==(const float16& a, const float16& b) { - uint16_t res; - asm volatile( - "ld1 {v0.h}[0], [%[a_ptr]]\n" - "ld1 {v1.h}[0], [%[b_ptr]]\n" - "fcmeq h0, h0, h1\n" - "st1 {v0.h}[0], [%[res_ptr]]\n" - : // outputs - : // inputs - [a_ptr] "r"(&(a.x)), - [b_ptr] "r"(&(b.x)), - [res_ptr] "r"(&res) - : // clobbers - "memory", "v0", "v1"); - return (res & 0xffff) != 0; -} - -inline bool operator!=(const float16& a, const float16& b) { return !(a == b); } - -inline bool operator<(const float16& a, const float16& b) { - uint16_t res; - asm volatile( - "ld1 {v1.h}[0], [%[a_ptr]]\n" - "ld1 {v0.h}[0], [%[b_ptr]]\n" - "fcmgt h0, h0, h1\n" - "st1 {v0.h}[0], [%[res_ptr]]\n" - : // outputs - : // inputs - [a_ptr] "r"(&(a.x)), - [b_ptr] "r"(&(b.x)), - [res_ptr] "r"(&res) - : // clobbers - "memory", "v0", "v1"); - return (res & 0xffff) != 0; -} - -inline bool operator<=(const float16& a, const float16& b) { - uint16_t res; - asm volatile( - "ld1 {v1.h}[0], [%[a_ptr]]\n" - "ld1 {v0.h}[0], [%[b_ptr]]\n" - "fcmge h0, h0, h1\n" - "st1 {v0.h}[0], [%[res_ptr]]\n" - : // outputs - : // inputs - [a_ptr] "r"(&(a.x)), - [b_ptr] "r"(&(b.x)), - [res_ptr] "r"(&res) - : // clobbers - "memory", "v0", "v1"); - return (res & 0xffff) != 0; -} - -inline bool operator>(const float16& a, const float16& b) { - uint16_t res; - asm volatile( - "ld1 {v0.h}[0], [%[a_ptr]]\n" - "ld1 {v1.h}[0], [%[b_ptr]]\n" - "fcmgt h0, h0, h1\n" - "st1 {v0.h}[0], [%[res_ptr]]\n" - : // outputs - : // inputs - [a_ptr] "r"(&(a.x)), - [b_ptr] "r"(&(b.x)), - [res_ptr] "r"(&res) - : // clobbers - "memory", "v0", "v1"); - return (res & 0xffff) != 0; -} - -inline bool operator>=(const float16& a, const float16& b) { - uint16_t res; - asm volatile( - "ld1 {v0.h}[0], [%[a_ptr]]\n" - "ld1 {v1.h}[0], [%[b_ptr]]\n" - "fcmge h0, h0, h1\n" - "st1 {v0.h}[0], [%[res_ptr]]\n" - : // outputs - : // inputs - [a_ptr] "r"(&(a.x)), - [b_ptr] "r"(&(b.x)), - [res_ptr] "r"(&res) - : // clobbers - "memory", "v0", "v1"); - return (res & 0xffff) != 0; -} - -// Arithmetic operators for float16, software emulated on other CPU -#else -inline float16 
operator+(const float16& a, const float16& b) { - return float16(static_cast(a) + static_cast(b)); -} - -inline float16 operator-(const float16& a, const float16& b) { - return float16(static_cast(a) - static_cast(b)); -} - -inline float16 operator*(const float16& a, const float16& b) { - return float16(static_cast(a) * static_cast(b)); -} - -inline float16 operator/(const float16& a, const float16& b) { - return float16(static_cast(a) / static_cast(b)); -} - -inline float16 operator-(const float16& a) { - float16 res; - res.x = a.x ^ 0x8000; - return res; -} - -inline float16& operator+=(float16& a, const float16& b) { // NOLINT - a = float16(static_cast(a) + static_cast(b)); - return a; -} - -inline float16& operator-=(float16& a, const float16& b) { // NOLINT - a = float16(static_cast(a) - static_cast(b)); - return a; -} - -inline float16& operator*=(float16& a, const float16& b) { // NOLINT - a = float16(static_cast(a) * static_cast(b)); - return a; -} - -inline float16& operator/=(float16& a, const float16& b) { // NOLINT - a = float16(static_cast(a) / static_cast(b)); - return a; -} - -inline bool operator==(const float16& a, const float16& b) { - return static_cast(a) == static_cast(b); -} - -inline bool operator!=(const float16& a, const float16& b) { - return static_cast(a) != static_cast(b); -} - -inline bool operator<(const float16& a, const float16& b) { - return static_cast(a) < static_cast(b); -} - -inline bool operator<=(const float16& a, const float16& b) { - return static_cast(a) <= static_cast(b); -} - -inline bool operator>(const float16& a, const float16& b) { - return static_cast(a) > static_cast(b); -} - -inline bool operator>=(const float16& a, const float16& b) { - return static_cast(a) >= static_cast(b); -} -#endif - -HOSTDEVICE inline float16 raw_uint16_to_float16(uint16_t a) { - float16 res; - res.x = a; - return res; -} - -HOSTDEVICE inline bool(isnan)(const float16& a) { -#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return __hisnan(half(a)); -#else - return (a.x & 0x7fff) > 0x7c00; -#endif -} - -HOSTDEVICE inline bool(isinf)(const float16& a) { - return (a.x & 0x7fff) == 0x7c00; -} - -HOSTDEVICE inline bool(isfinite)(const float16& a) { - return !((isnan)(a)) && !((isinf)(a)); -} - -inline std::ostream& operator<<(std::ostream& os, const float16& a) { - os << static_cast(a); - return os; -} - -} // namespace fluid -} // namespace lite -} // namespace paddle - -namespace std { - -// Override the std::is_pod::value for float16 -// The reason is that different compilers implemented std::is_pod based on -// different C++ standards. float16 class is a plain old data in C++11 given -// that it is both trivial and standard_layout. -// However, std::is_pod in nvcc 8.0 host c++ compiler follows C++0x and is -// more restricted in that you cannot provide any customized -// constructor in float16. Hence, we override is_pod here following C++11 -// so that .cu files can be successfully compiled by nvcc. 
-template <> -struct is_pod { - static const bool value = - is_trivial::value && - is_standard_layout::value; -}; - -template <> -struct is_floating_point - : std::integral_constant< - bool, - std::is_same::type>::value> {}; -template <> -struct is_signed { - static const bool value = true; -}; - -template <> -struct is_unsigned { - static const bool value = false; -}; - -inline bool isnan(const paddle::lite::fluid::float16& a) { - return paddle::lite::fluid::isnan(a); -} - -inline bool isinf(const paddle::lite::fluid::float16& a) { - return paddle::lite::fluid::isinf(a); -} - -template <> -struct numeric_limits { - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = false; - static const bool is_exact = false; - static const bool has_infinity = true; - static const bool has_quiet_NaN = true; - static const bool has_signaling_NaN = true; - static const float_denorm_style has_denorm = denorm_present; - static const bool has_denorm_loss = false; - static const std::float_round_style round_style = std::round_to_nearest; - static const bool is_iec559 = false; - static const bool is_bounded = false; - static const bool is_modulo = false; - static const int digits = 11; - static const int digits10 = 3; - static const int max_digits10 = 5; - static const int radix = 2; - static const int min_exponent = -13; - static const int min_exponent10 = -4; - static const int max_exponent = 16; - static const int max_exponent10 = 4; - static const bool traps = true; - static const bool tinyness_before = false; - - static paddle::lite::fluid::float16(min)() { - return paddle::lite::fluid::raw_uint16_to_float16(0x400); - } - static paddle::lite::fluid::float16 lowest() { - return paddle::lite::fluid::raw_uint16_to_float16(0xfbff); - } - static paddle::lite::fluid::float16(max)() { - return paddle::lite::fluid::raw_uint16_to_float16(0x7bff); - } - static paddle::lite::fluid::float16 epsilon() { - return paddle::lite::fluid::raw_uint16_to_float16(0x0800); - } - static paddle::lite::fluid::float16 round_error() { - return paddle::lite::fluid::float16(0.5); - } - static paddle::lite::fluid::float16 infinity() { - return paddle::lite::fluid::raw_uint16_to_float16(0x7c00); - } - static paddle::lite::fluid::float16 quiet_NaN() { - return paddle::lite::fluid::raw_uint16_to_float16(0x7e00); - } - static paddle::lite::fluid::float16 signaling_NaN() { - return paddle::lite::fluid::raw_uint16_to_float16(0x7e00); - } - static paddle::lite::fluid::float16 denorm_min() { - return paddle::lite::fluid::raw_uint16_to_float16(0x1); - } -}; - -} // namespace std - -namespace Eigen { - -using float16 = paddle::lite::fluid::float16; - -template <> -struct NumTraits : GenericNumTraits { - enum { - IsSigned = true, - IsInteger = false, - IsComplex = false, - RequireInitialization = false - }; - - HOSTDEVICE static inline float16 epsilon() { - return paddle::lite::fluid::raw_uint16_to_float16(0x0800); - } - HOSTDEVICE static inline float16 dummy_precision() { return float16(1e-2f); } - HOSTDEVICE static inline float16 highest() { - return paddle::lite::fluid::raw_uint16_to_float16(0x7bff); - } - HOSTDEVICE static inline float16 lowest() { - return paddle::lite::fluid::raw_uint16_to_float16(0xfbff); - } - HOSTDEVICE static inline float16 infinity() { - return paddle::lite::fluid::raw_uint16_to_float16(0x7c00); - } - HOSTDEVICE static inline float16 quiet_NaN() { - return paddle::lite::fluid::raw_uint16_to_float16(0x7c01); - } -}; - -namespace numext { - -template <> 
-HOSTDEVICE inline bool(isnan)(const float16& a) { - return (paddle::lite::fluid::isnan)(a); -} - -template <> -HOSTDEVICE inline bool(isinf)(const float16& a) { - return (paddle::lite::fluid::isinf)(a); -} - -template <> -HOSTDEVICE inline bool(isfinite)(const float16& a) { - return (paddle::lite::fluid::isfinite)(a); -} - -template <> -HOSTDEVICE inline float16 exp(const float16& a) { - return float16(::expf(static_cast(a))); -} - -template <> -HOSTDEVICE inline float16 erf(const float16& a) { - return float16(::erff(static_cast(a))); -} - -template <> -HOSTDEVICE inline float16 log(const float16& a) { - return float16(::logf(static_cast(a))); -} - -template <> -HOSTDEVICE inline float16 tanh(const float16& a) { - return float16(::tanhf(static_cast(a))); -} - -template <> -HOSTDEVICE inline float16 sqrt(const float16& a) { - return float16(::sqrtf(static_cast(a))); -} - -template <> -HOSTDEVICE inline float16 ceil(const float16& a) { - return float16(::ceilf(static_cast(a))); -} - -template <> -HOSTDEVICE inline float16 floor(const float16& a) { - return float16(::floorf(static_cast(a))); -} - -template <> -HOSTDEVICE inline float16 round(const float16& a) { - return float16(::roundf(static_cast(a))); -} - -template <> -HOSTDEVICE inline float16 pow(const float16& a, const float16& b) { - return float16(::powf(static_cast(a), static_cast(b))); -} - -template <> -HOSTDEVICE inline float16 abs(const float16& a) { - return float16(::fabs(static_cast(a))); -} - -} // namespace numext - -} // namespace Eigen diff --git a/lite/fluid/lod.h b/lite/fluid/lod.h deleted file mode 100644 index 68068ba1d0..0000000000 --- a/lite/fluid/lod.h +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -namespace paddle { -namespace lite { -namespace fluid { -using LoD = std::vector>; - -LoD ToAbsOffset(const LoD &in) { - // the lowest level stores relative offsets - if (in.empty() || in.size() == 1) return in; - LoD result = in; - for (auto level = static_cast(in.size() - 2); level >= 0; level--) { - for (size_t i = 0; i < in[level].size(); ++i) { - size_t index = in[level][i]; - result[level][i] = result[level + 1][index]; - } - } - return result; -} -} // namespace fluid -} // namespace lite -} // namespace paddle diff --git a/lite/fluid/math.h b/lite/fluid/math.h deleted file mode 100644 index 8cc24200d3..0000000000 --- a/lite/fluid/math.h +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/hostdevice.h" - -#include "math.h" // NOLINT - -namespace paddle { -namespace operators { - -inline HOSTDEVICE platform::float16 real_exp(platform::float16 x) { - return static_cast(::expf(static_cast(x))); -} - -inline HOSTDEVICE float real_exp(float x) { return ::expf(x); } - -inline HOSTDEVICE double real_exp(double x) { return ::exp(x); } - -inline HOSTDEVICE platform::float16 real_log(platform::float16 x) { - return static_cast(::logf(static_cast(x))); -} - -inline HOSTDEVICE float real_log(float x) { return ::logf(x); } - -inline HOSTDEVICE double real_log(double x) { return ::log(x); } - -} // namespace operators -} // namespace paddle diff --git a/lite/gen_code/CMakeLists.txt b/lite/gen_code/CMakeLists.txt deleted file mode 100644 index d83657ad3e..0000000000 --- a/lite/gen_code/CMakeLists.txt +++ /dev/null @@ -1,49 +0,0 @@ -if (LITE_ON_TYNY_PUBLISH) - return() -endif() - -lite_cc_library(gen_code SRCS gen_code.cc - DEPS program op scope - cpp_op_desc - HVY_DEPS operator) -lite_cc_library(paddle_infer_gencode SRCS paddle_infer.cc DEPS program utils) - -lite_cc_test(test_gen_code SRCS gen_code_test.cc - DEPS gen_code tensor ${host_kernels} ${ops} - compatible_pb - model_parser - X86_DEPS ${x86_kernels} - ARM_DEPS ${arm_kernels} - NPU_DEPS ${npu_kernels} - CL_DEPS ${opencl_kernels} - FPGA_DEPS ${fpga_kernels} - EXCLUDE_COMPILE_DEPS "ON" - ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) - -lite_cc_library(__generated_code__ - SRCS ${CMAKE_BINARY_DIR}/lite/gen_code/__generated_code__.cc - DEPS scope op kernel paddle_infer_gencode - EXCLUDE_COMPILE_DEPS "ON" -) -if(WITH_TESTING) - add_dependencies(__generated_code__ test_gen_code) - add_dependencies(__generated_code__ extern_lite_download_lite_naive_model_tar_gz) -endif(WITH_TESTING) - -lite_cc_binary(paddle_code_generator SRCS paddle_code_generator.cc DEPS model_parser gen_code gflags) - -# TODO(xxx): fix the gen code bug on ios -if(IOS) - return() -endif() - -lite_cc_test(test_generated_code SRCS generated_code_test.cc DEPS __generated_code__ - ${ops} ${host_kernels} - X86_DEPS ${x86_kernels} - ARM_DEPS ${arm_kernels} - NPU_DEPS ${npu_kernels} - CL_DEPS ${opencl_kernels} - FPGA_DEPS ${fpga_kernels} - EXCLUDE_COMPILE_DEPS "ON" -) - diff --git a/lite/gen_code/gen_code.cc b/lite/gen_code/gen_code.cc deleted file mode 100644 index 0d8f4d0d19..0000000000 --- a/lite/gen_code/gen_code.cc +++ /dev/null @@ -1,223 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/gen_code/gen_code.h" -#include -#include -#include - -namespace paddle { -namespace lite { -namespace gencode { - -void Module::AddWeight(const std::string &name, const TensorRepr &tensor) { - auto w_name = WeightUniqueName(); - Line(string_format("// Create weight: %s", name.c_str())); - // auto* w0 = scope.Var("w0")->GetMutable(); - Line(string_format("auto* %s = scope->Var(%s)->GetMutable();", - w_name.c_str(), - Repr(name).c_str())); - // lite::DDim w_ddim({1, 2}) - Line(string_format("lite::DDim %s_ddim(std::vector(%s));", - w_name.c_str(), - tensor.ddim.repr().c_str())); - // std::vector w_data({}); - auto w_data_repr = DataRepr( - std::string(static_cast(tensor.raw_data), tensor.num_bytes), - tensor.dtype); - Line(string_format("std::vector<%s> %s_data({%s});", - PrecisionToStr(tensor.dtype).c_str(), - w_name.c_str(), - w_data_repr.c_str())); - // w0->Assign(w0_data.data(), w0_ddim); - Line(string_format( - "%s->Assign<%s, lite::DDim, TARGET(kX86)>(%s_data.data(), %s_ddim);", - w_name.c_str(), - PrecisionToStr(tensor.dtype).c_str(), - w_name.c_str(), - w_name.c_str())); - Line(""); -} - -void Module::AddHeaderIncludeGenCode() { - Line(""); - Line("#include "); - Line("#include "); - Line("#include \"lite/core/tensor.h\""); - Line("#include \"lite/core/context.h\""); - Line("#include \"lite/gen_code/paddle_infer.h\""); - Line("#include \"lite/core/op_registry.h\""); - Line("#include \"lite/core/scope.h\""); - Line("#include \"lite/model_parser/cpp/op_desc.h\""); - Line(""); - Line(""); -} - -std::string Module::DataRepr(const std::string &raw_data, PrecisionType dtype) { - STL::stringstream ss; - switch (dtype) { - case PRECISION(kFloat): { - const float *raw = reinterpret_cast(raw_data.c_str()); - int num_elems = raw_data.size() / sizeof(float); - if (num_elems) { - for (int i = 0; i < num_elems - 1; i++) { - ss << raw[i] << ","; - } - ss << raw[num_elems - 1]; - } - } break; - - default: - LOG(FATAL) << "Unsupported type " << PrecisionToStr(dtype); - } - return ss.str(); -} - -void Module::AddOpDescHelper(const std::string &op_id, - const cpp::OpDesc &desc) { - std::string desc_var = op_id + "_desc"; - Line(string_format("lite::cpp::OpDesc %s;", desc_var.c_str())); - auto vec_str_repr = [](const std::vector &vec) { - return Repr(vec); - }; - for (auto &item : desc.inputs()) { - Line(string_format("%s.SetInput(%s, %s);", - desc_var.c_str(), - Repr(item.first).c_str(), - vec_str_repr(item.second).c_str())); - } - - for (auto &item : desc.outputs()) { - Line(string_format("%s.SetOutput(%s, %s);", - desc_var.c_str(), - Repr(item.first).c_str(), - vec_str_repr(item.second).c_str())); - } - - auto attr_repr = [&](const std::string &name) -> std::string { - using AttrType = OpDescAPI::AttrType; - auto type = desc.GetAttrType(name); - - switch (type) { - case AttrType::INT: - return std::to_string(desc.GetAttr(name)); - case AttrType::FLOAT: - return std::to_string(desc.GetAttr(name)); - case AttrType::BOOLEAN: - return std::to_string(desc.GetAttr(name)); - case AttrType::STRING: - return "\"" + desc.GetAttr(name) + "\""; - case AttrType::FLOATS: { - auto vals = desc.GetAttr>(name); - return "{" + Join(vals, ",") + "}"; - } - case AttrType::INTS: { - auto vals = desc.GetAttr>(name); - return "{" + Join(vals, ",") + "}"; - } - - case AttrType::STRINGS: { - std::vector tmp; - auto vals = desc.GetAttr>(name); - std::transform(vals.begin(), - vals.end(), - 
std::back_inserter(tmp), - [](const std::string &x) { return Repr(x); }); - return "{" + Join(tmp, ",") + "}"; - } - default: - LOG(FATAL) << "Unsupported attribute type: " << static_cast(type); - } - return ""; - }; - - auto attr_type_repr = [&](const std::string &name) -> std::string { - using AttrType = OpDescAPI::AttrType; - auto type = desc.GetAttrType(name); - - switch (type) { - case AttrType::INT: - return "int"; - case AttrType::FLOAT: - return "float"; - case AttrType::BOOLEAN: - return "bool"; - case AttrType::STRING: - return "std::string"; - case AttrType::FLOATS: - return "std::vector"; - case AttrType::STRINGS: - return "std::vector"; - case AttrType::INTS: - return "std::vector"; - default: - LOG(FATAL) << "Unsupported attribute type: " << static_cast(type); - } - - return "unk_t"; - }; - for (auto &item : desc.AttrNames()) { - // Drop the python information. - if (item == "op_callstack") continue; - auto attr_type = attr_type_repr(item); - auto attr_val = attr_repr(item); - Line(string_format("%s.SetAttr<%s>(%s, %s);", // - desc_var.c_str(), - attr_type.c_str(), - Repr(item).c_str(), - attr_val.c_str())); - } -} - -void Module::AddOp(const cpp::OpDesc &op) { - auto op_name = OpUniqueName(); - AddOpDescHelper(op_name, op); - - LOG(INFO) << "add op " << op_name; - - Line(string_format("// Create Op: %s", op.Type().c_str())); - - Line(string_format("auto %s = lite::LiteOpRegistry::Global().Create(\"%s\");", - op_name.c_str(), - op.Type().c_str())); - - CHECK(op.HasAttr(kKernelTypeAttr)) - << "the kernel type should be specified before generate code."; - auto kernel_type = op.GetAttr(kKernelTypeAttr); - Line(string_format("%s->Attach(%s, exec_scope);", - op_name.c_str(), - (op_name + "_desc").c_str())); - - // Create kernel - auto kernel_name = KernelUniqueName(); - Line(string_format( - "auto %s = std::move(%s->CreateKernels(valid_places, \"%s\").front());", - kernel_name.c_str(), - op_name.c_str(), - kernel_type.c_str())); - - // Set Context for kernel - // clang-format off - Line(string_format("%s->SetContext(lite::ContextScheduler::Global().NewContext(%s->target()));", kernel_name.c_str(), kernel_name.c_str())); // NOLINT - // clang-format on - - Line(string_format("ops.push_back(%s);", op_name.c_str())); - Line(string_format("kernels.push_back(std::move(%s));", kernel_name.c_str())); - - op_kinds_.insert(op.Type()); - kernel_kinds_.insert(kernel_type); -} -} // namespace gencode -} // namespace lite -} // namespace paddle diff --git a/lite/gen_code/gen_code.h b/lite/gen_code/gen_code.h deleted file mode 100644 index 7dea36636a..0000000000 --- a/lite/gen_code/gen_code.h +++ /dev/null @@ -1,258 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once -#include -#include -#include -#include "lite/core/framework.pb.h" -#include "lite/core/program.h" -#include "lite/core/target_wrapper.h" -#include "lite/core/tensor.h" -#include "lite/model_parser/compatible_pb.h" -#include "lite/model_parser/cpp/op_desc.h" -#include "lite/model_parser/desc_apis.h" -#include "lite/model_parser/pb/op_desc.h" -#include "lite/utils/all.h" - -namespace paddle { -namespace lite { -namespace gencode { - -struct TensorRepr { - TensorRepr() = default; - TensorRepr(PrecisionType dtype, - const std::vector &ddim, - void *raw_data, - size_t num_bytes) - : dtype(dtype), ddim(ddim), raw_data(raw_data), num_bytes(num_bytes) {} - - PrecisionType dtype; - lite::DDim ddim; - const void *raw_data; - size_t num_bytes{}; -}; - -class Module { - std::vector ops; - std::vector weights; - std::vector tmp_vars_; - STL::stringstream stream_; - std::set kernel_kinds_; - std::set op_kinds_; - - int line_indent_{}; - const int indent_unit_{2}; - - public: - void NewOp(const cpp::OpDesc &desc) { ops.push_back(desc); } - void NewWeight(const TensorRepr &x) { weights.push_back(x); } - void NewTmpVar(const std::string &x) { tmp_vars_.push_back(x); } - - STL::stringstream &stream() { return stream_; } - - void AddHeaderIncludeGenCode(); - - void AddNamespaceBegin() { - Line("namespace paddle {"); - Line("namespace gencode{"); - Line(""); - } - - void AddNamespaceEnd() { - Line(""); - Line("} // namespace gencode"); - Line("} // namespace paddle"); - } - - void AddInitFuncBegin() { - Line("void PaddlePredictor::Init() {"); - Line(""); - IncIndent(); - } - - void AddInitFuncEnd() { - DecIndent(); - Line(""); - Line("}"); - } - - void AddScopeDecl() { - Line("lite::Scope* scope = static_cast(raw_scope_);"); - - // clang-format off - Line("lite::Scope* exec_scope = static_cast(raw_exe_scope_);"); // NOLINT - // clang-format on - - // Create feed and fetch in exec_scope. 
- Line(string_format("exec_scope->Var(%s);", Repr("feed").c_str())); - Line(string_format("exec_scope->Var(%s);", Repr("fetch").c_str())); - } - - void AddValidPlaceDecl() { - // clang-format off - Line("std::vector valid_places({lite::Place({TARGET(kX86), PRECISION(kFloat), DATALAYOUT(kNCHW)}), lite::Place({TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny)})});"); // NOLINT - // clang-format on - } - - void AddMemberCast() { - Line("// Cast the raw members"); - // clang-format off - Line(string_format("auto& ops = *static_cast>*>(raw_ops_);")); // NOLINT - Line(string_format("auto& kernels = *static_cast>*>(raw_kernels_);")); // NOLINT - // clang-format on - Line(""); - } - - void AddWeight(const std::string &name, const TensorRepr &tensor); - - void AddTmpVar(const std::string &x) { - Line(string_format("// Create temporary variable: %s", x.c_str())); - Line(string_format("exec_scope->Var(%s);", Repr(x).c_str())); - Line(""); - } - - void AddOp(const cpp::OpDesc &op); - - void AddOpDescHelper(const std::string &op_id, const cpp::OpDesc &desc); - - void AddOpCompileDeps() { - Line(""); - Line("// Add Operator compile deps"); - for (auto &op_type : op_kinds_) { - Line(string_format("USE_LITE_OP(%s)", op_type.c_str())); - } - Line(""); - } - void AddKernelCompileDeps() { - Line("// Add Kernel compile deps"); - - std::string op_type, alias; - Place place; - for (auto &kernel_type : kernel_kinds_) { - KernelBase::ParseKernelType(kernel_type, &op_type, &alias, &place); - Line(string_format("USE_LITE_KERNEL(%s, %s, %s, %s, %s)", // - op_type.c_str(), // - TargetRepr(place.target).c_str(), - PrecisionRepr(place.precision).c_str(), - DataLayoutRepr(place.layout).c_str(), - alias.c_str())); - } - } - - private: - std::string WeightUniqueName() const { - return "w_" + std::to_string(weight_counter_++); - } - std::string TmpVarUniqueName() const { - return "tmp_" + std::to_string(tmp_var_counter_++); - } - std::string OpUniqueName() const { - return "op_" + std::to_string(op_counter_++); - } - std::string KernelUniqueName() const { - return "kernel_" + std::to_string(kernel_counter_++); - } - - std::string DataRepr(const std::string &raw_data, PrecisionType dtype); - - void IncIndent() { line_indent_++; } - void DecIndent() { line_indent_--; } - - void Line(const std::string &x) { - std::string indent_str(line_indent_ * indent_unit_, ' '); - stream() << indent_str << x << "\n"; - } - - private: - mutable int weight_counter_{}; - mutable int tmp_var_counter_{}; - mutable int op_counter_{}; - mutable int kernel_counter_{}; -}; - -class ProgramCodeGenerator { - public: - ProgramCodeGenerator(const framework::proto::ProgramDesc &program, - const lite::Scope &exec_scope) - : program_(program), exec_scope_(exec_scope) {} - - std::string GenCode() { - Module m; - m.AddHeaderIncludeGenCode(); - m.AddNamespaceBegin(); - m.AddInitFuncBegin(); - m.AddMemberCast(); - m.AddScopeDecl(); - m.AddValidPlaceDecl(); - - AddWeights(&m); - AddTmpVars(&m); - AddOps(&m); - - m.AddInitFuncEnd(); - m.AddNamespaceEnd(); - - m.AddOpCompileDeps(); - m.AddKernelCompileDeps(); - - return m.stream().str(); - } - - void AddWeights(Module *m) { - for (auto &var : program_.blocks(0).vars()) { - if (var.persistable()) { - auto name = var.name(); - if (name == "feed" || name == "fetch") continue; - const auto &tensor = exec_scope_.FindVar(name)->Get(); - TensorRepr repr; - TensorToRepr(tensor, &repr); - m->AddWeight(name, repr); - } - } - } - void AddTmpVars(Module *m) { - for (auto &var : program_.blocks(0).vars()) { - if 
(!var.persistable()) { - m->AddTmpVar(var.name()); - } - } - } - void AddOps(Module *m) { - for (auto &pb_op : program_.blocks(0).ops()) { - auto op = pb_op; - lite::pb::OpDesc pb_desc(&op); - lite::cpp::OpDesc cpp_desc; - TransformOpDescAnyToCpp(pb_desc, &cpp_desc); - m->AddOp(cpp_desc); - } - } - - private: - void TensorToRepr(const lite::Tensor &tensor, TensorRepr *repr) { - repr->ddim = tensor.dims(); - // TODO(Superjomn) support other types. - repr->dtype = PRECISION(kFloat); - repr->raw_data = tensor.data(); - repr->num_bytes = repr->ddim.production() * sizeof(float); - } - - private: - const framework::proto::ProgramDesc &program_; - const lite::Scope &exec_scope_; -}; - -} // namespace gencode -} // namespace lite -} // namespace paddle diff --git a/lite/gen_code/gen_code_test.cc b/lite/gen_code/gen_code_test.cc deleted file mode 100644 index d0b1c1f8b2..0000000000 --- a/lite/gen_code/gen_code_test.cc +++ /dev/null @@ -1,168 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/gen_code/gen_code.h" -#include -#include -#include -#include -#include -#include -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/core/context.h" -#include "lite/core/scope.h" -#include "lite/core/tensor.h" -#include "lite/model_parser/compatible_pb.h" -#include "lite/model_parser/cpp/op_desc.h" -#include "lite/model_parser/model_parser.h" -#include "lite/model_parser/pb/program_desc.h" - -DEFINE_string(optimized_model, "", ""); -DEFINE_string(generated_code_file, "__generated_code__.cc", ""); - -namespace paddle { -namespace lite { -namespace gencode { - -// Manually construct a program. -TEST(gen_code, manual) { - // For holding the weights. - lite::Scope scope; - // For holding the temporary variables. - auto &tmp_scope = scope.NewScope(); - - // Create weight variables. - auto *w0 = scope.Var("w0")->GetMutable(); - // Create temporary variables. - auto *a = tmp_scope.Var("x")->GetMutable(); - tmp_scope.Var("out")->GetMutable(); - - // Set weights. 
- std::vector w0_data({0, 1, 2, 3}); - std::vector a_data({0, 1, 2, 3}); -#ifdef LITE_WITH_ARM - w0->Assign( - w0_data.data(), lite::DDim{std::vector({2, 2})}); - a->Assign( - a_data.data(), lite::DDim{std::vector({2, 2})}); -#else - w0->Assign( - w0_data.data(), lite::DDim{std::vector({2, 2})}); - a->Assign( - a_data.data(), lite::DDim{std::vector({2, 2})}); -#endif - - std::vector valid_places({ -#ifdef LITE_WITH_ARM - Place{TARGET(kARM), PRECISION(kFloat)}, -#else - Place{TARGET(kX86), PRECISION(kFloat)}, -#endif - Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kHost), PRECISION(kAny)}, - }); - auto mul_op = LiteOpRegistry::Global().Create("mul"); - cpp::OpDesc mul_op_desc; - mul_op_desc.SetType("mul"); - mul_op_desc.SetInput("X", {"x"}); - mul_op_desc.SetInput("Y", {"w0"}); - mul_op_desc.SetAttr("x_num_col_dims", 1); - mul_op_desc.SetAttr("y_num_col_dims", 1); - mul_op_desc.SetOutput("Out", {"out"}); - - mul_op->Attach(mul_op_desc, &tmp_scope); - auto mul_kernel = std::move(mul_op->CreateKernels(valid_places).front()); -#ifdef LITE_WITH_ARM - auto fc_ctx = ContextScheduler::Global().NewContext(TARGET(kARM)); -#else - auto fc_ctx = ContextScheduler::Global().NewContext(TARGET(kX86)); -#endif - mul_op->CheckShape(); - mul_op->InferShape(); - mul_kernel->SetContext(std::move(fc_ctx)); - mul_kernel->Launch(); -} - -TEST(gen_code, auto_gen) { - std::vector w0_data({0, 1, 2, 3}); - TensorRepr w0(PRECISION(kFloat), - std::vector({2, 2}), - w0_data.data(), - w0_data.size() * sizeof(float)); - - std::vector w1_data({0.01, 1.2, 2.3, 3.4, 1.1, 2.2}); - TensorRepr w1(PRECISION(kFloat), - std::vector({3, 2}), - w1_data.data(), - w1_data.size() * sizeof(float)); - - cpp::OpDesc op0; - op0.SetType("mul"); - op0.SetInput("X", {"a", "b"}); - op0.SetOutput("Out", {"out0"}); - op0.SetAttr("desc", "this is a desc"); - op0.SetAttr("x_col", 1); - op0.SetAttr("y_col", 2); -#ifdef LITE_WITH_ARM - op0.SetAttr(kKernelTypeAttr, "arm"); -#else - op0.SetAttr(kKernelTypeAttr, "x86"); -#endif - - gencode::Module module; - module.AddHeaderIncludeGenCode(); - - module.AddNamespaceBegin(); - module.AddInitFuncBegin(); - - module.AddMemberCast(); - - module.AddWeight("w0", w0); - module.AddWeight("w1", w1); - module.AddTmpVar("a"); - module.AddTmpVar("b"); - - module.AddOp(op0); - - module.AddInitFuncEnd(); - module.AddNamespaceEnd(); - - LOG(INFO) << module.stream().str(); -} - -TEST(gen_code, optimized_program) { - lite::Scope scope; - cpp::ProgramDesc cpp_desc; - std::string model_file = FLAGS_optimized_model + "/model"; - std::string param_file = FLAGS_optimized_model + "/params"; - LoadModelPb( - FLAGS_optimized_model, model_file, param_file, &scope, &cpp_desc, true); - - framework::proto::ProgramDesc pb_proto_desc; - lite::pb::ProgramDesc pb_desc(&pb_proto_desc); - TransformProgramDescCppToAny(cpp_desc, &pb_desc); - - ProgramCodeGenerator codegen(pb_proto_desc, scope); - - std::ofstream file(FLAGS_generated_code_file); - - file << codegen.GenCode(); - - file.close(); -} - -} // namespace gencode -} // namespace lite -} // namespace paddle diff --git a/lite/gen_code/generated_code_test.cc b/lite/gen_code/generated_code_test.cc deleted file mode 100644 index 199ba579d4..0000000000 --- a/lite/gen_code/generated_code_test.cc +++ /dev/null @@ -1,87 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include "lite/gen_code/paddle_infer.h" -#include "lite/utils/cp_logging.h" - -namespace paddle { -namespace lite { - -TEST(PaddlePredictor, Init) { - gencode::PaddlePredictor predictor; - predictor.Init(); -} - -#ifdef LITE_WITH_X86 -TEST(PaddlePredictor, RunX86) { - gencode::PaddlePredictor predictor; - predictor.Init(); - - LOG(INFO) << "run the generated code"; - auto input_tensor = predictor.GetInput(0); - input_tensor->Resize(std::vector({100, 100})); - auto* data = input_tensor->mutable_data(); - for (int i = 0; i < 100 * 100; i++) { - data[i] = i; - } - - predictor.Run(); - - auto output_tensor = predictor.GetOutput(0); - LOG(INFO) << "output: " << output_tensor->data()[0]; -} -#endif - -#ifdef LITE_WITH_ARM -TEST(PaddlePredictor, RunARM) { - gencode::PaddlePredictor predictor; - predictor.Init(); - - LOG(INFO) << "run the generated code"; - auto input_tensor = predictor.GetInput(0); - input_tensor->Resize(std::vector({1, 100})); - auto* data = input_tensor->mutable_data(); - for (int i = 0; i < 100; i++) { - data[i] = 1; - } - - predictor.Run(); - - std::vector result({0.4350058, - -0.6048313, - -0.29346266, - 0.40377066, - -0.13400325, - 0.37114543, - -0.3407839, - 0.14574292, - 0.4104212, - 0.8938774}); - - auto output_tensor = predictor.GetOutput(0); - auto output_shape = output_tensor->shape(); - ASSERT_EQ(output_shape.size(), 2); - ASSERT_EQ(output_shape[0], 1); - ASSERT_EQ(output_shape[1], 500); - - int step = 50; - for (int i = 0; i < result.size(); i += step) { - EXPECT_NEAR(output_tensor->data()[i], result[i], 1e-6); - } -} -#endif - -} // namespace lite -} // namespace paddle diff --git a/lite/gen_code/paddle_code_generator.cc b/lite/gen_code/paddle_code_generator.cc deleted file mode 100644 index 578c869382..0000000000 --- a/lite/gen_code/paddle_code_generator.cc +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include "lite/gen_code/gen_code.h" -#include "lite/model_parser/model_parser.h" -#include "lite/model_parser/pb/program_desc.h" - -DEFINE_string(optimized_model, "", ""); -DEFINE_string(generated_code_file, "__generated_code__.cc", ""); - -namespace paddle { -namespace lite { -namespace gencode { - -void GenCode(const std::string& model_dir, const std::string& out_file) { - lite::Scope scope; - cpp::ProgramDesc cpp_desc; - std::string model_file = model_dir + "/model"; - std::string param_file = model_dir + "/params"; - LoadModelPb(model_dir, model_file, param_file, &scope, &cpp_desc, true); - - framework::proto::ProgramDesc pb_proto_desc; - lite::pb::ProgramDesc pb_desc(&pb_proto_desc); - TransformProgramDescCppToAny(cpp_desc, &pb_desc); - - ProgramCodeGenerator codegen(pb_proto_desc, scope); - - std::ofstream file(out_file); - - file << codegen.GenCode(); - - file.close(); -} - -} // namespace gencode -} // namespace lite -} // namespace paddle - -int main(int argc, char** argv) { - gflags::ParseCommandLineFlags(&argc, &argv, false); - paddle::lite::gencode::GenCode(FLAGS_optimized_model, - FLAGS_generated_code_file); - return 0; -} diff --git a/lite/gen_code/paddle_infer.cc b/lite/gen_code/paddle_infer.cc deleted file mode 100644 index 180e75e1a6..0000000000 --- a/lite/gen_code/paddle_infer.cc +++ /dev/null @@ -1,145 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/gen_code/paddle_infer.h" -#include "lite/core/op_lite.h" -#include "lite/core/tensor.h" - -namespace paddle { -namespace gencode { - -void Tensor::Resize(const Tensor::ddim_t &shape) { - CHECK(raw_mutable_tensor_); - auto *tensor = static_cast(raw_mutable_tensor_); - tensor->Resize(shape); -} - -std::vector Tensor::shape() const { - CHECK(raw_tensor_); - auto *tensor = static_cast(raw_tensor_); - return tensor->dims().Vectorize(); -} - -#define FOR_EACH_TYPE(HANDLE) \ - HANDLE(int); \ - HANDLE(float); \ - HANDLE(int8_t); \ - HANDLE(int64_t); - -#define IMPL_DATA(T) \ - template <> \ - const T *Tensor::data() const { \ - CHECK(raw_tensor_); \ - const auto *tensor = static_cast(raw_tensor_); \ - return tensor->data(); \ - } -FOR_EACH_TYPE(IMPL_DATA); -#undef IMPL_DATA - -#define IMPL_MUTABLE_DATA(T) \ - template <> \ - T *Tensor::mutable_data() { \ - CHECK(raw_mutable_tensor_); \ - auto *tensor = static_cast(raw_mutable_tensor_); \ - return tensor->mutable_data(); \ - } -FOR_EACH_TYPE(IMPL_MUTABLE_DATA); -#undef IMPL_MUTABLE_DATA - -PaddlePredictor::PaddlePredictor() { - raw_ops_ = new std::vector>; - raw_kernels_ = new std::vector>; - raw_scope_ = new lite::Scope; - raw_exe_scope_ = &(static_cast(raw_scope_)->NewScope()); -} - -std::unique_ptr PaddlePredictor::GetTensor( - const std::string &id) const { - auto *exe_scope = static_cast(raw_exe_scope_); - const auto *var = exe_scope->FindVar(id); - const auto &tensor = var->Get(); - return std::unique_ptr(new Tensor(&tensor, nullptr)); -} - -std::unique_ptr PaddlePredictor::GetMutableTensor( - const std::string &id) { - auto *exe_scope = static_cast(raw_exe_scope_); - auto *var = exe_scope->FindVar(id); - auto *tensor = var->GetMutable(); - return std::unique_ptr(new Tensor(nullptr, tensor)); -} - -#define CAST_OPS \ - auto *ops = \ - static_cast> *>(raw_ops_); -#define CAST_KERNELS \ - auto *kernels = \ - static_cast> *>( \ - raw_kernels_); -#define CAST_SCOPE auto *scope = static_cast(raw_scope_); - -PaddlePredictor::~PaddlePredictor() { - CAST_OPS - CAST_KERNELS - CAST_SCOPE - - if (ops) { - delete ops; - } - if (kernels) { - delete kernels; - } - if (scope) { - delete scope; - } -} - -void PaddlePredictor::Run() { - CAST_OPS - CAST_KERNELS - - CHECK(ops); - CHECK(kernels); - CHECK_EQ(ops->size(), kernels->size()); - - for (size_t i = 0; i < ops->size(); i++) { - LOG(INFO) << "Running the " << i << "-th operator"; - ops->at(i)->InferShape(); - kernels->at(i)->Launch(); - } -} - -std::unique_ptr PaddlePredictor::GetInput(size_t offset) { - auto *exec_scope = static_cast(raw_exe_scope_); - auto *_feed_list = exec_scope->FindVar("feed"); - CHECK(_feed_list) << "no feed variable in exec_scope"; - auto *feed_list = _feed_list->GetMutable>(); - if (offset >= feed_list->size()) { - feed_list->resize(offset + 1); - } - - return std::unique_ptr(new Tensor(nullptr, &feed_list->at(offset))); -} - -std::unique_ptr PaddlePredictor::GetOutput(size_t offset) { - auto *exec_scope = static_cast(raw_exe_scope_); - auto *_fetch_list = exec_scope->FindVar("fetch"); - CHECK(_fetch_list) << "no fatch variable in exec_scope"; - auto &fetch_list = *_fetch_list->GetMutable>(); - CHECK_LT(offset, fetch_list.size()) << "offset " << offset << " overflow"; - return std::unique_ptr(new Tensor(&fetch_list.at(offset), nullptr)); -} - -} // namespace gencode -} // namespace paddle diff --git a/lite/gen_code/paddle_infer.h b/lite/gen_code/paddle_infer.h deleted file mode 100644 index e01ffc25e2..0000000000 --- a/lite/gen_code/paddle_infer.h +++ /dev/null 
@@ -1,72 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include - -namespace paddle { -namespace gencode { - -/// Zero Copy Tensor. -class Tensor { - public: - using ddim_t = std::vector; - - Tensor(const void *raw_tensor, void *raw_mutable_tensor) - : raw_tensor_(raw_tensor), raw_mutable_tensor_(raw_mutable_tensor) {} - - void Resize(const ddim_t &shape); - template - const T *data() const; - template - T *mutable_data(); - - ddim_t shape() const; - - private: - const void *raw_tensor_; - void *raw_mutable_tensor_{}; -}; - -/* - * Predictor for the generated code. - */ -class PaddlePredictor { - public: - void Init(); - - std::unique_ptr GetTensor(const std::string &id) const; - std::unique_ptr GetMutableTensor(const std::string &id); - - // Get offset-th col of feed. - std::unique_ptr GetInput(size_t offset); - - std::unique_ptr GetOutput(size_t offset); - - void Run(); - - PaddlePredictor(); - ~PaddlePredictor(); - - private: - void *raw_ops_; - void *raw_kernels_; - void *raw_scope_{}; - void *raw_exe_scope_{}; // raw_exe_scope is not owned. -}; - -} // namespace gencode -} // namespace paddle diff --git a/lite/kernels/CMakeLists.txt b/lite/kernels/CMakeLists.txt deleted file mode 100644 index 1996f50133..0000000000 --- a/lite/kernels/CMakeLists.txt +++ /dev/null @@ -1,11 +0,0 @@ -message(STATUS "add lite kernels") - -set(lite_kernel_deps type_system kernel op op_registry context tensor any CACHE INTERNAL "" FORCE) - -add_subdirectory(host) -add_subdirectory(arm) -add_subdirectory(cuda) -add_subdirectory(x86) -add_subdirectory(opencl) -add_subdirectory(fpga) -add_subdirectory(npu) diff --git a/lite/kernels/arm/CMakeLists.txt b/lite/kernels/arm/CMakeLists.txt deleted file mode 100644 index 91550476d6..0000000000 --- a/lite/kernels/arm/CMakeLists.txt +++ /dev/null @@ -1,95 +0,0 @@ -add_kernel(fc_compute_arm ARM basic SRCS fc_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(activation_compute_arm ARM basic SRCS activation_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(mul_compute_arm ARM basic SRCS mul_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(matmul_compute_arm ARM basic SRCS matmul_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(scale_compute_arm ARM basic SRCS scale_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(softmax_compute_arm ARM basic SRCS softmax_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(conv_compute_arm ARM basic SRCS conv_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(batch_norm_compute_arm ARM basic SRCS batch_norm_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(elementwise_compute_arm ARM basic SRCS elementwise_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(lrn_compute_arm ARM basic SRCS lrn_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(decode_bboxes_compute_arm ARM basic SRCS decode_bboxes_compute.cc DEPS ${lite_kernel_deps} math_arm) 
-add_kernel(pool_compute_arm ARM basic SRCS pool_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(split_compute_arm ARM basic SRCS split_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(concat_compute_arm ARM basic SRCS concat_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(pad2d_compute_arm ARM basic SRCS pad2d_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(prior_box_compute_arm ARM basic SRCS prior_box_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(density_prior_box_compute_arm ARM basic SRCS density_prior_box_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(negative_compute_arm ARM basic SRCS negative_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(crop_compute_arm ARM basic SRCS crop_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(dropout_compute_arm ARM basic SRCS dropout_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(calib_compute_arm ARM basic SRCS calib_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(transpose_compute_arm ARM basic SRCS transpose_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(power_compute_arm ARM basic SRCS power_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(yolo_box_compute_arm ARM basic SRCS yolo_box_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(shuffle_channel_compute_arm ARM basic SRCS shuffle_channel_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(argmax_compute_arm ARM basic SRCS argmax_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(axpy_compute_arm ARM basic SRCS axpy_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(conv_transpose_compute_arm ARM basic SRCS conv_transpose_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(norm_compute_arm ARM basic SRCS norm_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(interpolate_compute_arm ARM basic SRCS interpolate_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(box_coder_compute_arm ARM basic SRCS box_coder_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(shape_compute_arm ARM basic SRCS shape_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(slice_compute_arm ARM basic SRCS slice_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(cast_compute_arm ARM basic SRCS cast_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(squeeze_compute_arm ARM basic SRCS squeeze_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(expand_compute_arm ARM basic SRCS expand_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(reduce_max_compute_arm ARM basic SRCS reduce_max_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(sequence_expand_compute_arm ARM basic SRCS sequence_expand_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(im2sequence_compute_arm ARM basic SRCS im2sequence_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(sequence_pool_compute_arm ARM basic SRCS sequence_pool_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(reduce_mean_compute_arm ARM basic SRCS reduce_mean_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(stack_compute_arm ARM basic SRCS stack_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(assign_compute_arm ARM basic SRCS assign_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(affine_channel_compute_arm ARM basic SRCS affine_channel_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(anchor_generator_compute_arm ARM basic SRCS anchor_generator_compute.cc DEPS ${lite_kernel_deps} math_arm) 
-add_kernel(generate_proposals_compute_arm ARM basic SRCS generate_proposals_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(roi_align_compute_arm ARM basic SRCS roi_align_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(box_clip_compute_arm ARM basic SRCS box_clip_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(assign_value_compute_arm ARM basic SRCS assign_value_compute.cc DEPS ${lite_kernel_deps} math_arm) - -# for OCR specific -add_kernel(gru_unit_compute_arm ARM extra SRCS gru_unit_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(gru_compute_arm ARM extra SRCS gru_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(beam_search_decode_compute_arm ARM extra SRCS beam_search_decode_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(lookup_table_compute_arm ARM extra SRCS lookup_table_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(logical_compute_arm ARM extra SRCS logical_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(sequence_softmax_compute_arm ARM extra SRCS sequence_softmax_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(less_than_arm ARM extra SRCS compare_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(while_compute_arm ARM extra SRCS while_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(compare_compute_arm ARM extra SRCS compare_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(topk_compute_arm ARM extra SRCS topk_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(increment_compute_arm ARM extra SRCS increment_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(write_to_array_compute_arm ARM extra SRCS write_to_array_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(read_from_array_compute_arm ARM extra SRCS read_from_array_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(beam_search_compute_arm ARM extra SRCS beam_search_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(fill_constant_compute_arm ARM extra SRCS fill_constant_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(lod_reset_compute_arm ARM extra SRCS lod_reset_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(is_empty_compute_arm ARM extra SRCS is_empty_compute.cc DEPS ${lite_kernel_deps} math_arm) - -# NOTE we leave the add_kernel not protected by LITE_WITH_LIGHT_WEIGHT_FRAMEWORK so that all the kernels will be registered -# to the model_optimize_tool. 
-if(NOT (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)) - return() -endif() - -message(STATUS "compile with lite ARM kernels") - - -lite_cc_test(test_fc_compute_arm SRCS fc_compute_test.cc DEPS fc_compute_arm math_arm) -lite_cc_test(test_scale_compute_arm SRCS scale_compute_test.cc DEPS scale_compute_arm) -lite_cc_test(test_softmax_compute_arm SRCS softmax_compute_test.cc DEPS softmax_compute_arm) -lite_cc_test(test_conv_compute_arm SRCS conv_compute_test.cc DEPS conv_compute_arm) -lite_cc_test(test_batch_norm_compute_arm SRCS batch_norm_compute_test.cc DEPS batch_norm_compute_arm) -lite_cc_test(test_elementwise_compute_arm SRCS elementwise_compute_test.cc DEPS elementwise_compute_arm) -lite_cc_test(test_lrn_compute_arm SRCS lrn_compute_test.cc DEPS lrn_compute_arm) -lite_cc_test(test_decode_bboxes_compute_arm SRCS decode_bboxes_compute_test.cc DEPS decode_bboxes_compute_arm) -lite_cc_test(test_pool_compute_arm SRCS pool_compute_test.cc DEPS pool_compute_arm) -lite_cc_test(test_mul_compute_arm SRCS mul_compute_test.cc DEPS mul_compute_arm) -lite_cc_test(test_split_compute_arm SRCS split_compute_test.cc DEPS split_compute_arm) -lite_cc_test(test_concat_compute_arm SRCS concat_compute_test.cc DEPS concat_compute_arm) -lite_cc_test(test_dropout_compute_arm SRCS dropout_compute_test.cc DEPS dropout_compute_arm) -lite_cc_test(test_transpose_compute_arm SRCS transpose_compute_test.cc DEPS transpose_compute_arm COMPILE_LEVEL extra) -lite_cc_test(test_argmax_compute_arm SRCS argmax_compute_test.cc DEPS argmax_compute_arm) -lite_cc_test(test_axpy_compute_arm SRCS axpy_compute_test.cc DEPS axpy_compute_arm) -lite_cc_test(test_conv_transpose_compute_arm SRCS conv_transpose_compute_test.cc DEPS conv_transpose_compute_arm) diff --git a/lite/kernels/arm/activation_compute.cc b/lite/kernels/arm/activation_compute.cc deleted file mode 100644 index 406ec530ac..0000000000 --- a/lite/kernels/arm/activation_compute.cc +++ /dev/null @@ -1,247 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/kernels/arm/activation_compute.h" -#include "lite/backends/arm/math/funcs.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -void ReluCompute::Run() { - auto& param = this->Param(); - auto& ctx = this->ctx_->template As(); - auto x_dims = param.X->dims(); - auto x_data = param.X->data(); - auto output_data = param.Out->mutable_data(); - lite::arm::math::act_relu( - x_data, output_data, x_dims.production(), ctx.threads()); -} - -void LeakyReluCompute::Run() { - auto& param = this->Param(); - auto& ctx = this->ctx_->template As(); - auto x_dims = param.X->dims(); - auto x_data = param.X->data(); - auto alpha = param.Leaky_relu_alpha; - auto output_data = param.Out->mutable_data(); - lite::arm::math::act_relu_neg( - x_data, output_data, x_dims.production(), alpha, ctx.threads()); -} - -void ReluClippedCompute::Run() { - auto& param = this->Param(); - auto& ctx = this->ctx_->template As(); - auto x_dims = param.X->dims(); - auto x_data = param.X->data(); - auto coef = param.Relu_clipped_coef; - auto output_data = param.Out->mutable_data(); - lite::arm::math::act_clipped_relu( - x_data, output_data, x_dims.production(), coef, ctx.threads()); -} - -void PReluCompute::Run() { - auto& param = this->Param(); - auto& ctx = this->ctx_->template As(); - auto x_dims = param.X->dims(); - auto x_data = param.X->data(); - auto mode = param.Prelu_mode; - auto alpha_data = param.Prelu_alpha->data(); - auto output_data = param.Out->mutable_data(); - - int outer_size = x_dims[0]; - int channel_size = x_dims[1]; - int inner_size = x_dims.count(2, x_dims.size()); - - lite::arm::math::act_prelu(x_data, - output_data, - outer_size, - channel_size, - inner_size, - mode, - alpha_data, - ctx.threads()); -} - -void SigmoidCompute::Run() { - auto& param = this->Param(); - auto& ctx = this->ctx_->template As(); - auto x_dims = param.X->dims(); - auto x_data = param.X->data(); - auto output_data = param.Out->mutable_data(); - lite::arm::math::act_sigmoid( - x_data, output_data, x_dims.production(), ctx.threads()); -} - -void TanhCompute::Run() { - auto& param = this->Param(); - auto& ctx = this->ctx_->template As(); - auto x_dims = param.X->dims(); - auto x_data = param.X->data(); - auto output_data = param.Out->mutable_data(); - lite::arm::math::act_tanh( - x_data, output_data, x_dims.production(), ctx.threads()); -} - -void SwishCompute::Run() { - auto& param = this->Param(); - auto& ctx = this->ctx_->template As(); - auto x_dims = param.X->dims(); - auto x_data = param.X->data(); - auto beta = param.Swish_beta; - auto output_data = param.Out->mutable_data(); - lite::arm::math::act_swish( - x_data, output_data, x_dims.production(), beta, ctx.threads()); -} - -void Relu6Compute::Run() { - auto& param = this->Param(); - auto& ctx = this->ctx_->template As(); - auto x_dims = param.X->dims(); - auto x_data = param.X->data(); - float coef = 6.; - auto output_data = param.Out->mutable_data(); - lite::arm::math::act_clipped_relu( - x_data, output_data, x_dims.production(), coef, ctx.threads()); -} - -void LogCompute::Run() { - auto& param = this->Param(); - auto& ctx = this->ctx_->template As(); - auto x_dims = param.X->dims(); - auto x_data = param.X->data(); - auto output_data = param.Out->mutable_data(); - lite::arm::math::act_log( - x_data, output_data, x_dims.production(), ctx.threads()); -} - -void ExpCompute::Run() { - auto& param = this->Param(); - auto& ctx = this->ctx_->template As(); - auto x_dims = param.X->dims(); - auto x_data = param.X->data(); - auto 
output_data = param.Out->mutable_data(); - lite::arm::math::act_exp( - x_data, output_data, x_dims.production(), ctx.threads()); -} - -void FloorCompute::Run() { - auto& param = this->Param(); - auto& ctx = this->ctx_->template As(); - auto x_dims = param.X->dims(); - auto x_data = param.X->data(); - auto output_data = param.Out->mutable_data(); - lite::arm::math::act_floor( - x_data, output_data, x_dims.production(), ctx.threads()); -} - -void HardSigmoidCompute::Run() { - auto& param = this->Param(); - auto& ctx = this->ctx_->template As(); - auto x_dims = param.X->dims(); - auto x_data = param.X->data(); - float slope = param.hard_sigmoid_slope; - float offset = param.hard_sigmoid_offset; - auto output_data = param.Out->mutable_data(); - lite::arm::math::act_hard_sigmoid( - x_data, output_data, x_dims.production(), slope, offset, ctx.threads()); -} - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle - -REGISTER_LITE_KERNEL( - relu, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::ReluCompute, def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) - .Finalize(); -REGISTER_LITE_KERNEL(leaky_relu, - kARM, - kFloat, - kNCHW, - paddle::lite::kernels::arm::LeakyReluCompute, - def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("alpha", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) - .Finalize(); -REGISTER_LITE_KERNEL(relu_clipped, - kARM, - kFloat, - kNCHW, - paddle::lite::kernels::arm::ReluClippedCompute, - def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("Relu_clipped_coef", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) - .Finalize(); -REGISTER_LITE_KERNEL( - prelu, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::PReluCompute, def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("mode", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("Alpha", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) - .Finalize(); -REGISTER_LITE_KERNEL(sigmoid, - kARM, - kFloat, - kNCHW, - paddle::lite::kernels::arm::SigmoidCompute, - def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) - .Finalize(); -REGISTER_LITE_KERNEL( - tanh, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::TanhCompute, def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) - .Finalize(); -REGISTER_LITE_KERNEL( - swish, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::SwishCompute, def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("beta", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) - .Finalize(); -REGISTER_LITE_KERNEL( - relu6, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::Relu6Compute, def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) - .Finalize(); -REGISTER_LITE_KERNEL( - log, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::LogCompute, def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) - .Finalize(); -REGISTER_LITE_KERNEL( - exp, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::ExpCompute, def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) - 
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) - .Finalize(); -REGISTER_LITE_KERNEL( - floor, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::FloorCompute, def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) - .Finalize(); -REGISTER_LITE_KERNEL(hard_sigmoid, - kARM, - kFloat, - kNCHW, - paddle::lite::kernels::arm::HardSigmoidCompute, - def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) - .Finalize(); diff --git a/lite/kernels/arm/activation_compute.h b/lite/kernels/arm/activation_compute.h deleted file mode 100644 index ac1b7ca177..0000000000 --- a/lite/kernels/arm/activation_compute.h +++ /dev/null @@ -1,136 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include "lite/core/kernel.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -class ReluCompute : public KernelLite { - public: - using param_t = operators::ActivationParam; - - void Run() override; - - virtual ~ReluCompute() = default; -}; - -class LeakyReluCompute : public KernelLite { - public: - using param_t = operators::ActivationParam; - - void Run() override; - - virtual ~LeakyReluCompute() = default; -}; - -class ReluClippedCompute : public KernelLite { - public: - using param_t = operators::ActivationParam; - - void Run() override; - - virtual ~ReluClippedCompute() = default; -}; - -class PReluCompute : public KernelLite { - public: - using param_t = operators::ActivationParam; - - void Run() override; - - virtual ~PReluCompute() = default; -}; - -class SigmoidCompute : public KernelLite { - public: - using param_t = operators::ActivationParam; - - void Run() override; - - virtual ~SigmoidCompute() = default; -}; - -class TanhCompute : public KernelLite { - public: - using param_t = operators::ActivationParam; - - void Run() override; - - virtual ~TanhCompute() = default; -}; - -class SwishCompute : public KernelLite { - public: - using param_t = operators::ActivationParam; - - void Run() override; - - virtual ~SwishCompute() = default; -}; - -class Relu6Compute : public KernelLite { - public: - using param_t = operators::ActivationParam; - - void Run() override; - - virtual ~Relu6Compute() = default; -}; - -class LogCompute : public KernelLite { - public: - using param_t = operators::ActivationParam; - - void Run() override; - - virtual ~LogCompute() = default; -}; - -class ExpCompute : public KernelLite { - public: - using param_t = operators::ActivationParam; - - void Run() override; - - virtual ~ExpCompute() = default; -}; - -class FloorCompute : public KernelLite { - public: - using param_t = operators::ActivationParam; - - void Run() override; - - virtual ~FloorCompute() = default; -}; - -class HardSigmoidCompute : public KernelLite { - public: - using param_t = operators::ActivationParam; - - 
void Run() override; - - virtual ~HardSigmoidCompute() = default; -}; - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle diff --git a/lite/kernels/arm/affine_channel_compute.cc b/lite/kernels/arm/affine_channel_compute.cc deleted file mode 100644 index 6781dab488..0000000000 --- a/lite/kernels/arm/affine_channel_compute.cc +++ /dev/null @@ -1,77 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/kernels/arm/affine_channel_compute.h" -#include -#include -#include "lite/backends/arm/math/funcs.h" -#include "lite/core/op_registry.h" -#include "lite/core/tensor.h" -#include "lite/core/type_system.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -void AffineChannelCompute::Run() { - auto& param = Param(); - const lite::Tensor* x = param.X; - const lite::Tensor* scale = param.Scale; - const lite::Tensor* bias = param.Bias; - const std::string data_layout = param.data_layout; - lite::Tensor* out = param.Out; - - auto x_dims = x->dims(); - int num = x_dims[0]; - int channel = 0; - int h = 0; - int w = 0; - if (data_layout == "NCHW") { - channel = x_dims[1]; - h = x_dims[2]; - w = x_dims[3]; - } else if (data_layout == "NHWC") { - channel = x_dims[3]; - h = x_dims[1]; - w = x_dims[2]; - } - lite::arm::math::affine_channel_func(x->data(), - scale->data(), - bias->data(), - data_layout, - num, - channel, - h, - w, - out->mutable_data()); - return; -} - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle - -REGISTER_LITE_KERNEL(affine_channel, - kARM, - kFloat, - kNCHW, - paddle::lite::kernels::arm::AffineChannelCompute, - def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) - .Finalize(); diff --git a/lite/kernels/arm/affine_channel_compute.h b/lite/kernels/arm/affine_channel_compute.h deleted file mode 100644 index 5b50af7907..0000000000 --- a/lite/kernels/arm/affine_channel_compute.h +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
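
> For orientation while reading the deleted `AffineChannelCompute::Run` above: the op scales every channel of `X` by `Scale[c]` and shifts it by `Bias[c]`, with the channel axis chosen by `data_layout`. The patch export dropped the `<...>` template arguments (e.g. `data<float>()`), so below is a minimal standalone sketch of the same per-channel transform, assuming float data in NCHW layout; `affine_channel_ref` is an illustrative name, not part of the Lite API.

```cpp
// Reference semantics of affine_channel for NCHW float data:
//   out[n][c][h][w] = x[n][c][h][w] * scale[c] + bias[c]
void affine_channel_ref(const float* x, const float* scale, const float* bias,
                        int num, int channel, int h, int w, float* out) {
  const int hw = h * w;
  for (int n = 0; n < num; ++n) {
    for (int c = 0; c < channel; ++c) {
      const float* x_ptr = x + (n * channel + c) * hw;
      float* out_ptr = out + (n * channel + c) * hw;
      for (int i = 0; i < hw; ++i) {
        out_ptr[i] = x_ptr[i] * scale[c] + bias[c];
      }
    }
  }
}
```

> For NHWC the same math applies with a channel stride of 1 instead of `h*w`, which is why the deleted kernel only has to re-derive `channel`, `h`, and `w` from the layout string before calling the shared math routine.
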
- -#pragma once -#include -#include "lite/core/kernel.h" -#include "lite/operators/affine_channel_op.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -class AffineChannelCompute - : public KernelLite { - public: - using param_t = operators::AffineChannelParam; - - void Run() override; - - virtual ~AffineChannelCompute() = default; -}; - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle diff --git a/lite/kernels/arm/anchor_generator_compute.cc b/lite/kernels/arm/anchor_generator_compute.cc deleted file mode 100644 index 3f31717475..0000000000 --- a/lite/kernels/arm/anchor_generator_compute.cc +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/kernels/arm/anchor_generator_compute.h" -#include -#include -#include "lite/backends/arm/math/funcs.h" -#include "lite/core/op_registry.h" -#include "lite/core/tensor.h" -#include "lite/core/type_system.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -void AnchorGeneratorCompute::Run() { - auto& param = Param(); - auto* anchors = param.Anchors; - auto* variances = param.Variances; - auto* input = param.Input; - - float* anchors_data = anchors->mutable_data(); - float* variances_data = variances->mutable_data(); - auto input_dims = input->dims(); - int feature_height = input_dims[2]; - int feature_width = input_dims[3]; - - lite::arm::math::anchor_generator_func(feature_height, - feature_width, - param.anchor_sizes, - param.aspect_ratios, - param.stride, - param.variances, - param.offset, - anchors_data, - variances_data); - return; -} - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle - -REGISTER_LITE_KERNEL(anchor_generator, - kARM, - kFloat, - kNCHW, - paddle::lite::kernels::arm::AnchorGeneratorCompute, - def) - .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Anchors", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Variances", {LiteType::GetTensorTy(TARGET(kARM))}) - .Finalize(); diff --git a/lite/kernels/arm/anchor_generator_compute.h b/lite/kernels/arm/anchor_generator_compute.h deleted file mode 100644 index af6a6eef02..0000000000 --- a/lite/kernels/arm/anchor_generator_compute.h +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
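
> `AnchorGeneratorCompute::Run` above only extracts `feature_height`/`feature_width` and forwards everything to `lite::arm::math::anchor_generator_func`. What that call produces can be pictured as one box per (aspect_ratio, anchor_size) pair for every feature-map cell, centered at the cell position times the stride plus an offset. The sketch below is an illustration of that layout only; the box-shape formula (area-preserving aspect ratios) is an assumption for exposition, not the ARM implementation.

```cpp
#include <cmath>
#include <vector>

// Illustrative anchor layout. `anchors` must hold
// feat_h * feat_w * ratios.size() * sizes.size() * 4 floats,
// written as (xmin, ymin, xmax, ymax) per box.
void anchor_generator_sketch(int feat_h, int feat_w,
                             const std::vector<float>& sizes,
                             const std::vector<float>& ratios,
                             float stride, float offset, float* anchors) {
  int idx = 0;
  for (int y = 0; y < feat_h; ++y) {
    for (int x = 0; x < feat_w; ++x) {
      const float cx = (x + offset) * stride;  // box center in image space
      const float cy = (y + offset) * stride;
      for (float r : ratios) {
        for (float s : sizes) {
          // Keep the box area at s*s while giving it height/width ratio r.
          const float half_w = 0.5f * s * std::sqrt(1.f / r);
          const float half_h = 0.5f * s * std::sqrt(r);
          anchors[idx++] = cx - half_w;
          anchors[idx++] = cy - half_h;
          anchors[idx++] = cx + half_w;
          anchors[idx++] = cy + half_h;
        }
      }
    }
  }
}
```
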
- -#pragma once -#include -#include "lite/core/kernel.h" -#include "lite/operators/anchor_generator_op.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -class AnchorGeneratorCompute - : public KernelLite { - public: - using param_t = operators::AnchorGeneratorParam; - - void Run() override; - - virtual ~AnchorGeneratorCompute() = default; -}; - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle diff --git a/lite/kernels/arm/argmax_compute.cc b/lite/kernels/arm/argmax_compute.cc deleted file mode 100644 index ad279e8f8e..0000000000 --- a/lite/kernels/arm/argmax_compute.cc +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/kernels/arm/argmax_compute.h" -#include -#include -#include "lite/backends/arm/math/funcs.h" -#include "lite/core/op_registry.h" -#include "lite/core/tensor.h" -#include "lite/core/type_system.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -void ArgmaxCompute::Run() { - auto& param = Param(); - lite::Tensor* input = param.X; - lite::Tensor* output = param.Out; - int axis = param.Axis; - - lite::arm::math::argmax_func(input, axis, output); - return; -} - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle - -REGISTER_LITE_KERNEL(arg_max, - kARM, - kFloat, - kNCHW, - paddle::lite::kernels::arm::ArgmaxCompute, - def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) - .Finalize(); diff --git a/lite/kernels/arm/argmax_compute.h b/lite/kernels/arm/argmax_compute.h deleted file mode 100644 index c87f5a451b..0000000000 --- a/lite/kernels/arm/argmax_compute.h +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
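
> `ArgmaxCompute::Run` above hands the whole job to `lite::arm::math::argmax_func(input, axis, output)`. Conceptually the tensor is viewed as (outer, size, inner) around `axis`, and the index of the maximum over `size` is written for every (outer, inner) pair; the test file later in this patch does exactly this shape algebra with `dims.count()`. A minimal standalone sketch, assuming float input and float-encoded output indices as in that test (`argmax_ref` is an illustrative name):

```cpp
// argmax over the `size` dimension for each (outer, inner) position.
// x is viewed as [outer, size, inner]; out as [outer, inner].
void argmax_ref(const float* x, int outer, int size, int inner, float* out) {
  for (int o = 0; o < outer; ++o) {
    for (int i = 0; i < inner; ++i) {
      const float* in_ptr = x + o * size * inner + i;
      int max_idx = 0;
      float max_val = in_ptr[0];
      for (int k = 1; k < size; ++k) {
        const float v = in_ptr[k * inner];  // stride `inner` walks the axis
        if (v > max_val) {
          max_val = v;
          max_idx = k;
        }
      }
      out[o * inner + i] = static_cast<float>(max_idx);
    }
  }
}
```
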
- -#pragma once -#include -#include "lite/core/kernel.h" -#include "lite/operators/argmax_op.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -class ArgmaxCompute : public KernelLite { - public: - using param_t = operators::ArgmaxParam; - - void Run() override; - - virtual ~ArgmaxCompute() = default; -}; - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle diff --git a/lite/kernels/arm/argmax_compute_test.cc b/lite/kernels/arm/argmax_compute_test.cc deleted file mode 100644 index 58bdf18474..0000000000 --- a/lite/kernels/arm/argmax_compute_test.cc +++ /dev/null @@ -1,139 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/kernels/arm/argmax_compute.h" -#include -#include -#include -#include -#include -#include -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -template -void argmax_compute_ref(const operators::ArgmaxParam& param) { - lite::Tensor* x = param.X; - lite::Tensor* output = param.Out; - int axis = param.Axis; - - auto x_data = x->data(); - auto output_data = output->mutable_data(); - DDim x_dims = x->dims(); - DDim output_dims = output->dims(); - - // int in_channel = x_dims - const int size = x_dims[axis]; - const int in_channel = x_dims.count(axis, x_dims.size()); - const int out_channel = output_dims.count(axis, output_dims.size()); - const int in_stride = x_dims.count(axis + 1, x_dims.size()); - const int out_stride = x_dims.count(0, axis); - - for (int n = 0; n < out_stride; n++) { - for (int k = 0; k < in_stride; k++) { - const dtype* in_ptr = x_data + n * in_channel + k; - std::vector> vec; - vec.resize(size); - for (int i = 0; i < size; i++) { - vec[i] = std::make_pair(in_ptr[i * in_stride], i); - } - // sort - std::partial_sort(vec.begin(), - vec.begin() + 1, - vec.end(), - std::greater>()); - - // out - dtype* out_ptr = output_data + n * out_channel + k; - *out_ptr = vec[0].second; - } - } -} - -TEST(argmax_arm, retrive_op) { - auto argmax = - KernelRegistry::Global().Create( - "arg_max"); - ASSERT_FALSE(argmax.empty()); - ASSERT_TRUE(argmax.front()); -} - -TEST(argmax_arm, init) { - ArgmaxCompute argmax; - ASSERT_EQ(argmax.precision(), PRECISION(kFloat)); - ASSERT_EQ(argmax.target(), TARGET(kARM)); -} -TEST(argmax_arm, compute) { - DeviceInfo::Init(); - for (auto n : {2, 3}) { - for (auto c : {3, 4 /*, 128*/}) { - for (auto h : {4, 5 /*, 56 , 112, 224, 512*/}) { - for (auto w : {5, 6 /*, 56, 112, 224, 512*/}) { - Tensor x; - Tensor output; - Tensor output_ref; - int axis = (n + c + h + w) % 4; - - // get tensor x data - x.Resize({n, c, h, w}); - auto* x_data = x.mutable_data(); - for (int i = 0; i < x.dims().production(); i++) { - float sign = i % 3 == 0 ? 
-1.0f : 1.0f; - x_data[i] = sign * static_cast(i % 128) * 0.013f; - } - - // resize output and output_ref - int nchw[] = {n, c, h, w}; - std::vector output_size(nchw, nchw + 4); - output_size.erase(output_size.begin() + axis); - output.Resize(output_size); - output_ref.Resize(output_size); - - // obtain output_data - ArgmaxCompute argmaxOp; - std::unique_ptr ctx(new KernelContext); - ctx->As(); - argmaxOp.SetContext(std::move(ctx)); - operators::ArgmaxParam param; - param.X = &x; - param.Out = &output; - param.Axis = axis; - argmaxOp.SetParam(param); - argmaxOp.Launch(); - auto* output_data = output.mutable_data(); - - // obtain output_ref_data - param.Out = &output_ref; - argmax_compute_ref(param); - auto* output_ref_data = output_ref.mutable_data(); - - // compare - for (int i = 0; i < output.dims().production(); i++) { - EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5); - } - } - } - } - } -} - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle -USE_LITE_KERNEL(arg_max, kARM, kFloat, kNCHW, def); diff --git a/lite/kernels/arm/assign_compute.cc b/lite/kernels/arm/assign_compute.cc deleted file mode 100644 index b0a5529c36..0000000000 --- a/lite/kernels/arm/assign_compute.cc +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/kernels/arm/assign_compute.h" -#include -#include "lite/backends/arm/math/funcs.h" -#include "lite/core/op_registry.h" -#include "lite/core/type_system.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -void AssignCompute::PrepareForRun() { - // CHECK_OR_FALSE(param_t.Out); -} - -void AssignCompute::Run() { - // LOG(INFO) << "into kernel compute run"; - auto& param = Param(); - const lite::Tensor* input = param.X; - lite::Tensor* output = param.Out; - output->CopyDataFrom(*input); -} - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle - -REGISTER_LITE_KERNEL( - assign, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::AssignCompute, def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) - .Finalize(); diff --git a/lite/kernels/arm/assign_compute.h b/lite/kernels/arm/assign_compute.h deleted file mode 100644 index 3f0dd8e281..0000000000 --- a/lite/kernels/arm/assign_compute.h +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include "lite/core/kernel.h" -#include "lite/operators/assign_op.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -class AssignCompute : public KernelLite { - public: - using param_t = operators::AssignParam; - void PrepareForRun() override; - void Run() override; - - virtual ~AssignCompute() = default; -}; - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle diff --git a/lite/kernels/arm/assign_value_compute.cc b/lite/kernels/arm/assign_value_compute.cc deleted file mode 100644 index 45f28ba363..0000000000 --- a/lite/kernels/arm/assign_value_compute.cc +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/kernels/arm/assign_value_compute.h" -#include -#include -#include "lite/backends/arm/math/funcs.h" -#include "lite/core/op_registry.h" -#include "lite/core/tensor.h" -#include "lite/core/type_system.h" -#include "lite/core/types.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -template -void TensorFromVector(const std::vector& src, lite::Tensor* dst) { - auto* src_ptr = static_cast(src.data()); - auto* dst_ptr = static_cast(dst->mutable_data()); - auto size = src.size() * sizeof(T); - std::memcpy(dst_ptr, src_ptr, size); -} - -void AssignValueCompute::Run() { - auto& param = Param(); - int dtype = param.dtype; - std::vector fp32_values = param.fp32_values; - std::vector int32_values = param.int32_values; - auto* out = param.Out; - - if (dtype == static_cast(lite::core::FluidType::INT32)) { - TensorFromVector(int32_values, out); - } else if (dtype == static_cast(lite::core::FluidType::FP32)) { - TensorFromVector(fp32_values, out); - } else { - LOG(FATAL) << "Unsupported dtype for assign_value_op:" << dtype; - } - return; -} - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle - -REGISTER_LITE_KERNEL(assign_value, - kARM, - kFloat, - kNCHW, - paddle::lite::kernels::arm::AssignValueCompute, - def) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) - .Finalize(); diff --git a/lite/kernels/arm/assign_value_compute.h b/lite/kernels/arm/assign_value_compute.h deleted file mode 100644 index f0c33f865b..0000000000 --- a/lite/kernels/arm/assign_value_compute.h +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include "lite/core/kernel.h" -#include "lite/operators/assign_value_op.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -class AssignValueCompute : public KernelLite { - public: - using param_t = operators::AssignValueParam; - - void Run() override; - - virtual ~AssignValueCompute() = default; -}; - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle diff --git a/lite/kernels/arm/axpy_compute.cc b/lite/kernels/arm/axpy_compute.cc deleted file mode 100644 index 705aa6a0f5..0000000000 --- a/lite/kernels/arm/axpy_compute.cc +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/kernels/arm/axpy_compute.h" -#include -#include -#include "lite/backends/arm/math/funcs.h" -#include "lite/core/op_registry.h" -#include "lite/core/tensor.h" -#include "lite/core/type_system.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -void AxpyCompute::Run() { - auto& param = Param(); - lite::Tensor* scale = param.Scale; - lite::Tensor* x = param.X; - lite::Tensor* bias = param.Bias; - lite::Tensor* out = param.Out; - - const float* scale_ptr = scale->data(); - const float* x_ptr = x->data(); - const float* bias_ptr = bias->data(); - float* out_ptr = out->mutable_data(); - - auto bias_dims = bias->dims(); - int num = bias_dims[0]; - int channel = bias_dims[1]; - int size = bias_dims[2] * bias_dims[3]; - int in_channel = channel * size; - - lite::arm::math::axpy_kernel_fp32( - scale_ptr, x_ptr, bias_ptr, out_ptr, num, channel, size, in_channel); - return; -} - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle - -REGISTER_LITE_KERNEL( - axpy, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::AxpyCompute, def) - .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) - .Finalize(); diff --git a/lite/kernels/arm/axpy_compute.h b/lite/kernels/arm/axpy_compute.h deleted file mode 100644 index 29983bdb99..0000000000 --- a/lite/kernels/arm/axpy_compute.h +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include "lite/core/kernel.h" -#include "lite/operators/axpy_op.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -class AxpyCompute : public KernelLite { - public: - using param_t = operators::AxpyParam; - - void Run() override; - - virtual ~AxpyCompute() = default; -}; - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle diff --git a/lite/kernels/arm/axpy_compute_test.cc b/lite/kernels/arm/axpy_compute_test.cc deleted file mode 100644 index af145435eb..0000000000 --- a/lite/kernels/arm/axpy_compute_test.cc +++ /dev/null @@ -1,142 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/kernels/arm/axpy_compute.h" -#include -#include -#include -#include -#include -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -template -void axpy_compute_ref(const operators::AxpyParam& param) { - lite::Tensor* scale = param.Scale; - lite::Tensor* x = param.X; - lite::Tensor* bias = param.Bias; - lite::Tensor* output = param.Out; - - auto scale_data = scale->data(); - auto x_data = x->data(); - auto bias_data = bias->data(); - auto output_data = output->mutable_data(); - - DDim x_dims = x->dims(); - int num = x_dims[0]; - int channel = x_dims[1]; - int size = x_dims[2] * x_dims[3]; - int in_channel = channel * size; - - for (int i = 0; i < num; i++) { - auto scale_data_i = scale_data + i * channel; - auto x_data_i = x_data + i * in_channel; - auto bias_data_i = bias_data + i * in_channel; - auto output_data_i = output_data + i * in_channel; - for (int j = 0; j < channel; j++) { - auto scale_data_j = scale_data_i + j; - auto x_data_j = x_data_i + j * size; - auto bias_data_j = bias_data_i + j * size; - auto output_data_j = output_data_i + j * size; - for (int k = 0; k < size; k++) { - output_data_j[k] = scale_data_j[0] * x_data_j[k] + bias_data_j[k]; - } - } - } -} - -TEST(axpy_arm, retrive_op) { - auto axpy = - KernelRegistry::Global().Create("axpy"); - ASSERT_FALSE(axpy.empty()); - ASSERT_TRUE(axpy.front()); -} - -TEST(axpy_arm, init) { - AxpyCompute axpy; - ASSERT_EQ(axpy.precision(), PRECISION(kFloat)); - ASSERT_EQ(axpy.target(), TARGET(kARM)); -} -TEST(axpy_arm, compute) { - DeviceInfo::Init(); - int iter = 10; - for (int i = 0; i < iter; i++) { - Tensor scale; - Tensor x; - Tensor bias; - Tensor output; - Tensor output_ref; - - // set the dims of scale, x, bias and output_ref - int n = 2, c = 3, h = 4, w = 5; - scale.Resize({n, c}); 
- x.Resize({n, c, h, w}); - bias.Resize({n, c, h, w}); - output.Resize({n, c, h, w}); - output_ref.Resize({n, c, h, w}); - - // initialize the data of scale, x, bias - // initialize_random_data(scale); - // initialize_random_data(x); - // initialize_random_data(bias); - auto* scale_data = scale.mutable_data(); - for (int i = 0; i < scale.dims().production(); i++) { - float sign = i % 3 == 0 ? -1.0f : 1.0f; - scale_data[i] = sign * static_cast(i % 128) * 0.010f; - } - auto* x_data = x.mutable_data(); - for (int i = 0; i < x.dims().production(); i++) { - float sign = i % 4 == 0 ? -1.0f : 1.0f; - x_data[i] = sign * static_cast(i % 128) * 0.007f; - } - auto* bias_data = bias.mutable_data(); - for (int i = 0; i < bias.dims().production(); i++) { - float sign = i % 5 == 0 ? -1.0f : 1.0f; - bias_data[i] = sign * static_cast(i % 128) * 0.005f; - } - - // prepare kernel params and run to obtain output_data - AxpyCompute axpy_op; - std::unique_ptr ctx(new KernelContext); - ctx->As(); - axpy_op.SetContext(std::move(ctx)); - operators::AxpyParam param; - param.Scale = &scale; - param.X = &x; - param.Bias = &bias; - param.Out = &output; - axpy_op.SetParam(param); - axpy_op.Launch(); - auto* output_data = output.mutable_data(); - - // invoking ref implementation and compare results - param.Out = &output_ref; - axpy_compute_ref(param); - auto* output_ref_data = output_ref.mutable_data(); - - for (int i = 0; i < output.dims().production(); i++) { - EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5); - } - } -} - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle -USE_LITE_KERNEL(axpy, kARM, kFloat, kNCHW, def); diff --git a/lite/kernels/arm/batch_norm_compute.cc b/lite/kernels/arm/batch_norm_compute.cc deleted file mode 100644 index 1519ad624e..0000000000 --- a/lite/kernels/arm/batch_norm_compute.cc +++ /dev/null @@ -1,123 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
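
> The axpy test above checks `Out = Scale ⊗ X + Bias`, where `Scale` has shape [N, C] and each scalar is broadcast over its channel's H×W plane. A compact standalone restatement of that reference loop under the same float assumptions (`axpy_ref` is an illustrative name):

```cpp
// out[n][c][k] = scale[n][c] * x[n][c][k] + bias[n][c][k], k over h*w.
void axpy_ref(const float* scale, const float* x, const float* bias,
              float* out, int num, int channel, int size) {
  for (int n = 0; n < num; ++n) {
    for (int c = 0; c < channel; ++c) {
      const float s = scale[n * channel + c];
      const int base = (n * channel + c) * size;
      for (int k = 0; k < size; ++k) {
        out[base + k] = s * x[base + k] + bias[base + k];
      }
    }
  }
}
```
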
- -#include "lite/kernels/arm/batch_norm_compute.h" -#include "lite/backends/arm/math/funcs.h" -#include "lite/core/op_registry.h" -#include "lite/core/type_system.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -void BatchNormCompute::PrepareForRun() { - auto& param = this->Param(); - auto x_dims = param.x->dims(); - bool global_stats = param.is_test || param.use_global_stats; - if (global_stats) { - int64_t channel_size = 0; - switch (param.data_layout) { - case DATALAYOUT(kNCHW): - channel_size = x_dims[1]; - break; - // case DATALAYOUT(kNHWC): - // channel_size = x_dims[x_dims.size() - 1]; - // break; - default: - LOG(FATAL) << "Unknown storage order: " - << DataLayoutToStr(param.data_layout); - break; - } - new_scale.Resize({channel_size}); - new_bias.Resize({channel_size}); - auto* scale_data = param.scale->mutable_data(); - auto* bias_data = param.bias->mutable_data(); - auto* mean_data = param.mean->mutable_data(); - auto* variance_data = param.variance->mutable_data(); - auto* new_scale_data = new_scale.mutable_data(); - auto* new_bias_data = new_bias.mutable_data(); - for (int c = 0; c < channel_size; c++) { - float inv_scale = 1.f / (std::sqrt(variance_data[c] + param.epsilon)); - new_bias_data[c] = - bias_data[c] - inv_scale * scale_data[c] * mean_data[c]; - new_scale_data[c] = inv_scale * scale_data[c]; - } - } -} - -void BatchNormCompute::Run() { - auto& param = this->Param(); - auto x_dims = param.x->dims(); - auto x_data = param.x->mutable_data(); - auto y_data = param.y->mutable_data(); - bool global_stats = param.is_test || param.use_global_stats; - if (global_stats) { - auto* new_scale_data = new_scale.mutable_data(); - auto* new_bias_data = new_bias.mutable_data(); - int64_t outer_size = 0; - int64_t channel_size = 0; - int64_t inner_size = 0; - switch (param.data_layout) { - case DATALAYOUT(kNCHW): - outer_size = x_dims[0]; - channel_size = x_dims[1]; - inner_size = x_dims.Slice(2, x_dims.size()).production(); - lite::arm::math::scale(x_data, - y_data, - outer_size, - channel_size, - inner_size, - new_scale_data, - new_bias_data); - break; - // case DATALAYOUT(kNHWC): - // outer_size = x_dims.Slice(0, x_dims.size() - 1).production(); - // channel_size = x_dims[x_dims.size() - 1]; - // lite::arm::math::scale(x_data, y_data, outer_size, channel_size, - // new_scale_data, new_bias_data); - // break; - default: - LOG(FATAL) << "Unknown storage order: " - << DataLayoutToStr(param.data_layout); - break; - } - } else { - // TODO(hong19860320) calculate mean_out, variance_out, saved_mean and - // saved_variance - } -} - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle - -REGISTER_LITE_KERNEL(batch_norm, - kARM, - kFloat, - kNCHW, - paddle::lite::kernels::arm::BatchNormCompute, - def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("Mean", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("Variance", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("MeanOut", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("VarianceOut", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("SavedMean", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("SavedVariance", {LiteType::GetTensorTy(TARGET(kARM))}) - .Finalize(); diff --git a/lite/kernels/arm/batch_norm_compute.h b/lite/kernels/arm/batch_norm_compute.h 
deleted file mode 100644 index 22553f55d5..0000000000 --- a/lite/kernels/arm/batch_norm_compute.h +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include "lite/core/kernel.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -class BatchNormCompute : public KernelLite { - public: - using param_t = operators::BatchNormParam; - - void PrepareForRun() override; - - void Run() override; - - virtual ~BatchNormCompute() = default; - - private: - Tensor new_scale; - Tensor new_bias; -}; - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle diff --git a/lite/kernels/arm/batch_norm_compute_test.cc b/lite/kernels/arm/batch_norm_compute_test.cc deleted file mode 100644 index c603a04d47..0000000000 --- a/lite/kernels/arm/batch_norm_compute_test.cc +++ /dev/null @@ -1,221 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
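
> `BatchNormCompute::PrepareForRun` above folds the four batch-norm inputs into the two per-channel tensors `new_scale` and `new_bias` declared in this header, so that `Run` reduces to a single scale op. The algebra, restated as a small standalone sketch (float, inference mode; `fold_batch_norm` is an illustrative name):

```cpp
#include <cmath>

// Fold (scale, bias, mean, variance, epsilon) into a*x + b form:
//   a[c] = scale[c] / sqrt(variance[c] + epsilon)
//   b[c] = bias[c] - a[c] * mean[c]
// so y = a[c]*x + b[c] equals (x - mean) / sqrt(var + eps) * scale + bias.
void fold_batch_norm(const float* scale, const float* bias, const float* mean,
                     const float* variance, float epsilon, int channels,
                     float* new_scale, float* new_bias) {
  for (int c = 0; c < channels; ++c) {
    const float inv_std = 1.f / std::sqrt(variance[c] + epsilon);
    new_scale[c] = scale[c] * inv_std;
    new_bias[c] = bias[c] - scale[c] * inv_std * mean[c];
  }
}
```

> This is also why only the `is_test`/`use_global_stats` path is implemented: folding is valid precisely when mean and variance are fixed.
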
- -#include "lite/kernels/arm/batch_norm_compute.h" -#include -#include -#include -#include -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -template -void batch_norm_compute_ref(const operators::BatchNormParam& param) { - DDim x_dims = param.x->dims(); - auto x_data = param.x->mutable_data(); - auto scale_data = param.scale->mutable_data(); - auto bias_data = param.bias->mutable_data(); - auto mean_data = param.mean->mutable_data(); - auto variance_data = param.variance->mutable_data(); - auto y_data = param.y->mutable_data(); - float epsilon = param.epsilon; - float momentum = param.momentum; - DataLayoutType data_layout = param.data_layout; - - bool global_stats = param.is_test || param.use_global_stats; - if (global_stats) { - int64_t outer_size = 0; - int64_t channel_size = 0; - int64_t inner_size = 0; - switch (data_layout) { - case DATALAYOUT(kNCHW): - outer_size = x_dims[0]; - channel_size = x_dims[1]; - inner_size = x_dims.Slice(2, x_dims.size()).production(); - break; - // case DATALAYOUT(kNHWC): - // outer_size = x_dims.Slice(0, x_dims.size() - 1).production(); - // channel_size = x_dims[x_dims.size() - 1]; - // inner_size = 1; - // break; - default: - LOG(FATAL) << "Unknown storage order: " << DataLayoutToStr(data_layout); - break; - } - auto x_ptr = x_data; - auto y_ptr = y_data; - for (int o = 0; o < outer_size; o++) { - for (int c = 0; c < channel_size; c++) { - for (int i = 0; i < inner_size; i++) { - dtype norm_x = - (*x_ptr - mean_data[c]) / std::sqrt(variance_data[c] + epsilon); - *y_ptr = norm_x * scale_data[c] + bias_data[c]; - x_ptr++; - y_ptr++; - } - } - } - } else { - // TODO(hong19860320) calculate mean_out, variance_out, saved_mean and - // saved_variance - } -} - -TEST(batch_norm_arm, retrive_op) { - auto batch_norm = - KernelRegistry::Global().Create( - "batch_norm"); - ASSERT_FALSE(batch_norm.empty()); - ASSERT_TRUE(batch_norm.front()); -} - -TEST(batch_norm_arm, init) { - BatchNormCompute batch_norm; - ASSERT_EQ(batch_norm.precision(), PRECISION(kFloat)); - ASSERT_EQ(batch_norm.target(), TARGET(kARM)); -} - -TEST(batch_norm_arm, compute) { - DeviceInfo::Init(); - for (auto n : {1, 2}) { - for (auto c : {6, 32 /*, 128*/}) { - for (auto h : {9, 18 /*, 56 , 112, 224, 512*/}) { - for (auto w : {9, 18 /*, 56, 112, 224, 512*/}) { - for (auto is_test : {/*false, */ true}) { - for (auto use_global_stats : {false, true}) { - for (auto epsilon : {1e-4f, 1e-5f}) { - for (auto momentum : {0.9f, 0.99f}) { - for (auto data_layout : - {DATALAYOUT(kNCHW) /*, DATALAYOUT(kNHWC)*/}) { - Tensor x; - Tensor scale; - Tensor bias; - Tensor mean; - Tensor variance; - Tensor y; - Tensor mean_out; - Tensor variance_out; - Tensor saved_mean; - Tensor saved_variance; - Tensor y_ref; - Tensor mean_out_ref; - Tensor variance_out_ref; - Tensor saved_mean_ref; - Tensor saved_variance_ref; - // set the dims of input, output, ref output tensors - std::vector in_out_shape; - switch (data_layout) { - case DATALAYOUT(kNCHW): - in_out_shape = {n, c, h, w}; - break; - // case DATALAYOUT(kNHWC): - // in_out_shape = {n, h, w, c}; - // break; - default: - LOG(FATAL) << "Unknown storage order: " - << DataLayoutToStr(data_layout); - break; - } - x.Resize(in_out_shape); - scale.Resize({c}); - bias.Resize({c}); - mean.Resize({c}); - variance.Resize({c}); - y.Resize(in_out_shape); - mean_out.Resize({c}); - variance_out.Resize({c}); - saved_mean.Resize({c}); - saved_variance.Resize({c}); - y_ref.Resize(in_out_shape); - 
mean_out_ref.Resize({c}); - variance_out_ref.Resize({c}); - saved_mean_ref.Resize({c}); - saved_variance_ref.Resize({c}); - // initialize the data of input tensors - auto* x_data = x.mutable_data(); - auto* scale_data = scale.mutable_data(); - auto* bias_data = bias.mutable_data(); - auto* mean_data = mean.mutable_data(); - auto* variance_data = variance.mutable_data(); - auto* y_data = y.mutable_data(); - for (int i = 0; i < x.dims().production(); i++) { - x_data[i] = static_cast(i % 64); - } - for (int i = 0; i < scale.dims().production(); i++) { - scale_data[i] = static_cast(i) * 0.01f + 0.03f; - } - for (int i = 0; i < bias.dims().production(); i++) { - bias_data[i] = static_cast(i) * 0.065f + 0.1f; - } - for (int i = 0; i < mean.dims().production(); i++) { - mean_data[i] = static_cast(i) * 0.0565f; - } - for (int i = 0; i < variance.dims().production(); i++) { - variance_data[i] = static_cast(i) * 2.08f + 1.5f; - } - // prepare kernel params and run - BatchNormCompute batch_norm; - std::unique_ptr ctx(new KernelContext); - ctx->As(); - batch_norm.SetContext(std::move(ctx)); - operators::BatchNormParam param; - param.x = &x; - param.scale = &scale; - param.bias = &bias; - param.mean = &mean; - param.variance = &variance; - param.is_test = is_test; - param.use_global_stats = use_global_stats; - param.epsilon = epsilon; - param.momentum = momentum; - param.data_layout = data_layout; - param.y = &y; - param.mean_out = &mean_out; - param.variance_out = &variance_out; - param.saved_mean = &saved_mean; - param.saved_variance = &saved_variance; - batch_norm.SetParam(param); - batch_norm.Launch(); - // invoking ref implementation and compare results - param.y = &y_ref; - param.mean_out = &mean_out_ref; - param.variance_out = &variance_out_ref; - param.saved_mean = &saved_mean_ref; - param.saved_variance = &saved_variance_ref; - batch_norm_compute_ref(param); - auto* y_ref_data = y_ref.mutable_data(); - for (int i = 0; i < y.dims().production(); i++) { - EXPECT_NEAR(y_data[i], y_ref_data[i], 1e-5); - } - } - } - } - } - } - } - } - } - } -} - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle - -USE_LITE_KERNEL(batch_norm, kARM, kFloat, kNCHW, def); diff --git a/lite/kernels/arm/beam_search_compute.cc b/lite/kernels/arm/beam_search_compute.cc deleted file mode 100644 index 5ac53b3b96..0000000000 --- a/lite/kernels/arm/beam_search_compute.cc +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
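
> The batch-norm sweep above compares the kernel against `batch_norm_compute_ref` elementwise with a 1e-5 tolerance. The equivalence it relies on — direct normalization versus the folded a*x + b form from `PrepareForRun` — can be sanity-checked in isolation. A tiny self-contained check under the same float assumptions (the numeric values are hypothetical, not taken from the test):

```cpp
#include <cassert>
#include <cmath>
#include <cstdio>

int main() {
  // One channel's statistics (hypothetical values).
  const float x = 3.f, mean = 1.f, var = 4.f, eps = 1e-5f;
  const float gamma = 0.5f, beta = 2.f;
  // Direct normalization, as in batch_norm_compute_ref.
  const float direct = (x - mean) / std::sqrt(var + eps) * gamma + beta;
  // Folded form, as prepared in BatchNormCompute::PrepareForRun.
  const float a = gamma / std::sqrt(var + eps);
  const float b = beta - a * mean;
  const float folded = a * x + b;
  assert(std::fabs(direct - folded) < 1e-5f);
  std::printf("direct=%f folded=%f\n", direct, folded);
  return 0;
}
```
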
-
-#include "lite/kernels/arm/beam_search_compute.h"
-#include "lite/backends/arm/math/funcs.h"
-
-namespace paddle {
-namespace lite {
-namespace kernels {
-namespace arm {
-
-void BeamSearchCompute::PrepareForRun() {}
-
-void BeamSearchCompute::Run() {
-  auto& ctx = this->ctx_->template As<ARMContext>();
-  auto& param = this->Param<operators::BeamSearchParam>();
-  lite::arm::math::beam_search(param.pre_ids,
-                               param.pre_scores,
-                               param.ids,
-                               param.scores,
-                               param.selected_ids,
-                               param.selected_scores,
-                               param.parent_idx,
-                               param.level,
-                               param.beam_size,
-                               param.end_id,
-                               param.is_accumulated,
-                               &ctx);
-}
-
-}  // namespace arm
-}  // namespace kernels
-}  // namespace lite
-}  // namespace paddle
-
-REGISTER_LITE_KERNEL(beam_search,
-                     kARM,
-                     kFloat,
-                     kNCHW,
-                     paddle::lite::kernels::arm::BeamSearchCompute,
-                     def)
-    .BindInput("pre_ids", {LiteType::GetTensorTy(TARGET(kARM))})
-    .BindInput("pre_scores", {LiteType::GetTensorTy(TARGET(kARM))})
-    .BindInput("ids", {LiteType::GetTensorTy(TARGET(kARM))})
-    .BindInput("scores", {LiteType::GetTensorTy(TARGET(kARM))})
-    .BindOutput("selected_ids", {LiteType::GetTensorTy(TARGET(kARM))})
-    .BindOutput("selected_scores", {LiteType::GetTensorTy(TARGET(kARM))})
-    .BindOutput("parent_idx", {LiteType::GetTensorTy(TARGET(kARM))})
-    .Finalize();
diff --git a/lite/kernels/arm/beam_search_compute.h b/lite/kernels/arm/beam_search_compute.h
deleted file mode 100644
index ebd72732bb..0000000000
--- a/lite/kernels/arm/beam_search_compute.h
+++ /dev/null
@@ -1,42 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include
-#include "lite/backends/arm/math/type_trans.h"
-#include "lite/core/kernel.h"
-#include "lite/core/op_registry.h"
-
-namespace paddle {
-namespace lite {
-namespace kernels {
-namespace arm {
-
-class BeamSearchCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
- public:
-  using param_t = operators::BeamSearchParam;
-
-  void PrepareForRun() override;
-
-  void Run() override;
-
-  ~BeamSearchCompute() {}
-
- private:
-};
-
-}  // namespace arm
-}  // namespace kernels
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/kernels/arm/beam_search_decode_compute.cc b/lite/kernels/arm/beam_search_decode_compute.cc
deleted file mode 100644
index a417baa6d7..0000000000
--- a/lite/kernels/arm/beam_search_decode_compute.cc
+++ /dev/null
@@ -1,296 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
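
> The beam-search kernel above is deliberately thin: `PrepareForRun` is empty and `Run` forwards every field of `BeamSearchParam` to `lite::arm::math::beam_search`. The step it performs is: for each source sentence, merge the candidate (score, id) pairs of all its live beams and keep only the `beam_size` best. A toy illustration of that selection, ignoring LoD bookkeeping and `end_id` handling (`beam_step` is an illustrative name):

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <utility>
#include <vector>

// Keep the `beam_size` highest-scoring (score, id) candidates of one source.
std::vector<std::pair<float, long>> beam_step(
    std::vector<std::pair<float, long>> candidates, std::size_t beam_size) {
  const std::size_t k = std::min(beam_size, candidates.size());
  std::partial_sort(candidates.begin(), candidates.begin() + k,
                    candidates.end(),
                    [](const std::pair<float, long>& a,
                       const std::pair<float, long>& b) {
                      return a.first > b.first;  // higher score first
                    });
  candidates.resize(k);
  return candidates;
}

int main() {
  auto kept = beam_step({{0.1f, 7}, {0.9f, 3}, {0.5f, 11}, {0.7f, 2}}, 2);
  for (const auto& c : kept) std::printf("id=%ld score=%f\n", c.second, c.first);
  return 0;  // keeps ids 3 and 2
}
```
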
- -#include "lite/kernels/arm/beam_search_decode_compute.h" -#include -#include -#include "lite/api/paddle_place.h" -#include "lite/backends/arm/math/funcs.h" -#include "lite/core/op_registry.h" -#include "lite/core/tensor.h" -#include "lite/core/type_system.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -using LoDTensor = lite::Tensor; -using LoDTensorArray = std::vector; - -// all the lod have 2 levels. -// The first is source level, the second is sentence level. -// source level describe how many prefixes (branchs) for each source sentece -// (beam). sentence level describe how these candidates belong to the prefixes. -const size_t kSourceLevel = 0; -const size_t kSentenceLevel = 1; - -template -struct Sentence { - std::vector word_ids; - std::vector scores; -}; - -template -using SentenceVector = std::vector>; - -template -struct BeamSearchDecoder { - BeamSearchDecoder(size_t beam_size, int end_id) - : beam_size_(beam_size), end_id_(end_id) {} - - /** - * convert the result sentence_vector for each source sentence into two - * LodTensor. - * One is all candidate sentences with word id, one is all candidate sentences - * with word score. - * Param: - * sentence_vector_list: sentence_vector for each source sentence. - * id_tensor: result LoDTensor for sentences of id. - * score_tensor: result LoDTensor for sentences of score. - * reverse: whether ids of sentence in sentence_vector_list is reversed - * sort_by_score: whether to sort hypotheses of each sentence by scores. - */ - void ConvertSentenceVectorToLodTensor( - std::vector> sentence_vector_list, - LoDTensor* id_tensor, - LoDTensor* score_tensor, - bool reverse = true, - bool sort_by_score = true) const { - size_t src_num = sentence_vector_list.size(); - CHECK_GT(src_num, 0) << "src_num should not be 0"; - - std::vector source_level_lod = {0}; - std::vector sentence_level_lod = {0}; - std::vector id_data; - std::vector score_data; - - for (size_t src_idx = 0; src_idx < src_num; ++src_idx) { - if (sort_by_score) { - sort(sentence_vector_list[src_idx].begin(), - sentence_vector_list[src_idx].end(), - [reverse](const Sentence& a, const Sentence& b) { - if (reverse) - return a.scores.front() > b.scores.front(); - else - return a.scores.back() > b.scores.back(); - }); - } - for (Sentence& sentence : sentence_vector_list[src_idx]) { - if (reverse) { - id_data.insert(id_data.end(), - sentence.word_ids.rbegin(), - sentence.word_ids.rend()); - score_data.insert(score_data.end(), - sentence.scores.rbegin(), - sentence.scores.rend()); - } else { - id_data.insert(id_data.end(), - sentence.word_ids.begin(), - sentence.word_ids.end()); - score_data.insert( - score_data.end(), sentence.scores.begin(), sentence.scores.end()); - } - - sentence_level_lod.push_back(sentence_level_lod.back() + - sentence.word_ids.size()); - } - source_level_lod.push_back(source_level_lod.back() + - sentence_vector_list[src_idx].size()); - } - - LoD lod; - lod.push_back(source_level_lod); - lod.push_back(sentence_level_lod); - - *(id_tensor->mutable_lod()) = lod; - - id_tensor->Resize({static_cast(id_data.size())}); - auto id_ptr = id_tensor->mutable_data(); - TargetCopy( - TARGET(kARM), id_ptr, id_data.data(), id_data.size() * sizeof(float)); - - *(score_tensor->mutable_lod()) = lod; - score_tensor->Resize({static_cast(score_data.size())}); - auto score_ptr = score_tensor->mutable_data(); - TargetCopy(TARGET(kARM), - score_ptr, - score_data.data(), - score_data.size() * sizeof(T)); - } - - /** - * Gather the hypotheses for each 
source sentence by backtrace though the - * LoDTensorArray step_ids whose lods reserve the path in the tree. - */ - void Backtrace(const LoDTensorArray& step_ids, - const LoDTensorArray& step_scores, - LoDTensor* id_tensor, - LoDTensor* score_tensor) const { - CHECK(!step_ids.empty()) << "step num should be larger than 0"; - CHECK_EQ(step_ids.size(), step_scores.size()) - << "step_ids and step_scores should be the same"; - const size_t step_num = step_ids.size(); - const size_t src_num = step_ids.at(0).lod().at(kSourceLevel).size() - 1; - std::vector> sentence_vector_list( - src_num, SentenceVector(beam_size_)); - std::vector> prefix_idx_vector_list(src_num); - for (int step_id = step_num - 1; step_id >= 0; --step_id) { - auto& cur_ids = step_ids.at(step_id); - auto& cur_scores = step_scores.at(step_id); - for (size_t src_idx = 0; src_idx < src_num; ++src_idx) { - // for each source sentence - auto& sentence_vector = sentence_vector_list.at(src_idx); - auto& prefix_idx_vector = prefix_idx_vector_list.at(src_idx); - size_t src_prefix_start = cur_ids.lod().at(kSourceLevel)[src_idx]; - size_t src_prefix_end = cur_ids.lod().at(kSourceLevel)[src_idx + 1]; - if (prefix_idx_vector.empty()) { // be finished and pruned at this step - // or the last time step - for (size_t prefix_idx = src_prefix_start; - prefix_idx < src_prefix_end; - ++prefix_idx) { - size_t candidate_start = - cur_ids.lod().at(kSentenceLevel)[prefix_idx]; - size_t candidate_end = - cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1]; - for (size_t candidate_idx = candidate_start; - candidate_idx < candidate_end; - ++candidate_idx) { - prefix_idx_vector.push_back(prefix_idx); - size_t idx = prefix_idx_vector.size() - 1; - auto cur_id = cur_ids.data()[candidate_idx]; - auto cur_score = cur_scores.data()[candidate_idx]; - sentence_vector.at(idx).word_ids.push_back(cur_id); - sentence_vector.at(idx).scores.push_back(cur_score); - } - } - } else { // use prefix_idx_vector to backtrace - size_t src_candidate_start = - cur_ids.lod().at(kSentenceLevel)[src_prefix_start]; - size_t prefix_idx = src_prefix_start; - size_t candidate_num = - cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1] - - cur_ids.lod().at(kSentenceLevel)[prefix_idx]; - for (size_t idx = 0; idx < prefix_idx_vector.size(); ++idx) { - auto candidate_idx = prefix_idx_vector.at(idx); - auto cur_id = cur_ids.data()[candidate_idx]; - auto cur_score = cur_scores.data()[candidate_idx]; - if (cur_id != end_id_ || sentence_vector.at(idx).word_ids.empty()) { - // to skip redundant end tokens - sentence_vector.at(idx).word_ids.push_back(cur_id); - sentence_vector.at(idx).scores.push_back(cur_score); - } - - while (src_candidate_start + candidate_num <= - candidate_idx) { // search the corresponding prefix - prefix_idx++; - candidate_num += - cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1] - - cur_ids.lod().at(kSentenceLevel)[prefix_idx]; - } - prefix_idx_vector.at(idx) = prefix_idx; - } - } - } - } - - ConvertSentenceVectorToLodTensor( - sentence_vector_list, id_tensor, score_tensor, true, true); - } - - size_t beam_size_; - int end_id_; -}; - -struct BeamSearchDecodeFunctor { - BeamSearchDecodeFunctor(const LoDTensorArray& step_ids, - const LoDTensorArray& step_scores, - LoDTensor* id_tensor, - LoDTensor* score_tensor, - size_t beam_size, - int end_id) - : beam_size_(beam_size), - end_id_(end_id), - step_ids_(step_ids), - step_scores_(step_scores), - id_tensor_(id_tensor), - score_tensor_(score_tensor) {} - - template - void apply() const { - BeamSearchDecoder 
beam_search_decoder(beam_size_, end_id_); - beam_search_decoder.Backtrace( - step_ids_, step_scores_, id_tensor_, score_tensor_); - } - - size_t beam_size_; - int end_id_; - const LoDTensorArray& step_ids_; - const LoDTensorArray& step_scores_; - LoDTensor* id_tensor_; - LoDTensor* score_tensor_; -}; - -template <> -void BeamSearchDecodeFunctor::apply() const { - LOG(FATAL) << "beam search decode op does not support bool!"; -} - -void BeamSearchDecodeCompute::Run() { - auto& param = this->Param(); - auto& ctx = this->ctx_->template As(); - // inputs - auto ids = param.ids; - auto scores = param.scores; - // outputs - auto sentence_ids = param.sentence_ids; - auto sentence_scores = param.sentence_scores; - - const size_t step_num = ids->size(); - CHECK_GT(step_num, 0UL) << "beam search steps should be larger than 0"; - const size_t source_num = ids->at(0).lod().at(0).size() - 1; - CHECK_GT(source_num, 0UL) << "source num should be larger than 0"; - - for (size_t i = 0; i < step_num; ++i) { - CHECK_EQ(ids->at(i).lod().size(), 2UL) << "Level of LodTensor should be 2"; - } - - //! fixme - // only support float score now - BeamSearchDecodeFunctor func(*ids, - *scores, - sentence_ids, - sentence_scores, - param.beam_size, - param.end_id); - - func.apply(); -} - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle - -REGISTER_LITE_KERNEL(beam_search_decode, - kARM, - kFloat, - kNCHW, - paddle::lite::kernels::arm::BeamSearchDecodeCompute, - def) - .BindInput("Ids", {LiteType::GetTensorListTy(TARGET(kARM))}) - .BindInput("Scores", {LiteType::GetTensorListTy(TARGET(kARM))}) - .BindOutput("SentenceIds", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("SentenceScores", {LiteType::GetTensorTy(TARGET(kARM))}) - .Finalize(); diff --git a/lite/kernels/arm/beam_search_decode_compute.h b/lite/kernels/arm/beam_search_decode_compute.h deleted file mode 100644 index db1961ad93..0000000000 --- a/lite/kernels/arm/beam_search_decode_compute.h +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include "lite/core/kernel.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -class BeamSearchDecodeCompute - : public KernelLite { - public: - using param_t = operators::BeamSearchDecodeParam; - - BeamSearchDecodeCompute() = default; - - void Run() override; - - virtual ~BeamSearchDecodeCompute() = default; -}; - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle diff --git a/lite/kernels/arm/box_clip_compute.cc b/lite/kernels/arm/box_clip_compute.cc deleted file mode 100644 index 9591302c58..0000000000 --- a/lite/kernels/arm/box_clip_compute.cc +++ /dev/null @@ -1,87 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/kernels/arm/box_clip_compute.h"
-#include <algorithm>
-#include <cmath>
-#include "lite/backends/arm/math/funcs.h"
-#include "lite/core/op_registry.h"
-#include "lite/core/tensor.h"
-#include "lite/core/type_system.h"
-
-namespace paddle {
-namespace lite {
-namespace kernels {
-namespace arm {
-
-template <typename T>
-void ClipTiledBoxes(const Tensor& im_info,
-                    const Tensor& input_boxes,
-                    Tensor* out) {
-  T* out_data = out->mutable_data<T>();
-  const T* im_info_data = im_info.data<T>();
-  const T* input_boxes_data = input_boxes.data<T>();
-  T zero(0);
-  T im_w = round(im_info_data[1] / im_info_data[2]);
-  T im_h = round(im_info_data[0] / im_info_data[2]);
-  for (int64_t i = 0; i < input_boxes.numel(); ++i) {
-    if (i % 4 == 0) {
-      out_data[i] = std::max(std::min(input_boxes_data[i], im_w - 1), zero);
-    } else if (i % 4 == 1) {
-      out_data[i] = std::max(std::min(input_boxes_data[i], im_h - 1), zero);
-    } else if (i % 4 == 2) {
-      out_data[i] = std::max(std::min(input_boxes_data[i], im_w - 1), zero);
-    } else {
-      out_data[i] = std::max(std::min(input_boxes_data[i], im_h - 1), zero);
-    }
-  }
-}
-
-void BoxClipCompute::Run() {
-  auto& param = Param<operators::BoxClipParam>();
-  const auto* input = param.Input;
-  const auto* im_info = param.ImInfo;
-  auto* output = param.Output;
-  output->mutable_data<float>();
-  if (input->lod().size() > 1) {
-    LOG(FATAL) << "Only support 0 and 1 level of LoD.";
-  }
-
-  auto box_lod = input->lod().back();
-  int64_t n = static_cast<int64_t>(box_lod.size() - 1);
-  for (int i = 0; i < n; ++i) {
-    Tensor im_info_slice = im_info->Slice(i, i + 1);
-    auto* im_info_slice_data = im_info_slice.data<float>();
-    Tensor box_slice = input->Slice(box_lod[i], box_lod[i + 1]);
-    Tensor output_slice = output->Slice(box_lod[i], box_lod[i + 1]);
-    ClipTiledBoxes<float>(im_info_slice, box_slice, &output_slice);
-  }
-  return;
-}
-
-}  // namespace arm
-}  // namespace kernels
-}  // namespace lite
-}  // namespace paddle
-
-REGISTER_LITE_KERNEL(box_clip,
-                     kARM,
-                     kFloat,
-                     kNCHW,
-                     paddle::lite::kernels::arm::BoxClipCompute,
-                     def)
-    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM))})
-    .BindInput("ImInfo", {LiteType::GetTensorTy(TARGET(kARM))})
-    .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kARM))})
-    .Finalize();
diff --git a/lite/kernels/arm/box_clip_compute.h b/lite/kernels/arm/box_clip_compute.h
deleted file mode 100644
index 460921b2d0..0000000000
--- a/lite/kernels/arm/box_clip_compute.h
+++ /dev/null
@@ -1,37 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include <algorithm>
-#include "lite/core/kernel.h"
-#include "lite/operators/box_clip_op.h"
-
-namespace paddle {
-namespace lite {
-namespace kernels {
-namespace arm {
-
-class BoxClipCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
- public:
-  using param_t = operators::BoxClipParam;
-
-  void Run() override;
-
-  virtual ~BoxClipCompute() = default;
-};
-
-}  // namespace arm
-}  // namespace kernels
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/kernels/arm/box_coder_compute.cc b/lite/kernels/arm/box_coder_compute.cc
deleted file mode 100644
index 81e79a83f2..0000000000
--- a/lite/kernels/arm/box_coder_compute.cc
+++ /dev/null
@@ -1,241 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/kernels/arm/box_coder_compute.h"
-#include <string>
-#include <vector>
-#include "lite/backends/arm/math/funcs.h"
-
-namespace paddle {
-namespace lite {
-namespace kernels {
-namespace arm {
-
-void EncodeCenterSize(const Tensor* target_box,
-                      const Tensor* prior_box,
-                      const Tensor* prior_box_var,
-                      const bool normalized,
-                      const std::vector<float> variance,
-                      float* output) {
-  int64_t row = target_box->dims()[0];
-  int64_t col = prior_box->dims()[0];
-  int64_t len = prior_box->dims()[1];
-  for (int64_t i = 0; i < row; ++i) {
-    for (int64_t j = 0; j < col; ++j) {
-      auto* target_box_data = target_box->data<float>();
-      auto* prior_box_data = prior_box->data<float>();
-      int64_t offset = i * col * len + j * len;
-      float prior_box_width = prior_box_data[j * len + 2] -
-                              prior_box_data[j * len] + (normalized == false);
-      float prior_box_height = prior_box_data[j * len + 3] -
-                               prior_box_data[j * len + 1] +
-                               (normalized == false);
-      float prior_box_center_x = prior_box_data[j * len] + prior_box_width / 2;
-      float prior_box_center_y =
-          prior_box_data[j * len + 1] + prior_box_height / 2;
-
-      float target_box_center_x =
-          (target_box_data[i * len + 2] + target_box_data[i * len]) / 2;
-      float target_box_center_y =
-          (target_box_data[i * len + 3] + target_box_data[i * len + 1]) / 2;
-      float target_box_width = target_box_data[i * len + 2] -
-                               target_box_data[i * len] + (normalized == false);
-      float target_box_height = target_box_data[i * len + 3] -
-                                target_box_data[i * len + 1] +
-                                (normalized == false);
-
-      output[offset] =
-          (target_box_center_x - prior_box_center_x) / prior_box_width;
-      output[offset + 1] =
-          (target_box_center_y - prior_box_center_y) / prior_box_height;
-      output[offset + 2] =
-          std::log(std::fabs(target_box_width / prior_box_width));
-      output[offset + 3] =
-          std::log(std::fabs(target_box_height / prior_box_height));
-    }
-  }
-
-  if (prior_box_var) {
-    const float* prior_box_var_data = prior_box_var->data<float>();
-    for (int64_t i = 0; i < row; ++i) {
-      for (int64_t j = 0; j < col; ++j) {
-        for (int k = 0; k < 4; ++k) {
-          int64_t offset = i * col * len + j * len;
-          int64_t prior_var_offset = j * len;
-          output[offset + k] /= prior_box_var_data[prior_var_offset + k];
-        }
-      }
-    }
-  } else if (!(variance.empty())) {
-    for (int64_t i = 0; i < row; ++i) {
-      for (int64_t j = 0; j < col; ++j) {
-        for (int k = 0; k < 4; ++k) {
-          int64_t offset = i * col * len + j * len;
-          output[offset + k] /= static_cast<float>(variance[k]);
-        }
-      }
-    }
-  }
-}
-
-template <int axis, int var_size>
-void DecodeCenterSize(const Tensor* target_box,
-                      const Tensor* prior_box,
-                      const Tensor* prior_box_var,
-                      const bool normalized,
-                      std::vector<float> variance,
-                      float* output) {
-  int64_t row = target_box->dims()[0];
-  int64_t col = target_box->dims()[1];
-  int64_t len = target_box->dims()[2];
-
-  for (int64_t i = 0; i < row; ++i) {
-    for (int64_t j = 0; j < col; ++j) {
-      auto* target_box_data = target_box->data<float>();
-      auto* prior_box_data = prior_box->data<float>();
-
-      float var_data[4] = {1., 1., 1., 1.};
-      float* var_ptr = var_data;
-      int64_t offset = i * col * len + j * len;
-      int64_t prior_box_offset = axis == 0 ? j * len : i * len;
-
-      float prior_box_width = prior_box_data[prior_box_offset + 2] -
-                              prior_box_data[prior_box_offset] +
-                              (normalized == false);
-      float prior_box_height = prior_box_data[prior_box_offset + 3] -
-                               prior_box_data[prior_box_offset + 1] +
-                               (normalized == false);
-      float prior_box_center_x =
-          prior_box_data[prior_box_offset] + prior_box_width / 2;
-      float prior_box_center_y =
-          prior_box_data[prior_box_offset + 1] + prior_box_height / 2;
-
-      float target_box_center_x = 0, target_box_center_y = 0;
-      float target_box_width = 0, target_box_height = 0;
-      int64_t prior_var_offset = axis == 0 ? j * len : i * len;
-      if (var_size == 2) {
-        std::memcpy(var_ptr,
-                    prior_box_var->data<float>() + prior_var_offset,
-                    4 * sizeof(float));
-      } else if (var_size == 1) {
-        var_ptr = reinterpret_cast<float*>(variance.data());
-      }
-      float box_var_x = *var_ptr;
-      float box_var_y = *(var_ptr + 1);
-      float box_var_w = *(var_ptr + 2);
-      float box_var_h = *(var_ptr + 3);
-
-      target_box_center_x =
-          box_var_x * target_box_data[offset] * prior_box_width +
-          prior_box_center_x;
-      target_box_center_y =
-          box_var_y * target_box_data[offset + 1] * prior_box_height +
-          prior_box_center_y;
-      target_box_width =
-          std::exp(box_var_w * target_box_data[offset + 2]) * prior_box_width;
-      target_box_height =
-          std::exp(box_var_h * target_box_data[offset + 3]) * prior_box_height;
-
-      output[offset] = target_box_center_x - target_box_width / 2;
-      output[offset + 1] = target_box_center_y - target_box_height / 2;
-      output[offset + 2] =
-          target_box_center_x + target_box_width / 2 - (normalized == false);
-      output[offset + 3] =
-          target_box_center_y + target_box_height / 2 - (normalized == false);
-    }
-  }
-}
-
-void BoxCoderCompute::Run() {
-  /*
-  auto& param = Param<operators::BoxCoderParam>();
-  int axis = param.axis;
-  bool box_normalized = param.box_normalized;
-  std::string code_type = param.code_type;
-
-  lite::arm::math::box_coder(param.proposals,
-                             param.prior_box,
-                             param.prior_box_var,
-                             param.target_box,
-                             code_type,
-                             box_normalized,
-                             axis);
-  */
-  auto& param = Param<operators::BoxCoderParam>();
-  auto* prior_box = param.prior_box;
-  auto* prior_box_var = param.prior_box_var;
-  auto* target_box = param.target_box;
-  auto* output_box = param.proposals;
-  std::vector<float> variance = param.variance;
-  const int axis = param.axis;
-  std::string code_type = param.code_type;
-  bool normalized = param.box_normalized;
-
-  auto row = target_box->dims()[0];
-  auto col = prior_box->dims()[0];
-  if (code_type == "decode_center_size") {
-    col = target_box->dims()[1];
-  }
-  auto len = prior_box->dims()[1];
-  output_box->Resize({row, col, len});
-  auto* output = output_box->mutable_data<float>();
-
-  if (code_type == "encode_center_size") {
-    EncodeCenterSize(
-        target_box, prior_box, prior_box_var, normalized, variance, output);
-  } else if (code_type == "decode_center_size") {
-    if (prior_box_var) {
-      if (axis == 0) {
-        DecodeCenterSize<0, 2>(
-            target_box, prior_box, prior_box_var, normalized, variance, output);
-      } else {
-        DecodeCenterSize<1, 2>(
-            target_box, prior_box, prior_box_var, normalized, variance, output);
-      }
-    } else if (!(variance.empty())) {
-      if (axis == 0) {
-        DecodeCenterSize<0, 1>(
-            target_box, prior_box, prior_box_var, normalized, variance, output);
-      } else {
-        DecodeCenterSize<1, 1>(
-            target_box, prior_box, prior_box_var, normalized, variance, output);
-      }
-    } else {
-      if (axis == 0) {
-        DecodeCenterSize<0, 0>(
-            target_box, prior_box, prior_box_var, normalized, variance, output);
-      } else {
-        DecodeCenterSize<1, 0>(
-            target_box, prior_box, prior_box_var, normalized, variance, output);
-      }
-    }
-  }
-}
-
-}  // namespace arm
-}  // namespace kernels
-}  // namespace lite
-}  // namespace paddle
-
-REGISTER_LITE_KERNEL(box_coder,
-                     kARM,
-                     kFloat,
-                     kNCHW,
-                     paddle::lite::kernels::arm::BoxCoderCompute,
-                     def)
-    .BindInput("PriorBox", {LiteType::GetTensorTy(TARGET(kARM))})
-    .BindInput("PriorBoxVar", {LiteType::GetTensorTy(TARGET(kARM))})
-    .BindInput("TargetBox", {LiteType::GetTensorTy(TARGET(kARM))})
-    .BindOutput("OutputBox", {LiteType::GetTensorTy(TARGET(kARM))})
-    .Finalize();
diff --git a/lite/kernels/arm/box_coder_compute.h b/lite/kernels/arm/box_coder_compute.h
deleted file mode 100644
index 0279af4ea5..0000000000
--- a/lite/kernels/arm/box_coder_compute.h
+++ /dev/null
@@ -1,36 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include "lite/core/kernel.h"
-#include "lite/core/op_registry.h"
-
-namespace paddle {
-namespace lite {
-namespace kernels {
-namespace arm {
-
-class BoxCoderCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
- public:
-  using param_t = operators::BoxCoderParam;
-
-  void Run() override;
-
-  virtual ~BoxCoderCompute() = default;
-};
-
-}  // namespace arm
-}  // namespace kernels
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/kernels/arm/calib_compute.cc b/lite/kernels/arm/calib_compute.cc
deleted file mode 100644
index 525e5aefd6..0000000000
--- a/lite/kernels/arm/calib_compute.cc
+++ /dev/null
@@ -1,90 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
- -#include "lite/kernels/arm/calib_compute.h" -#include -#include "lite/backends/arm/math/type_trans.h" -#include "lite/core/op_registry.h" -#include "lite/core/type_system.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -void CalibComputeFp32ToInt8::Run() { - auto& param = this->Param(); - std::vector scale = {param.scale}; - const auto* din = param.input->data(); - auto* dout = param.output->mutable_data(); - lite::arm::math::fp32_to_int8( - din, dout, scale.data(), 1, 1, param.input->numel()); - return; -} - -void CalibComputeInt8ToFp32::Run() { - auto& param = this->Param(); - const auto* din = param.input->data(); - std::vector scale = {param.scale}; - auto* dout = param.output->mutable_data(); - lite::arm::math::int8_to_fp32( - din, dout, scale.data(), 1, 1, param.input->numel()); - return; -} - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle - -REGISTER_LITE_KERNEL(calib, - kARM, - kInt8, - kNCHW, - paddle::lite::kernels::arm::CalibComputeFp32ToInt8, - fp32_to_int8) - .BindInput("Input", - {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) - .Finalize(); - -REGISTER_LITE_KERNEL(calib, - kARM, - kInt8, - kNCHW, - paddle::lite::kernels::arm::CalibComputeInt8ToFp32, - int8_to_fp32) - .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) - .Finalize(); -REGISTER_LITE_KERNEL(calib_once, - kARM, - kInt8, - kNCHW, - paddle::lite::kernels::arm::CalibComputeFp32ToInt8, - fp32_to_int8) - .BindInput("Input", - {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) - .Finalize(); - -REGISTER_LITE_KERNEL(calib_once, - kARM, - kInt8, - kNCHW, - paddle::lite::kernels::arm::CalibComputeInt8ToFp32, - int8_to_fp32) - .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) - .Finalize(); diff --git a/lite/kernels/arm/calib_compute.h b/lite/kernels/arm/calib_compute.h deleted file mode 100644 index 8d9a32bc24..0000000000 --- a/lite/kernels/arm/calib_compute.h +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-
-#pragma once
-#include "lite/core/kernel.h"
-#include "lite/operators/calib_op.h"
-
-namespace paddle {
-namespace lite {
-namespace kernels {
-namespace arm {
-
-class CalibComputeFp32ToInt8
-    : public KernelLite<TARGET(kARM), PRECISION(kInt8)> {
- public:
-  using param_t = operators::CalibParam;
-
-  void Run() override;
-
-  ~CalibComputeFp32ToInt8() override{};
-
- private:
-};
-
-class CalibComputeInt8ToFp32
-    : public KernelLite<TARGET(kARM), PRECISION(kInt8)> {
- public:
-  using param_t = operators::CalibParam;
-
-  void Run() override;
-
-  ~CalibComputeInt8ToFp32() override{};
-
- private:
-};
-
-}  // namespace arm
-}  // namespace kernels
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/kernels/arm/calib_compute_test.cc b/lite/kernels/arm/calib_compute_test.cc
deleted file mode 100644
index 27049cc2c6..0000000000
--- a/lite/kernels/arm/calib_compute_test.cc
+++ /dev/null
@@ -1,156 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/kernels/arm/calib_compute.h"
-#include <gtest/gtest.h>
-#include <algorithm>
-#include <memory>
-#include <string>
-#include <utility>
-#include <vector>
-#include "lite/backends/arm/math/funcs.h"
-#include "lite/core/op_registry.h"
-
-namespace paddle {
-namespace lite {
-namespace kernels {
-namespace arm {
-
-static int get_rand(int start, int end) {
-  int i = rand();  // NOLINT
-  i = (i % (end - start)) + start;
-  return i;
-}
-
-static void int8_to_fp32_basic(const int8_t* din,
-                               float* dout,
-                               const float* scale,
-                               int axis_size,
-                               int64_t outer_size,
-                               int64_t inner_size) {
-  int loop_size = axis_size * outer_size;
-  for (int i = 0; i < loop_size; ++i) {
-    float scale_in = scale[i % axis_size];
-    for (int j = 0; j < inner_size; ++j) {
-      dout[j] = din[j] * scale_in;
-    }
-    dout += inner_size;
-    din += inner_size;
-  }
-}
-
-static void fp32_to_int8_basic(const float* din,
-                               int8_t* dout,
-                               const float* scale,
-                               int axis_size,
-                               int64_t outer_size,
-                               int64_t inner_size) {
-  int loop_size = axis_size * outer_size;
-  for (int i = 0; i < loop_size; ++i) {
-    float inv_scale = 1.f / scale[i % axis_size];
-    for (int j = 0; j < inner_size; ++j) {
-      dout[j] = static_cast<int8_t>(roundf(din[j] * inv_scale));
-    }
-    dout += inner_size;
-    din += inner_size;
-  }
-}
-
-void calib_ref(const operators::CalibParam& param) {
-  std::vector<float> scale = {param.in_scale};
-  if (param.in_dtype == PRECISION(kFloat) &&
-      param.out_dtype == PRECISION(kInt8)) {
-    const auto* din = param.input->data<float>();
-    auto* dout = param.output->mutable_data<int8_t>();
-    fp32_to_int8_basic(din, dout, scale.data(), 1, 1, param.input->numel());
-    return;
-  }
-  if (param.in_dtype == PRECISION(kInt8) &&
-      param.out_dtype == PRECISION(kFloat)) {
-    const auto* din = param.input->data<int8_t>();
-    auto* dout = param.output->mutable_data<float>();
-    int8_to_fp32_basic(din, dout, scale.data(), 1, 1, param.input->numel());
-    return;
-  }
-  LOG(FATAL) << "Unsupported dtype.";
-}
-
-TEST(calib_arm, retrive_op) {
-  auto calib =
-      KernelRegistry::Global()
-          .Create<TARGET(kARM), PRECISION(kInt8), DATALAYOUT(kNCHW)>("calib");
-  ASSERT_FALSE(calib.empty());
-  ASSERT_TRUE(calib.front());
-}
-
-TEST(calib_arm, init) {
-  CalibCompute calib;
-  ASSERT_EQ(calib.precision(), PRECISION(kInt8));
-  ASSERT_EQ(calib.target(), TARGET(kARM));
-}
-
-TEST(calib_arm, int8_to_fp32) {
-  DeviceInfo::Init();
-  for (auto n : {1, 2}) {
-    for (auto c : {6, 32 /*, 128*/}) {
-      for (auto h : {9, 18 /*, 56 , 112, 224, 512*/}) {
-        for (auto w : {9, 18 /*, 56, 112, 224, 512*/}) {
-          Tensor x;
-          Tensor output;
-          Tensor output_ref;
-          // set the dims of input, output, ref output tensors
-          x.Resize({n, c, h, w});
-          output.Resize({n, c, h, w});
-          output_ref.Resize({n, c, h, w});
-          // initialize the data of input tensors
-          auto* x_data = x.mutable_data<int8_t>();
-          auto* output_data = output.mutable_data<float>();
-          for (int i = 0; i < x.dims().production(); i++) {
-            float sign = i % 3 == 0 ? -1.0f : 1.0f;
-            x_data[i] = sign * static_cast<float>(i % 128) * 0.013f;
-          }
-          // prepare kernel params and run
-          CalibCompute calib;
-          std::unique_ptr<KernelContext> ctx(new KernelContext);
-          ctx->As<ARMContext>();
-          calib.SetContext(std::move(ctx));
-          operators::CalibParam param;
-          param.in_scale = get_rand(0, 100) * 0.1f;
-          param.in_dtype = PRECISION(kInt8);
-          param.out_dtype = PRECISION(kFloat);
-          param.input = &x;
-          param.output = &output;
-          calib.SetParam(param);
-          calib.Launch();
-          // invoking ref implementation and compare results
-          param.output = &output_ref;
-          calib_ref(param);
-          auto* output_ref_data = output_ref.mutable_data<float>();
-          for (int i = 0; i < output.dims().production(); i++) {
-            EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5);
-          }
-        }
-      }
-    }
-  }
-}
-
-}  // namespace arm
-}  // namespace kernels
-}  // namespace lite
-}  // namespace paddle
-
-USE_LITE_KERNEL(calib, kARM, kInt8, kNCHW, int8_to_fp32);
-USE_LITE_KERNEL(calib, kARM, kInt8, kNCHW, fp32_to_int8);
diff --git a/lite/kernels/arm/cast_compute.cc b/lite/kernels/arm/cast_compute.cc
deleted file mode 100644
index 8b6971ec13..0000000000
--- a/lite/kernels/arm/cast_compute.cc
+++ /dev/null
@@ -1,62 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
- -#include "lite/kernels/arm/cast_compute.h" -#include -#include "lite/backends/arm/math/funcs.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -template -out_type TransOp(in_type in) { - return static_cast(in); -} - -void CastCompute::PrepareForRun() {} - -void CastCompute::Run() { - auto& ctx = this->ctx_->template As(); - auto& param = this->Param(); - - auto input_dims = param.X->dims(); - - // BOOL = 0;INT16 = 1;INT32 = 2;INT64 = 3;FP16 = 4;FP32 = 5;FP64 = 6; - // SIZE_T = 19;UINT8 = 20;INT8 = 21; - if (param.in_dtype == param.out_dtype && param.in_dtype == 2) { - const auto* x_data = param.X->data(); - auto* o_data = param.Out->mutable_data(); - memcpy(o_data, x_data, sizeof(float) * param.X->numel()); - } else if (param.in_dtype == 21 && param.out_dtype == 5) { // int8->float32 - const char* x_data_begin = param.X->data(); - const char* x_data_end = x_data_begin + param.X->numel(); - float* out_data = param.Out->mutable_data(); - std::transform(x_data_begin, x_data_end, out_data, TransOp); - } else { - LOG(FATAL) << "other has not been implemented"; - } -} - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle - -REGISTER_LITE_KERNEL( - cast, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::CastCompute, def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) - .Finalize(); diff --git a/lite/kernels/arm/cast_compute.h b/lite/kernels/arm/cast_compute.h deleted file mode 100644 index d342a405ad..0000000000 --- a/lite/kernels/arm/cast_compute.h +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include "lite/backends/arm/math/type_trans.h" -#include "lite/core/kernel.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -class CastCompute : public KernelLite { - public: - using param_t = operators::CastParam; - - void PrepareForRun() override; - - void Run() override; - - ~CastCompute() {} - - private: -}; - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle diff --git a/lite/kernels/arm/compare_compute.cc b/lite/kernels/arm/compare_compute.cc deleted file mode 100644 index fe4b3d6587..0000000000 --- a/lite/kernels/arm/compare_compute.cc +++ /dev/null @@ -1,186 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/kernels/arm/compare_compute.h"
-#include <vector>
-#include "lite/api/paddle_place.h"
-#include "lite/backends/arm/math/funcs.h"
-#include "lite/core/op_registry.h"
-#include "lite/core/type_system.h"
-
-namespace paddle {
-namespace lite {
-namespace kernels {
-namespace arm {
-
-#define COMPARE_FUNCTOR(name, op)                                           \
-  template <typename T>                                                     \
-  struct _##name##Functor {                                                 \
-    inline bool operator()(const T &a, const T &b) const { return a op b; } \
-  };
-
-COMPARE_FUNCTOR(Equal, ==);
-COMPARE_FUNCTOR(NotEqual, !=);
-COMPARE_FUNCTOR(LessThan, <);
-COMPARE_FUNCTOR(LessEqual, <=);
-COMPARE_FUNCTOR(GreaterThan, >);
-COMPARE_FUNCTOR(GreaterEqual, >=);
-
-template <>
-struct _EqualFunctor<float> {
-  inline bool operator()(const float &a, const float &b) const {
-    // It is safe to cast a and b to double.
-    return fabs(static_cast<double>(a - b)) < 1e-8;
-  }
-};
-
-template <>
-struct _NotEqualFunctor<float> {
-  inline bool operator()(const float &a, const float &b) const {
-    return !_EqualFunctor<float>()(a, b);
-  }
-};
-
-inline void get_mid_dims(const lite::DDim &x_dims,
-                         const lite::DDim &y_dims,
-                         const int axis,
-                         int *pre,
-                         int *n,
-                         int *post) {
-  *pre = 1;
-  *n = 1;
-  *post = 1;
-  for (int i = 0; i < axis; ++i) {
-    (*pre) *= x_dims[i];
-  }
-
-  for (int i = 0; i < y_dims.size(); ++i) {
-    (*n) *= y_dims[i];
-  }
-
-  for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) {
-    (*post) *= x_dims[i];
-  }
-}
-template